//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x
/// s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
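  // For example, under these rules a <4 x s8> load whose memory type matches
  // the result type is bitcast (to s32 by getBitcastRegisterType), since s8 is
  // not a register element type here.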
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
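  // The custom action defers that decision to legalizeFMad, which can check
  // the function's FP mode when the instruction is actually legalized.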
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1},
                              ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
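              // For example, an s64 result fed by a 32-bit memory access is
              // first narrowed to an s32 load; the extension is then
              // legalized separately.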
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
1625 getActionDefinitionsBuilder(G_FSHR) 1626 .legalFor({{S32, S32}}) 1627 .lowerFor({{V2S16, V2S16}}) 1628 .clampMaxNumElementsStrict(0, S16, 2) 1629 .scalarize(0) 1630 .lower(); 1631 1632 if (ST.hasVOP3PInsts()) { 1633 getActionDefinitionsBuilder(G_FSHL) 1634 .lowerFor({{V2S16, V2S16}}) 1635 .clampMaxNumElementsStrict(0, S16, 2) 1636 .scalarize(0) 1637 .lower(); 1638 } else { 1639 getActionDefinitionsBuilder(G_FSHL) 1640 .scalarize(0) 1641 .lower(); 1642 } 1643 1644 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1645 .legalFor({S64}); 1646 1647 getActionDefinitionsBuilder(G_FENCE) 1648 .alwaysLegal(); 1649 1650 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1651 .scalarize(0) 1652 .minScalar(0, S32) 1653 .lower(); 1654 1655 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1656 .legalFor({{S32, S32}, {S64, S32}}) 1657 .clampScalar(1, S32, S32) 1658 .clampScalar(0, S32, S64) 1659 .widenScalarToNextPow2(0) 1660 .scalarize(0); 1661 1662 getActionDefinitionsBuilder({ 1663 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1664 G_FCOPYSIGN, 1665 1666 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1667 G_ATOMICRMW_NAND, 1668 G_ATOMICRMW_FSUB, 1669 G_READ_REGISTER, 1670 G_WRITE_REGISTER, 1671 1672 G_SADDO, G_SSUBO, 1673 1674 // TODO: Implement 1675 G_FMINIMUM, G_FMAXIMUM}).lower(); 1676 1677 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1678 .lower(); 1679 1680 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1681 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1682 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1683 .unsupported(); 1684 1685 getLegacyLegalizerInfo().computeTables(); 1686 verify(*ST.getInstrInfo()); 1687 } 1688 1689 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1690 MachineInstr &MI) const { 1691 MachineIRBuilder &B = Helper.MIRBuilder; 1692 MachineRegisterInfo &MRI = *B.getMRI(); 1693 1694 switch (MI.getOpcode()) { 1695 case TargetOpcode::G_ADDRSPACE_CAST: 1696 return legalizeAddrSpaceCast(MI, MRI, B); 1697 case TargetOpcode::G_FRINT: 1698 return legalizeFrint(MI, MRI, B); 1699 case TargetOpcode::G_FCEIL: 1700 return legalizeFceil(MI, MRI, B); 1701 case TargetOpcode::G_FREM: 1702 return legalizeFrem(MI, MRI, B); 1703 case TargetOpcode::G_INTRINSIC_TRUNC: 1704 return legalizeIntrinsicTrunc(MI, MRI, B); 1705 case TargetOpcode::G_SITOFP: 1706 return legalizeITOFP(MI, MRI, B, true); 1707 case TargetOpcode::G_UITOFP: 1708 return legalizeITOFP(MI, MRI, B, false); 1709 case TargetOpcode::G_FPTOSI: 1710 return legalizeFPTOI(MI, MRI, B, true); 1711 case TargetOpcode::G_FPTOUI: 1712 return legalizeFPTOI(MI, MRI, B, false); 1713 case TargetOpcode::G_FMINNUM: 1714 case TargetOpcode::G_FMAXNUM: 1715 case TargetOpcode::G_FMINNUM_IEEE: 1716 case TargetOpcode::G_FMAXNUM_IEEE: 1717 return legalizeMinNumMaxNum(Helper, MI); 1718 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1719 return legalizeExtractVectorElt(MI, MRI, B); 1720 case TargetOpcode::G_INSERT_VECTOR_ELT: 1721 return legalizeInsertVectorElt(MI, MRI, B); 1722 case TargetOpcode::G_SHUFFLE_VECTOR: 1723 return legalizeShuffleVector(MI, MRI, B); 1724 case TargetOpcode::G_FSIN: 1725 case TargetOpcode::G_FCOS: 1726 return legalizeSinCos(MI, MRI, B); 1727 case TargetOpcode::G_GLOBAL_VALUE: 1728 return legalizeGlobalValue(MI, MRI, B); 1729 case TargetOpcode::G_LOAD: 1730 case TargetOpcode::G_SEXTLOAD: 1731 case TargetOpcode::G_ZEXTLOAD: 1732 return legalizeLoad(Helper, MI); 1733 case TargetOpcode::G_FMAD: 1734 return legalizeFMad(MI, MRI, B); 1735 case TargetOpcode::G_FDIV: 1736 return legalizeFDIV(MI, 
MRI, B); 1737 case TargetOpcode::G_UDIV: 1738 case TargetOpcode::G_UREM: 1739 case TargetOpcode::G_UDIVREM: 1740 return legalizeUnsignedDIV_REM(MI, MRI, B); 1741 case TargetOpcode::G_SDIV: 1742 case TargetOpcode::G_SREM: 1743 case TargetOpcode::G_SDIVREM: 1744 return legalizeSignedDIV_REM(MI, MRI, B); 1745 case TargetOpcode::G_ATOMIC_CMPXCHG: 1746 return legalizeAtomicCmpXChg(MI, MRI, B); 1747 case TargetOpcode::G_FLOG: 1748 return legalizeFlog(MI, B, numbers::ln2f); 1749 case TargetOpcode::G_FLOG10: 1750 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1751 case TargetOpcode::G_FEXP: 1752 return legalizeFExp(MI, B); 1753 case TargetOpcode::G_FPOW: 1754 return legalizeFPow(MI, B); 1755 case TargetOpcode::G_FFLOOR: 1756 return legalizeFFloor(MI, MRI, B); 1757 case TargetOpcode::G_BUILD_VECTOR: 1758 return legalizeBuildVector(MI, MRI, B); 1759 case TargetOpcode::G_CTLZ: 1760 case TargetOpcode::G_CTTZ: 1761 return legalizeCTLZ_CTTZ(MI, MRI, B); 1762 default: 1763 return false; 1764 } 1765 1766 llvm_unreachable("expected switch to return"); 1767 } 1768 1769 Register AMDGPULegalizerInfo::getSegmentAperture( 1770 unsigned AS, 1771 MachineRegisterInfo &MRI, 1772 MachineIRBuilder &B) const { 1773 MachineFunction &MF = B.getMF(); 1774 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1775 const LLT S32 = LLT::scalar(32); 1776 1777 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1778 1779 if (ST.hasApertureRegs()) { 1780 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1781 // getreg. 1782 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1783 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1784 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1785 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1786 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1787 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1788 unsigned Encoding = 1789 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1790 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1791 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1792 1793 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1794 1795 B.buildInstr(AMDGPU::S_GETREG_B32) 1796 .addDef(GetReg) 1797 .addImm(Encoding); 1798 MRI.setType(GetReg, S32); 1799 1800 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1801 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1802 } 1803 1804 Register QueuePtr = MRI.createGenericVirtualRegister( 1805 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1806 1807 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1808 return Register(); 1809 1810 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1811 // private_segment_aperture_base_hi. 1812 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1813 1814 // TODO: can we be smarter about machine pointer info? 1815 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1816 MachineMemOperand *MMO = MF.getMachineMemOperand( 1817 PtrInfo, 1818 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1819 MachineMemOperand::MOInvariant, 1820 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1821 1822 Register LoadAddr; 1823 1824 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1825 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1826 } 1827 1828 /// Return true if the value is a known valid address, such that a null check is 1829 /// not necessary. 
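/// For illustration: values defined by G_FRAME_INDEX, G_GLOBAL_VALUE or
/// G_BLOCK_ADDR are treated as never null, and a G_CONSTANT is known non-null
/// when it differs from the address space's null value, which on AMDGPU is
/// not always 0 (see AMDGPUTargetMachine::getNullPointerValue).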
1830 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1831 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1832 MachineInstr *Def = MRI.getVRegDef(Val); 1833 switch (Def->getOpcode()) { 1834 case AMDGPU::G_FRAME_INDEX: 1835 case AMDGPU::G_GLOBAL_VALUE: 1836 case AMDGPU::G_BLOCK_ADDR: 1837 return true; 1838 case AMDGPU::G_CONSTANT: { 1839 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1840 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1841 } 1842 default: 1843 return false; 1844 } 1845 1846 return false; 1847 } 1848 1849 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1850 MachineInstr &MI, MachineRegisterInfo &MRI, 1851 MachineIRBuilder &B) const { 1852 MachineFunction &MF = B.getMF(); 1853 1854 const LLT S32 = LLT::scalar(32); 1855 Register Dst = MI.getOperand(0).getReg(); 1856 Register Src = MI.getOperand(1).getReg(); 1857 1858 LLT DstTy = MRI.getType(Dst); 1859 LLT SrcTy = MRI.getType(Src); 1860 unsigned DestAS = DstTy.getAddressSpace(); 1861 unsigned SrcAS = SrcTy.getAddressSpace(); 1862 1863 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1864 // vector element. 1865 assert(!DstTy.isVector()); 1866 1867 const AMDGPUTargetMachine &TM 1868 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1869 1870 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1871 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1872 return true; 1873 } 1874 1875 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1876 // Truncate. 1877 B.buildExtract(Dst, Src, 0); 1878 MI.eraseFromParent(); 1879 return true; 1880 } 1881 1882 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1883 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1884 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1885 1886 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1887 // another. Merge operands are required to be the same type, but creating an 1888 // extra ptrtoint would be kind of pointless. 1889 auto HighAddr = B.buildConstant( 1890 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1891 B.buildMerge(Dst, {Src, HighAddr}); 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1897 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1898 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1899 1900 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1901 // Extract low 32-bits of the pointer. 1902 B.buildExtract(Dst, Src, 0); 1903 MI.eraseFromParent(); 1904 return true; 1905 } 1906 1907 unsigned NullVal = TM.getNullPointerValue(DestAS); 1908 1909 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1910 auto FlatNull = B.buildConstant(SrcTy, 0); 1911 1912 // Extract low 32-bits of the pointer. 1913 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1914 1915 auto CmpRes = 1916 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1917 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1918 1919 MI.eraseFromParent(); 1920 return true; 1921 } 1922 1923 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1924 return false; 1925 1926 if (!ST.hasFlatAddressSpace()) 1927 return false; 1928 1929 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1930 if (!ApertureReg.isValid()) 1931 return false; 1932 1933 // Coerce the type of the low half of the result so we can use merge_values. 
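  // For a LOCAL/PRIVATE -> FLAT cast the 64-bit result is assembled from two
  // 32-bit halves, roughly (register names purely illustrative):
  //   %lo  = G_PTRTOINT %src              ; 32-bit segment offset
  //   %hi  = aperture base from getSegmentAperture() above
  //   %dst = G_MERGE_VALUES %lo, %hi      ; low part is the first operand
  // and is then null-checked below unless the source is known non-null.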
1934 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1935 1936 // TODO: Should we allow mismatched types but matching sizes in merges to 1937 // avoid the ptrtoint? 1938 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1939 1940 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1941 B.buildCopy(Dst, BuildPtr); 1942 MI.eraseFromParent(); 1943 return true; 1944 } 1945 1946 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1947 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1948 1949 auto CmpRes = 1950 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1951 1952 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1953 1954 MI.eraseFromParent(); 1955 return true; 1956 } 1957 1958 bool AMDGPULegalizerInfo::legalizeFrint( 1959 MachineInstr &MI, MachineRegisterInfo &MRI, 1960 MachineIRBuilder &B) const { 1961 Register Src = MI.getOperand(1).getReg(); 1962 LLT Ty = MRI.getType(Src); 1963 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1964 1965 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1966 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1967 1968 auto C1 = B.buildFConstant(Ty, C1Val); 1969 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1970 1971 // TODO: Should this propagate fast-math-flags? 1972 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1973 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1974 1975 auto C2 = B.buildFConstant(Ty, C2Val); 1976 auto Fabs = B.buildFAbs(Ty, Src); 1977 1978 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1979 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1980 MI.eraseFromParent(); 1981 return true; 1982 } 1983 1984 bool AMDGPULegalizerInfo::legalizeFceil( 1985 MachineInstr &MI, MachineRegisterInfo &MRI, 1986 MachineIRBuilder &B) const { 1987 1988 const LLT S1 = LLT::scalar(1); 1989 const LLT S64 = LLT::scalar(64); 1990 1991 Register Src = MI.getOperand(1).getReg(); 1992 assert(MRI.getType(Src) == S64); 1993 1994 // result = trunc(src) 1995 // if (src > 0.0 && src != result) 1996 // result += 1.0 1997 1998 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1999 2000 const auto Zero = B.buildFConstant(S64, 0.0); 2001 const auto One = B.buildFConstant(S64, 1.0); 2002 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2003 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2004 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2005 auto Add = B.buildSelect(S64, And, One, Zero); 2006 2007 // TODO: Should this propagate fast-math-flags? 
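  // For illustration (values not from the source): fceil(2.3) -> trunc is
  // 2.0, and since 2.3 > 0.0 and 2.3 != 2.0 the select above produces 1.0,
  // so the fadd below yields 3.0. fceil(-2.3) -> trunc is -2.0, the "> 0.0"
  // test fails, the select produces 0.0 and the result stays -2.0.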
2008 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2009 MI.eraseFromParent(); 2010 return true; 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeFrem( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, 2015 MachineIRBuilder &B) const { 2016 Register DstReg = MI.getOperand(0).getReg(); 2017 Register Src0Reg = MI.getOperand(1).getReg(); 2018 Register Src1Reg = MI.getOperand(2).getReg(); 2019 auto Flags = MI.getFlags(); 2020 LLT Ty = MRI.getType(DstReg); 2021 2022 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2023 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2024 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2025 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2026 MI.eraseFromParent(); 2027 return true; 2028 } 2029 2030 static MachineInstrBuilder extractF64Exponent(Register Hi, 2031 MachineIRBuilder &B) { 2032 const unsigned FractBits = 52; 2033 const unsigned ExpBits = 11; 2034 LLT S32 = LLT::scalar(32); 2035 2036 auto Const0 = B.buildConstant(S32, FractBits - 32); 2037 auto Const1 = B.buildConstant(S32, ExpBits); 2038 2039 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2040 .addUse(Hi) 2041 .addUse(Const0.getReg(0)) 2042 .addUse(Const1.getReg(0)); 2043 2044 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2045 } 2046 2047 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, 2049 MachineIRBuilder &B) const { 2050 const LLT S1 = LLT::scalar(1); 2051 const LLT S32 = LLT::scalar(32); 2052 const LLT S64 = LLT::scalar(64); 2053 2054 Register Src = MI.getOperand(1).getReg(); 2055 assert(MRI.getType(Src) == S64); 2056 2057 // TODO: Should this use extract since the low half is unused? 2058 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2059 Register Hi = Unmerge.getReg(1); 2060 2061 // Extract the upper half, since this is where we will find the sign and 2062 // exponent. 2063 auto Exp = extractF64Exponent(Hi, B); 2064 2065 const unsigned FractBits = 52; 2066 2067 // Extract the sign bit. 2068 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2069 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2070 2071 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2072 2073 const auto Zero32 = B.buildConstant(S32, 0); 2074 2075 // Extend back to 64-bits. 2076 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2077 2078 auto Shr = B.buildAShr(S64, FractMask, Exp); 2079 auto Not = B.buildNot(S64, Shr); 2080 auto Tmp0 = B.buildAnd(S64, Src, Not); 2081 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2082 2083 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2084 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2085 2086 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2087 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2088 MI.eraseFromParent(); 2089 return true; 2090 } 2091 2092 bool AMDGPULegalizerInfo::legalizeITOFP( 2093 MachineInstr &MI, MachineRegisterInfo &MRI, 2094 MachineIRBuilder &B, bool Signed) const { 2095 2096 Register Dst = MI.getOperand(0).getReg(); 2097 Register Src = MI.getOperand(1).getReg(); 2098 2099 const LLT S64 = LLT::scalar(64); 2100 const LLT S32 = LLT::scalar(32); 2101 2102 assert(MRI.getType(Src) == S64); 2103 2104 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2105 auto ThirtyTwo = B.buildConstant(S32, 32); 2106 2107 if (MRI.getType(Dst) == S64) { 2108 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2109 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2110 2111 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2112 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2113 .addUse(CvtHi.getReg(0)) 2114 .addUse(ThirtyTwo.getReg(0)); 2115 2116 // TODO: Should this propagate fast-math-flags? 2117 B.buildFAdd(Dst, LdExp, CvtLo); 2118 MI.eraseFromParent(); 2119 return true; 2120 } 2121 2122 assert(MRI.getType(Dst) == S32); 2123 2124 auto One = B.buildConstant(S32, 1); 2125 2126 MachineInstrBuilder ShAmt; 2127 if (Signed) { 2128 auto ThirtyOne = B.buildConstant(S32, 31); 2129 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2130 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2131 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2132 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2133 /*HasSideEffects=*/false) 2134 .addUse(Unmerge.getReg(1)); 2135 auto LS2 = B.buildSub(S32, LS, One); 2136 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2137 } else 2138 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2139 auto Norm = B.buildShl(S64, Src, ShAmt); 2140 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2141 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2142 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2143 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2144 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2145 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2146 /*HasSideEffects=*/false) 2147 .addUse(FVal.getReg(0)) 2148 .addUse(Scale.getReg(0)); 2149 MI.eraseFromParent(); 2150 return true; 2151 } 2152 2153 // TODO: Copied from DAG implementation. Verify logic and document how this 2154 // actually works. 2155 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2156 MachineRegisterInfo &MRI, 2157 MachineIRBuilder &B, 2158 bool Signed) const { 2159 2160 Register Dst = MI.getOperand(0).getReg(); 2161 Register Src = MI.getOperand(1).getReg(); 2162 2163 const LLT S64 = LLT::scalar(64); 2164 const LLT S32 = LLT::scalar(32); 2165 2166 const LLT SrcLT = MRI.getType(Src); 2167 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2168 2169 unsigned Flags = MI.getFlags(); 2170 2171 // The basic idea of converting a floating point number into a pair of 32-bit 2172 // integers is illustrated as follows: 2173 // 2174 // tf := trunc(val); 2175 // hif := floor(tf * 2^-32); 2176 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2177 // hi := fptoi(hif); 2178 // lo := fptoi(lof); 2179 // 2180 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2181 MachineInstrBuilder Sign; 2182 if (Signed && SrcLT == S32) { 2183 // However, a 32-bit floating point number has only 23 bits mantissa and 2184 // it's not enough to hold all the significant bits of `lof` if val is 2185 // negative. To avoid the loss of precision, We need to take the absolute 2186 // value after truncating and flip the result back based on the original 2187 // signedness. 
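    // The ashr of the raw 32-bit float below yields all-ones for a negative
    // input and zero otherwise; xor-ing the merged {lo, hi} result with that
    // mask and then subtracting the mask (see the end of this function) is
    // the usual conditional-negate idiom: (x ^ -1) - (-1) == -x and
    // (x ^ 0) - 0 == x.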
2188 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2189 Trunc = B.buildFAbs(S32, Trunc, Flags); 2190 } 2191 MachineInstrBuilder K0, K1; 2192 if (SrcLT == S64) { 2193 K0 = B.buildFConstant(S64, 2194 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2195 K1 = B.buildFConstant(S64, 2196 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2197 } else { 2198 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2199 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2200 } 2201 2202 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2203 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2204 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2205 2206 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2207 : B.buildFPTOUI(S32, FloorMul); 2208 auto Lo = B.buildFPTOUI(S32, Fma); 2209 2210 if (Signed && SrcLT == S32) { 2211 // Flip the result based on the signedness, which is either all 0s or 1s. 2212 Sign = B.buildMerge(S64, {Sign, Sign}); 2213 // r := xor({lo, hi}, sign) - sign; 2214 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2215 } else 2216 B.buildMerge(Dst, {Lo, Hi}); 2217 MI.eraseFromParent(); 2218 2219 return true; 2220 } 2221 2222 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2223 MachineInstr &MI) const { 2224 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2225 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2226 2227 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2228 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2229 2230 // With ieee_mode disabled, the instructions have the correct behavior 2231 // already for G_FMINNUM/G_FMAXNUM 2232 if (!MFI->getMode().IEEE) 2233 return !IsIEEEOp; 2234 2235 if (IsIEEEOp) 2236 return true; 2237 2238 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2239 } 2240 2241 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2242 MachineInstr &MI, MachineRegisterInfo &MRI, 2243 MachineIRBuilder &B) const { 2244 // TODO: Should move some of this into LegalizerHelper. 2245 2246 // TODO: Promote dynamic indexing of s16 to s32 2247 2248 // FIXME: Artifact combiner probably should have replaced the truncated 2249 // constant before this, so we shouldn't need 2250 // getIConstantVRegValWithLookThrough. 2251 Optional<ValueAndVReg> MaybeIdxVal = 2252 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2253 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2254 return true; 2255 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2256 2257 Register Dst = MI.getOperand(0).getReg(); 2258 Register Vec = MI.getOperand(1).getReg(); 2259 2260 LLT VecTy = MRI.getType(Vec); 2261 LLT EltTy = VecTy.getElementType(); 2262 assert(EltTy == MRI.getType(Dst)); 2263 2264 if (IdxVal < VecTy.getNumElements()) { 2265 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2266 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2267 } else { 2268 B.buildUndef(Dst); 2269 } 2270 2271 MI.eraseFromParent(); 2272 return true; 2273 } 2274 2275 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2276 MachineInstr &MI, MachineRegisterInfo &MRI, 2277 MachineIRBuilder &B) const { 2278 // TODO: Should move some of this into LegalizerHelper. 2279 2280 // TODO: Promote dynamic indexing of s16 to s32 2281 2282 // FIXME: Artifact combiner probably should have replaced the truncated 2283 // constant before this, so we shouldn't need 2284 // getIConstantVRegValWithLookThrough. 
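  // Only a constant index is expanded here (unmerge the vector, replace one
  // element, and re-merge); a dynamic index takes the early return below and
  // is left for the register-indexing selection path.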
2285 Optional<ValueAndVReg> MaybeIdxVal = 2286 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2287 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2288 return true; 2289 2290 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2291 Register Dst = MI.getOperand(0).getReg(); 2292 Register Vec = MI.getOperand(1).getReg(); 2293 Register Ins = MI.getOperand(2).getReg(); 2294 2295 LLT VecTy = MRI.getType(Vec); 2296 LLT EltTy = VecTy.getElementType(); 2297 assert(EltTy == MRI.getType(Ins)); 2298 (void)Ins; 2299 2300 unsigned NumElts = VecTy.getNumElements(); 2301 if (IdxVal < NumElts) { 2302 SmallVector<Register, 8> SrcRegs; 2303 for (unsigned i = 0; i < NumElts; ++i) 2304 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2305 B.buildUnmerge(SrcRegs, Vec); 2306 2307 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2308 B.buildMerge(Dst, SrcRegs); 2309 } else { 2310 B.buildUndef(Dst); 2311 } 2312 2313 MI.eraseFromParent(); 2314 return true; 2315 } 2316 2317 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2318 MachineInstr &MI, MachineRegisterInfo &MRI, 2319 MachineIRBuilder &B) const { 2320 const LLT V2S16 = LLT::fixed_vector(2, 16); 2321 2322 Register Dst = MI.getOperand(0).getReg(); 2323 Register Src0 = MI.getOperand(1).getReg(); 2324 LLT DstTy = MRI.getType(Dst); 2325 LLT SrcTy = MRI.getType(Src0); 2326 2327 if (SrcTy == V2S16 && DstTy == V2S16 && 2328 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2329 return true; 2330 2331 MachineIRBuilder HelperBuilder(MI); 2332 GISelObserverWrapper DummyObserver; 2333 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2334 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2335 } 2336 2337 bool AMDGPULegalizerInfo::legalizeSinCos( 2338 MachineInstr &MI, MachineRegisterInfo &MRI, 2339 MachineIRBuilder &B) const { 2340 2341 Register DstReg = MI.getOperand(0).getReg(); 2342 Register SrcReg = MI.getOperand(1).getReg(); 2343 LLT Ty = MRI.getType(DstReg); 2344 unsigned Flags = MI.getFlags(); 2345 2346 Register TrigVal; 2347 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2348 if (ST.hasTrigReducedRange()) { 2349 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2350 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2351 .addUse(MulVal.getReg(0)) 2352 .setMIFlags(Flags).getReg(0); 2353 } else 2354 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2355 2356 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2357 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2358 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2359 .addUse(TrigVal) 2360 .setMIFlags(Flags); 2361 MI.eraseFromParent(); 2362 return true; 2363 } 2364 2365 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2366 MachineIRBuilder &B, 2367 const GlobalValue *GV, 2368 int64_t Offset, 2369 unsigned GAFlags) const { 2370 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2371 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2372 // to the following code sequence: 2373 // 2374 // For constant address space: 2375 // s_getpc_b64 s[0:1] 2376 // s_add_u32 s0, s0, $symbol 2377 // s_addc_u32 s1, s1, 0 2378 // 2379 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2380 // a fixup or relocation is emitted to replace $symbol with a literal 2381 // constant, which is a pc-relative offset from the encoding of the $symbol 2382 // operand to the global variable. 2383 // 2384 // For global address space: 2385 // s_getpc_b64 s[0:1] 2386 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2387 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2388 // 2389 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2390 // fixups or relocations are emitted to replace $symbol@*@lo and 2391 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2392 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2393 // operand to the global variable. 2394 // 2395 // What we want here is an offset from the value returned by s_getpc 2396 // (which is the address of the s_add_u32 instruction) to the global 2397 // variable, but since the encoding of $symbol starts 4 bytes after the start 2398 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2399 // small. This requires us to add 4 to the global variable offset in order to 2400 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2401 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2402 // instruction. 2403 2404 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2405 2406 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2407 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2408 2409 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2410 .addDef(PCReg); 2411 2412 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2413 if (GAFlags == SIInstrInfo::MO_NONE) 2414 MIB.addImm(0); 2415 else 2416 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2417 2418 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2419 2420 if (PtrTy.getSizeInBits() == 32) 2421 B.buildExtract(DstReg, PCReg, 0); 2422 return true; 2423 } 2424 2425 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2426 MachineInstr &MI, MachineRegisterInfo &MRI, 2427 MachineIRBuilder &B) const { 2428 Register DstReg = MI.getOperand(0).getReg(); 2429 LLT Ty = MRI.getType(DstReg); 2430 unsigned AS = Ty.getAddressSpace(); 2431 2432 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2433 MachineFunction &MF = B.getMF(); 2434 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2435 2436 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2437 if (!MFI->isModuleEntryFunction() && 2438 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2439 const Function &Fn = MF.getFunction(); 2440 DiagnosticInfoUnsupported BadLDSDecl( 2441 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2442 DS_Warning); 2443 Fn.getContext().diagnose(BadLDSDecl); 2444 2445 // We currently don't have a way to correctly allocate LDS objects that 2446 // aren't directly associated with a kernel. We do force inlining of 2447 // functions that use local objects. However, if these dead functions are 2448 // not eliminated, we don't want a compile time error. Just emit a warning 2449 // and a trap, since there should be no callable path here. 2450 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2451 B.buildUndef(DstReg); 2452 MI.eraseFromParent(); 2453 return true; 2454 } 2455 2456 // TODO: We could emit code to handle the initialization somewhere. 2457 // We ignore the initializer for now and legalize it to allow selection. 2458 // The initializer will be rejected during assembly emission anyway. 2459 const SITargetLowering *TLI = ST.getTargetLowering(); 2460 if (!TLI->shouldUseLDSConstAddress(GV)) { 2461 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2462 return true; // Leave in place. 2463 } 2464 2465 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2466 Type *Ty = GV->getValueType(); 2467 // HIP uses an unsized array `extern __shared__ T s[]` or a similar 2468 // zero-sized type in other languages to declare the dynamic shared 2469 // memory whose size is not known at compile time. They will be 2470 // allocated by the runtime and placed directly after the statically 2471 // allocated ones. They all share the same offset. 2472 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2473 // Adjust alignment for that dynamic shared memory array.
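      // The start address of the dynamic array is just the total size of the
      // statically allocated LDS: the llvm.amdgcn.groupstaticsize intrinsic
      // emitted below, reinterpreted as a pointer.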
2474 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2475 LLT S32 = LLT::scalar(32); 2476 auto Sz = 2477 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2478 B.buildIntToPtr(DstReg, Sz); 2479 MI.eraseFromParent(); 2480 return true; 2481 } 2482 } 2483 2484 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2485 *cast<GlobalVariable>(GV))); 2486 MI.eraseFromParent(); 2487 return true; 2488 } 2489 2490 const SITargetLowering *TLI = ST.getTargetLowering(); 2491 2492 if (TLI->shouldEmitFixup(GV)) { 2493 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2494 MI.eraseFromParent(); 2495 return true; 2496 } 2497 2498 if (TLI->shouldEmitPCReloc(GV)) { 2499 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2505 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2506 2507 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2508 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2509 MachinePointerInfo::getGOT(MF), 2510 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2511 MachineMemOperand::MOInvariant, 2512 LoadTy, Align(8)); 2513 2514 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2515 2516 if (Ty.getSizeInBits() == 32) { 2517 // Truncate if this is a 32-bit constant address. 2518 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2519 B.buildExtract(DstReg, Load, 0); 2520 } else 2521 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2522 2523 MI.eraseFromParent(); 2524 return true; 2525 } 2526 2527 static LLT widenToNextPowerOf2(LLT Ty) { 2528 if (Ty.isVector()) 2529 return Ty.changeElementCount( 2530 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2531 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2532 } 2533 2534 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2535 MachineInstr &MI) const { 2536 MachineIRBuilder &B = Helper.MIRBuilder; 2537 MachineRegisterInfo &MRI = *B.getMRI(); 2538 GISelChangeObserver &Observer = Helper.Observer; 2539 2540 Register PtrReg = MI.getOperand(1).getReg(); 2541 LLT PtrTy = MRI.getType(PtrReg); 2542 unsigned AddrSpace = PtrTy.getAddressSpace(); 2543 2544 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2545 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2546 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2547 Observer.changingInstr(MI); 2548 MI.getOperand(1).setReg(Cast.getReg(0)); 2549 Observer.changedInstr(MI); 2550 return true; 2551 } 2552 2553 if (MI.getOpcode() != AMDGPU::G_LOAD) 2554 return false; 2555 2556 Register ValReg = MI.getOperand(0).getReg(); 2557 LLT ValTy = MRI.getType(ValReg); 2558 2559 MachineMemOperand *MMO = *MI.memoperands_begin(); 2560 const unsigned ValSize = ValTy.getSizeInBits(); 2561 const LLT MemTy = MMO->getMemoryType(); 2562 const Align MemAlign = MMO->getAlign(); 2563 const unsigned MemSize = MemTy.getSizeInBits(); 2564 const uint64_t AlignInBits = 8 * MemAlign.value(); 2565 2566 // Widen non-power-of-2 loads to the alignment if needed 2567 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2568 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2569 2570 // This was already the correct extending load result type, so just adjust 2571 // the memory type. 
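    // For example (not from the source): an s32 result loaded from a 3-byte
    // (s24) memory access whose alignment permits widening. WideMemSize is
    // PowerOf2Ceil(24) = 32, which equals ValSize, so only the memory operand
    // is widened to 4 bytes and the load itself is left untouched.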
2572 if (WideMemSize == ValSize) { 2573 MachineFunction &MF = B.getMF(); 2574 2575 MachineMemOperand *WideMMO = 2576 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2577 Observer.changingInstr(MI); 2578 MI.setMemRefs(MF, {WideMMO}); 2579 Observer.changedInstr(MI); 2580 return true; 2581 } 2582 2583 // Don't bother handling edge case that should probably never be produced. 2584 if (ValSize > WideMemSize) 2585 return false; 2586 2587 LLT WideTy = widenToNextPowerOf2(ValTy); 2588 2589 Register WideLoad; 2590 if (!WideTy.isVector()) { 2591 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2592 B.buildTrunc(ValReg, WideLoad).getReg(0); 2593 } else { 2594 // Extract the subvector. 2595 2596 if (isRegisterType(ValTy)) { 2597 // If this a case where G_EXTRACT is legal, use it. 2598 // (e.g. <3 x s32> -> <4 x s32>) 2599 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2600 B.buildExtract(ValReg, WideLoad, 0); 2601 } else { 2602 // For cases where the widened type isn't a nice register value, unmerge 2603 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2604 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2605 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2606 } 2607 } 2608 2609 MI.eraseFromParent(); 2610 return true; 2611 } 2612 2613 return false; 2614 } 2615 2616 bool AMDGPULegalizerInfo::legalizeFMad( 2617 MachineInstr &MI, MachineRegisterInfo &MRI, 2618 MachineIRBuilder &B) const { 2619 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2620 assert(Ty.isScalar()); 2621 2622 MachineFunction &MF = B.getMF(); 2623 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2624 2625 // TODO: Always legal with future ftz flag. 2626 // FIXME: Do we need just output? 2627 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2628 return true; 2629 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2630 return true; 2631 2632 MachineIRBuilder HelperBuilder(MI); 2633 GISelObserverWrapper DummyObserver; 2634 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2635 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2636 } 2637 2638 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2639 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2640 Register DstReg = MI.getOperand(0).getReg(); 2641 Register PtrReg = MI.getOperand(1).getReg(); 2642 Register CmpVal = MI.getOperand(2).getReg(); 2643 Register NewVal = MI.getOperand(3).getReg(); 2644 2645 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2646 "this should not have been custom lowered"); 2647 2648 LLT ValTy = MRI.getType(CmpVal); 2649 LLT VecTy = LLT::fixed_vector(2, ValTy); 2650 2651 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2652 2653 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2654 .addDef(DstReg) 2655 .addUse(PtrReg) 2656 .addUse(PackedVal) 2657 .setMemRefs(MI.memoperands()); 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 bool AMDGPULegalizerInfo::legalizeFlog( 2664 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2665 Register Dst = MI.getOperand(0).getReg(); 2666 Register Src = MI.getOperand(1).getReg(); 2667 LLT Ty = B.getMRI()->getType(Dst); 2668 unsigned Flags = MI.getFlags(); 2669 2670 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2671 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2672 2673 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2674 
MI.eraseFromParent(); 2675 return true; 2676 } 2677 2678 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2679 MachineIRBuilder &B) const { 2680 Register Dst = MI.getOperand(0).getReg(); 2681 Register Src = MI.getOperand(1).getReg(); 2682 unsigned Flags = MI.getFlags(); 2683 LLT Ty = B.getMRI()->getType(Dst); 2684 2685 auto K = B.buildFConstant(Ty, numbers::log2e); 2686 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2687 B.buildFExp2(Dst, Mul, Flags); 2688 MI.eraseFromParent(); 2689 return true; 2690 } 2691 2692 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2693 MachineIRBuilder &B) const { 2694 Register Dst = MI.getOperand(0).getReg(); 2695 Register Src0 = MI.getOperand(1).getReg(); 2696 Register Src1 = MI.getOperand(2).getReg(); 2697 unsigned Flags = MI.getFlags(); 2698 LLT Ty = B.getMRI()->getType(Dst); 2699 const LLT S16 = LLT::scalar(16); 2700 const LLT S32 = LLT::scalar(32); 2701 2702 if (Ty == S32) { 2703 auto Log = B.buildFLog2(S32, Src0, Flags); 2704 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2705 .addUse(Log.getReg(0)) 2706 .addUse(Src1) 2707 .setMIFlags(Flags); 2708 B.buildFExp2(Dst, Mul, Flags); 2709 } else if (Ty == S16) { 2710 // There's no f16 fmul_legacy, so we need to convert for it. 2711 auto Log = B.buildFLog2(S16, Src0, Flags); 2712 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2713 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2714 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2715 .addUse(Ext0.getReg(0)) 2716 .addUse(Ext1.getReg(0)) 2717 .setMIFlags(Flags); 2718 2719 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2720 } else 2721 return false; 2722 2723 MI.eraseFromParent(); 2724 return true; 2725 } 2726 2727 // Find a source register, ignoring any possible source modifiers. 2728 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2729 Register ModSrc = OrigSrc; 2730 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2731 ModSrc = SrcFNeg->getOperand(1).getReg(); 2732 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2733 ModSrc = SrcFAbs->getOperand(1).getReg(); 2734 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2735 ModSrc = SrcFAbs->getOperand(1).getReg(); 2736 return ModSrc; 2737 } 2738 2739 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2740 MachineRegisterInfo &MRI, 2741 MachineIRBuilder &B) const { 2742 2743 const LLT S1 = LLT::scalar(1); 2744 const LLT S64 = LLT::scalar(64); 2745 Register Dst = MI.getOperand(0).getReg(); 2746 Register OrigSrc = MI.getOperand(1).getReg(); 2747 unsigned Flags = MI.getFlags(); 2748 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2749 "this should not have been custom lowered"); 2750 2751 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2752 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2753 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2754 // V_FRACT bug is: 2755 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2756 // 2757 // Convert floor(x) to (x - fract(x)) 2758 2759 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2760 .addUse(OrigSrc) 2761 .setMIFlags(Flags); 2762 2763 // Give source modifier matching some assistance before obscuring a foldable 2764 // pattern. 2765 2766 // TODO: We can avoid the neg on the fract? The input sign to fract 2767 // shouldn't matter? 
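  // The constant below, 0x3fefffffffffffff, is the largest double strictly
  // less than 1.0 (the 0.99999999999999999 in the comment above); clamping
  // V_FRACT's result to it implements the workaround described above.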
2768 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2769 2770 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2771 2772 Register Min = MRI.createGenericVirtualRegister(S64); 2773 2774 // We don't need to concern ourselves with the snan handling difference, so 2775 // use the one which will directly select. 2776 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2777 if (MFI->getMode().IEEE) 2778 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2779 else 2780 B.buildFMinNum(Min, Fract, Const, Flags); 2781 2782 Register CorrectedFract = Min; 2783 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2784 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2785 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2786 } 2787 2788 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2789 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2790 2791 MI.eraseFromParent(); 2792 return true; 2793 } 2794 2795 // Turn an illegal packed v2s16 build vector into bit operations. 2796 // TODO: This should probably be a bitcast action in LegalizerHelper. 2797 bool AMDGPULegalizerInfo::legalizeBuildVector( 2798 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2799 Register Dst = MI.getOperand(0).getReg(); 2800 const LLT S32 = LLT::scalar(32); 2801 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2802 2803 Register Src0 = MI.getOperand(1).getReg(); 2804 Register Src1 = MI.getOperand(2).getReg(); 2805 assert(MRI.getType(Src0) == LLT::scalar(16)); 2806 2807 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2808 B.buildBitcast(Dst, Merge); 2809 2810 MI.eraseFromParent(); 2811 return true; 2812 } 2813 2814 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2815 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2816 // case with a single min instruction instead of a compare+select. 2817 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2818 MachineRegisterInfo &MRI, 2819 MachineIRBuilder &B) const { 2820 Register Dst = MI.getOperand(0).getReg(); 2821 Register Src = MI.getOperand(1).getReg(); 2822 LLT DstTy = MRI.getType(Dst); 2823 LLT SrcTy = MRI.getType(Src); 2824 2825 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2826 ? AMDGPU::G_AMDGPU_FFBH_U32 2827 : AMDGPU::G_AMDGPU_FFBL_B32; 2828 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2829 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2830 2831 MI.eraseFromParent(); 2832 return true; 2833 } 2834 2835 // Check that this is a G_XOR x, -1 2836 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2837 if (MI.getOpcode() != TargetOpcode::G_XOR) 2838 return false; 2839 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2840 return ConstVal && *ConstVal == -1; 2841 } 2842 2843 // Return the use branch instruction, otherwise null if the usage is invalid. 
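// The expected shape (register and block names purely illustrative) is a
// single non-debug use of the intrinsic's condition output:
//   %cond = G_INTRINSIC ...
//   G_BRCOND %cond, %bb.then
//   G_BR %bb.else            ; or the conditional branch is the block's last
//                            ; instruction and falls through to the next one
// optionally with the condition first inverted through a G_XOR %cond, -1.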
2844 static MachineInstr * 2845 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2846 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2847 Register CondDef = MI.getOperand(0).getReg(); 2848 if (!MRI.hasOneNonDBGUse(CondDef)) 2849 return nullptr; 2850 2851 MachineBasicBlock *Parent = MI.getParent(); 2852 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2853 2854 if (isNot(MRI, *UseMI)) { 2855 Register NegatedCond = UseMI->getOperand(0).getReg(); 2856 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2857 return nullptr; 2858 2859 // We're deleting the def of this value, so we need to remove it. 2860 eraseInstr(*UseMI, MRI); 2861 2862 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2863 Negated = true; 2864 } 2865 2866 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2867 return nullptr; 2868 2869 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2870 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2871 if (Next == Parent->end()) { 2872 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2873 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2874 return nullptr; 2875 UncondBrTarget = &*NextMBB; 2876 } else { 2877 if (Next->getOpcode() != AMDGPU::G_BR) 2878 return nullptr; 2879 Br = &*Next; 2880 UncondBrTarget = Br->getOperand(0).getMBB(); 2881 } 2882 2883 return UseMI; 2884 } 2885 2886 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2887 const ArgDescriptor *Arg, 2888 const TargetRegisterClass *ArgRC, 2889 LLT ArgTy) const { 2890 MCRegister SrcReg = Arg->getRegister(); 2891 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2892 assert(DstReg.isVirtual() && "Virtual register expected"); 2893 2894 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2895 *ArgRC, B.getDebugLoc(), ArgTy); 2896 if (Arg->isMasked()) { 2897 // TODO: Should we try to emit this once in the entry block? 2898 const LLT S32 = LLT::scalar(32); 2899 const unsigned Mask = Arg->getMask(); 2900 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2901 2902 Register AndMaskSrc = LiveIn; 2903 2904 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2905 // 0. 2906 if (Shift != 0) { 2907 auto ShiftAmt = B.buildConstant(S32, Shift); 2908 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2909 } 2910 2911 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2912 } else { 2913 B.buildCopy(DstReg, LiveIn); 2914 } 2915 2916 return true; 2917 } 2918 2919 bool AMDGPULegalizerInfo::loadInputValue( 2920 Register DstReg, MachineIRBuilder &B, 2921 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2922 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2923 const ArgDescriptor *Arg; 2924 const TargetRegisterClass *ArgRC; 2925 LLT ArgTy; 2926 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2927 2928 if (!Arg) { 2929 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2930 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2931 // case the pointer argument may be missing and we use null. 2932 B.buildConstant(DstReg, 0); 2933 return true; 2934 } 2935 2936 // It's undefined behavior if a function marked with the amdgpu-no-* 2937 // attributes uses the corresponding intrinsic. 
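  // For example (hypothetical): a call to llvm.amdgcn.workgroup.id.y inside a
  // function carrying the "amdgpu-no-workgroup-id-y" attribute. Emitting an
  // undef keeps codegen going instead of failing on already-undefined code.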
2938 B.buildUndef(DstReg); 2939 return true; 2940 } 2941 2942 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2943 return false; // TODO: Handle these 2944 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2945 } 2946 2947 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2948 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2949 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2950 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2951 return false; 2952 2953 MI.eraseFromParent(); 2954 return true; 2955 } 2956 2957 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2958 MachineRegisterInfo &MRI, 2959 MachineIRBuilder &B) const { 2960 Register Dst = MI.getOperand(0).getReg(); 2961 LLT DstTy = MRI.getType(Dst); 2962 LLT S16 = LLT::scalar(16); 2963 LLT S32 = LLT::scalar(32); 2964 LLT S64 = LLT::scalar(64); 2965 2966 if (DstTy == S16) 2967 return legalizeFDIV16(MI, MRI, B); 2968 if (DstTy == S32) 2969 return legalizeFDIV32(MI, MRI, B); 2970 if (DstTy == S64) 2971 return legalizeFDIV64(MI, MRI, B); 2972 2973 return false; 2974 } 2975 2976 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2977 Register DstDivReg, 2978 Register DstRemReg, 2979 Register X, 2980 Register Y) const { 2981 const LLT S1 = LLT::scalar(1); 2982 const LLT S32 = LLT::scalar(32); 2983 2984 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2985 // algorithm used here. 2986 2987 // Initial estimate of inv(y). 2988 auto FloatY = B.buildUITOFP(S32, Y); 2989 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2990 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2991 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2992 auto Z = B.buildFPTOUI(S32, ScaledY); 2993 2994 // One round of UNR. 2995 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2996 auto NegYZ = B.buildMul(S32, NegY, Z); 2997 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2998 2999 // Quotient/remainder estimate. 3000 auto Q = B.buildUMulH(S32, X, Z); 3001 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3002 3003 // First quotient/remainder refinement. 3004 auto One = B.buildConstant(S32, 1); 3005 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3006 if (DstDivReg) 3007 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3008 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3009 3010 // Second quotient/remainder refinement. 
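  // A sketch of the reasoning (the reference above carries the real
  // analysis): the reciprocal estimate is accurate enough that the initial
  // remainder R exceeds the true remainder by at most 2 * Y, so two
  // conditional "subtract Y / bump Q" rounds reach the exact result.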
3011 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3012 if (DstDivReg) 3013 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3014 3015 if (DstRemReg) 3016 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3017 } 3018 3019 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3020 // 3021 // Return lo, hi of result 3022 // 3023 // %cvt.lo = G_UITOFP Val.lo 3024 // %cvt.hi = G_UITOFP Val.hi 3025 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3026 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3027 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3028 // %mul2 = G_FMUL %mul1, 2**(-32) 3029 // %trunc = G_INTRINSIC_TRUNC %mul2 3030 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3031 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3032 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3033 Register Val) { 3034 const LLT S32 = LLT::scalar(32); 3035 auto Unmerge = B.buildUnmerge(S32, Val); 3036 3037 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3038 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3039 3040 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3041 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3042 3043 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3044 auto Mul1 = 3045 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3046 3047 // 2**(-32) 3048 auto Mul2 = 3049 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3050 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3051 3052 // -(2**32) 3053 auto Mad2 = B.buildFMAD(S32, Trunc, 3054 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3055 3056 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3057 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3058 3059 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3060 } 3061 3062 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3063 Register DstDivReg, 3064 Register DstRemReg, 3065 Register Numer, 3066 Register Denom) const { 3067 const LLT S32 = LLT::scalar(32); 3068 const LLT S64 = LLT::scalar(64); 3069 const LLT S1 = LLT::scalar(1); 3070 Register RcpLo, RcpHi; 3071 3072 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3073 3074 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3075 3076 auto Zero64 = B.buildConstant(S64, 0); 3077 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3078 3079 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3080 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3081 3082 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3083 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3084 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3085 3086 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3087 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3088 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3089 3090 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3091 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3092 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3093 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3094 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3095 3096 auto Zero32 = B.buildConstant(S32, 0); 3097 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3098 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3099 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3100 3101 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3102 Register NumerLo = UnmergeNumer.getReg(0); 3103 Register NumerHi = UnmergeNumer.getReg(1); 3104 3105 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3106 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3107 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3108 Register Mul3_Lo = UnmergeMul3.getReg(0); 3109 Register Mul3_Hi = UnmergeMul3.getReg(1); 3110 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3111 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3112 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3113 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3114 3115 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3116 Register DenomLo = UnmergeDenom.getReg(0); 3117 Register DenomHi = UnmergeDenom.getReg(1); 3118 3119 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3120 auto C1 = B.buildSExt(S32, CmpHi); 3121 3122 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3123 auto C2 = B.buildSExt(S32, CmpLo); 3124 3125 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3126 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3127 3128 // TODO: Here and below portions of the code can be enclosed into if/endif. 3129 // Currently control flow is unconditional and we have 4 selects after 3130 // potential endif to substitute PHIs. 3131 3132 // if C3 != 0 ... 3133 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3134 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3135 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3136 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3137 3138 auto One64 = B.buildConstant(S64, 1); 3139 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3140 3141 auto C4 = 3142 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3143 auto C5 = 3144 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3145 auto C6 = B.buildSelect( 3146 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3147 3148 // if (C6 != 0) 3149 auto Add4 = B.buildAdd(S64, Add3, One64); 3150 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3151 3152 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3153 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3154 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3155 3156 // endif C6 3157 // endif C3 3158 3159 if (DstDivReg) { 3160 auto Sel1 = B.buildSelect( 3161 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3162 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3163 Sel1, MulHi3); 3164 } 3165 3166 if (DstRemReg) { 3167 auto Sel2 = B.buildSelect( 3168 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3169 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3170 Sel2, Sub1); 3171 } 3172 } 3173 3174 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3175 MachineRegisterInfo &MRI, 3176 MachineIRBuilder &B) const { 3177 Register DstDivReg, DstRemReg; 3178 switch (MI.getOpcode()) { 3179 default: 3180 llvm_unreachable("Unexpected opcode!"); 3181 case AMDGPU::G_UDIV: { 3182 DstDivReg = MI.getOperand(0).getReg(); 3183 break; 3184 } 3185 case AMDGPU::G_UREM: { 3186 DstRemReg = MI.getOperand(0).getReg(); 3187 break; 3188 } 3189 case AMDGPU::G_UDIVREM: { 3190 DstDivReg = MI.getOperand(0).getReg(); 3191 DstRemReg = MI.getOperand(1).getReg(); 3192 break; 3193 } 3194 } 3195 3196 const LLT S64 = LLT::scalar(64); 3197 const LLT S32 = LLT::scalar(32); 3198 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3199 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3200 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3201 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3202 3203 if (Ty == S32) 3204 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3205 else if (Ty == S64) 3206 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3207 else 3208 return false; 3209 3210 MI.eraseFromParent(); 3211 return true; 3212 } 3213 3214 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3215 MachineRegisterInfo &MRI, 3216 MachineIRBuilder &B) const { 3217 const LLT S64 = LLT::scalar(64); 3218 const LLT S32 = LLT::scalar(32); 3219 3220 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3221 if (Ty != S32 && Ty != S64) 3222 return false; 3223 3224 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3225 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3226 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3227 3228 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3229 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3230 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3231 3232 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3233 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3234 3235 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3236 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3237 3238 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3239 switch (MI.getOpcode()) { 3240 default: 3241 llvm_unreachable("Unexpected opcode!"); 3242 case AMDGPU::G_SDIV: { 3243 DstDivReg = MI.getOperand(0).getReg(); 3244 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3245 break; 3246 } 3247 case AMDGPU::G_SREM: { 3248 DstRemReg = MI.getOperand(0).getReg(); 3249 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3250 break; 3251 } 3252 case AMDGPU::G_SDIVREM: { 3253 DstDivReg = MI.getOperand(0).getReg(); 3254 DstRemReg = MI.getOperand(1).getReg(); 3255 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3256 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3257 break; 3258 } 3259 } 3260 3261 if (Ty == S32) 3262 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3263 else 3264 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3265 3266 if (DstDivReg) { 3267 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3268 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3269 B.buildSub(DstDivReg, SignXor, Sign); 3270 } 3271 3272 if (DstRemReg) { 3273 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3274 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3275 B.buildSub(DstRemReg, SignXor, Sign); 3276 } 3277 3278 MI.eraseFromParent(); 3279 return true; 3280 } 3281 3282 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3283 MachineRegisterInfo &MRI, 3284 MachineIRBuilder &B) const { 3285 Register Res = MI.getOperand(0).getReg(); 3286 Register LHS = MI.getOperand(1).getReg(); 3287 Register RHS = MI.getOperand(2).getReg(); 3288 uint16_t Flags = MI.getFlags(); 3289 LLT ResTy = MRI.getType(Res); 3290 3291 const MachineFunction &MF = B.getMF(); 3292 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3293 MI.getFlag(MachineInstr::FmAfn); 3294 3295 if (!AllowInaccurateRcp) 3296 return false; 3297 3298 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3299 // 1 / x -> RCP(x) 3300 if (CLHS->isExactlyValue(1.0)) { 3301 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3302 .addUse(RHS) 3303 .setMIFlags(Flags); 3304 3305 MI.eraseFromParent(); 3306 return true; 3307 } 3308 3309 // -1 / x -> RCP( FNEG(x) ) 3310 if (CLHS->isExactlyValue(-1.0)) { 3311 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3312 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3313 .addUse(FNeg.getReg(0)) 3314 .setMIFlags(Flags); 3315 3316 MI.eraseFromParent(); 3317 return true; 3318 } 3319 } 3320 3321 // x / y -> x * (1.0 / y) 3322 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3323 .addUse(RHS) 3324 .setMIFlags(Flags); 3325 B.buildFMul(Res, LHS, RCP, Flags); 3326 3327 MI.eraseFromParent(); 3328 return true; 3329 } 3330 3331 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3332 MachineRegisterInfo &MRI, 3333 MachineIRBuilder &B) const { 3334 Register Res = MI.getOperand(0).getReg(); 3335 Register X = MI.getOperand(1).getReg(); 3336 Register Y = MI.getOperand(2).getReg(); 3337 uint16_t Flags = MI.getFlags(); 3338 LLT ResTy = MRI.getType(Res); 3339 3340 const MachineFunction &MF = B.getMF(); 3341 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3342 MI.getFlag(MachineInstr::FmAfn); 3343 3344 if (!AllowInaccurateRcp) 3345 return false; 3346 3347 auto NegY = B.buildFNeg(ResTy, Y); 3348 auto One = B.buildFConstant(ResTy, 1.0); 3349 3350 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3351 .addUse(Y) 3352 .setMIFlags(Flags); 3353 3354 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3355 R = B.buildFMA(ResTy, Tmp0, R, R); 3356 3357 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3358 R = B.buildFMA(ResTy, Tmp1, R, R); 3359 3360 auto Ret = B.buildFMul(ResTy, X, R); 3361 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3362 3363 B.buildFMA(Res, Tmp2, R, Ret); 3364 MI.eraseFromParent(); 3365 return true; 3366 } 3367 3368 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3369 MachineRegisterInfo &MRI, 3370 MachineIRBuilder &B) const { 3371 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3372 return true; 3373 3374 Register Res = MI.getOperand(0).getReg(); 3375 Register LHS = MI.getOperand(1).getReg(); 3376 Register RHS = MI.getOperand(2).getReg(); 3377 3378 uint16_t Flags = MI.getFlags(); 3379 3380 LLT S16 = LLT::scalar(16); 3381 LLT S32 = LLT::scalar(32); 3382 3383 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3384 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3385 3386 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3387 .addUse(RHSExt.getReg(0)) 3388 .setMIFlags(Flags); 3389 3390 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3391 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3392 3393 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3394 .addUse(RDst.getReg(0)) 3395 .addUse(RHS) 3396 .addUse(LHS) 3397 .setMIFlags(Flags); 3398 3399 MI.eraseFromParent(); 3400 return true; 3401 } 3402 3403 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3404 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3405 static void toggleSPDenormMode(bool Enable, 3406 MachineIRBuilder &B, 3407 const GCNSubtarget &ST, 3408 AMDGPU::SIModeRegisterDefaults Mode) { 3409 // Set SP denorm mode to this value. 3410 unsigned SPDenormMode = 3411 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3412 3413 if (ST.hasDenormModeInst()) { 3414 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3415 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3416 3417 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3418 B.buildInstr(AMDGPU::S_DENORM_MODE) 3419 .addImm(NewDenormModeValue); 3420 3421 } else { 3422 // Select FP32 bit field in mode register. 
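// The FP32 denormal controls live in a 2-bit field starting at bit 4 of the MODE register, so build an (id = MODE, offset = 4, width - 1 = 1) hwreg descriptor for S_SETREG below.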
3423 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3424 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3425 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3426 3427 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3428 .addImm(SPDenormMode) 3429 .addImm(SPDenormModeBitField); 3430 } 3431 } 3432 3433 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3434 MachineRegisterInfo &MRI, 3435 MachineIRBuilder &B) const { 3436 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3437 return true; 3438 3439 Register Res = MI.getOperand(0).getReg(); 3440 Register LHS = MI.getOperand(1).getReg(); 3441 Register RHS = MI.getOperand(2).getReg(); 3442 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3443 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3444 3445 uint16_t Flags = MI.getFlags(); 3446 3447 LLT S32 = LLT::scalar(32); 3448 LLT S1 = LLT::scalar(1); 3449 3450 auto One = B.buildFConstant(S32, 1.0f); 3451 3452 auto DenominatorScaled = 3453 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3454 .addUse(LHS) 3455 .addUse(RHS) 3456 .addImm(0) 3457 .setMIFlags(Flags); 3458 auto NumeratorScaled = 3459 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3460 .addUse(LHS) 3461 .addUse(RHS) 3462 .addImm(1) 3463 .setMIFlags(Flags); 3464 3465 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3466 .addUse(DenominatorScaled.getReg(0)) 3467 .setMIFlags(Flags); 3468 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3469 3470 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3471 // aren't modeled as reading it. 3472 if (!Mode.allFP32Denormals()) 3473 toggleSPDenormMode(true, B, ST, Mode); 3474 3475 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3476 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3477 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3478 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3479 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3480 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3481 3482 if (!Mode.allFP32Denormals()) 3483 toggleSPDenormMode(false, B, ST, Mode); 3484 3485 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3486 .addUse(Fma4.getReg(0)) 3487 .addUse(Fma1.getReg(0)) 3488 .addUse(Fma3.getReg(0)) 3489 .addUse(NumeratorScaled.getReg(1)) 3490 .setMIFlags(Flags); 3491 3492 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3493 .addUse(Fmas.getReg(0)) 3494 .addUse(RHS) 3495 .addUse(LHS) 3496 .setMIFlags(Flags); 3497 3498 MI.eraseFromParent(); 3499 return true; 3500 } 3501 3502 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3503 MachineRegisterInfo &MRI, 3504 MachineIRBuilder &B) const { 3505 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3506 return true; 3507 3508 Register Res = MI.getOperand(0).getReg(); 3509 Register LHS = MI.getOperand(1).getReg(); 3510 Register RHS = MI.getOperand(2).getReg(); 3511 3512 uint16_t Flags = MI.getFlags(); 3513 3514 LLT S64 = LLT::scalar(64); 3515 LLT S1 = LLT::scalar(1); 3516 3517 auto One = B.buildFConstant(S64, 1.0); 3518 3519 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3520 .addUse(LHS) 3521 .addUse(RHS) 3522 .addImm(0) 3523 .setMIFlags(Flags); 3524 3525 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3526 3527 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3528 .addUse(DivScale0.getReg(0)) 3529 .setMIFlags(Flags); 3530 3531 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3532 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3533 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3534 3535 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3536 .addUse(LHS) 3537 .addUse(RHS) 3538 .addImm(1) 3539 .setMIFlags(Flags); 3540 3541 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3542 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3543 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3544 3545 Register Scale; 3546 if (!ST.hasUsableDivScaleConditionOutput()) { 3547 // Workaround a hardware bug on SI where the condition output from div_scale 3548 // is not usable. 3549 3550 LLT S32 = LLT::scalar(32); 3551 3552 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3553 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3554 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3555 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3556 3557 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3558 Scale1Unmerge.getReg(1)); 3559 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3560 Scale0Unmerge.getReg(1)); 3561 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3562 } else { 3563 Scale = DivScale1.getReg(1); 3564 } 3565 3566 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3567 .addUse(Fma4.getReg(0)) 3568 .addUse(Fma3.getReg(0)) 3569 .addUse(Mul.getReg(0)) 3570 .addUse(Scale) 3571 .setMIFlags(Flags); 3572 3573 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3574 .addUse(Fmas.getReg(0)) 3575 .addUse(RHS) 3576 .addUse(LHS) 3577 .setMIFlags(Flags); 3578 3579 MI.eraseFromParent(); 3580 return true; 3581 } 3582 3583 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3584 MachineRegisterInfo &MRI, 3585 MachineIRBuilder &B) const { 3586 Register Res = MI.getOperand(0).getReg(); 3587 Register LHS = MI.getOperand(2).getReg(); 3588 Register RHS = MI.getOperand(3).getReg(); 3589 uint16_t Flags = MI.getFlags(); 3590 3591 LLT S32 = LLT::scalar(32); 3592 LLT S1 = LLT::scalar(1); 3593 3594 auto Abs = B.buildFAbs(S32, RHS, Flags); 3595 const APFloat C0Val(1.0f); 3596 3597 auto C0 = B.buildConstant(S32, 0x6f800000); 3598 auto C1 = B.buildConstant(S32, 0x2f800000); 3599 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3600 3601 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3602 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3603 3604 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3605 3606 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3607 .addUse(Mul0.getReg(0)) 3608 .setMIFlags(Flags); 3609 3610 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3611 3612 B.buildFMul(Res, Sel, Mul1, Flags); 3613 3614 MI.eraseFromParent(); 3615 return true; 3616 } 3617 3618 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3619 // FIXME: Why do we handle this one but not other removed instructions? 3620 // 3621 // Reciprocal square root. The clamp prevents infinite results, clamping 3622 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3623 // +-max_float. 
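// In other words, the expansion below computes fmaxnum(fminnum(rsq(x), +max_float), -max_float), using the IEEE variants of min/max when the function runs in IEEE mode.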
3624 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3625 MachineRegisterInfo &MRI, 3626 MachineIRBuilder &B) const { 3627 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3628 return true; 3629 3630 Register Dst = MI.getOperand(0).getReg(); 3631 Register Src = MI.getOperand(2).getReg(); 3632 auto Flags = MI.getFlags(); 3633 3634 LLT Ty = MRI.getType(Dst); 3635 3636 const fltSemantics *FltSemantics; 3637 if (Ty == LLT::scalar(32)) 3638 FltSemantics = &APFloat::IEEEsingle(); 3639 else if (Ty == LLT::scalar(64)) 3640 FltSemantics = &APFloat::IEEEdouble(); 3641 else 3642 return false; 3643 3644 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3645 .addUse(Src) 3646 .setMIFlags(Flags); 3647 3648 // We don't need to concern ourselves with the snan handling difference, since 3649 // the rsq quieted (or not) so use the one which will directly select. 3650 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3651 const bool UseIEEE = MFI->getMode().IEEE; 3652 3653 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3654 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3655 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3656 3657 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3658 3659 if (UseIEEE) 3660 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3661 else 3662 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3663 MI.eraseFromParent(); 3664 return true; 3665 } 3666 3667 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3668 switch (IID) { 3669 case Intrinsic::amdgcn_ds_fadd: 3670 return AMDGPU::G_ATOMICRMW_FADD; 3671 case Intrinsic::amdgcn_ds_fmin: 3672 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3673 case Intrinsic::amdgcn_ds_fmax: 3674 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3675 default: 3676 llvm_unreachable("not a DS FP intrinsic"); 3677 } 3678 } 3679 3680 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3681 MachineInstr &MI, 3682 Intrinsic::ID IID) const { 3683 GISelChangeObserver &Observer = Helper.Observer; 3684 Observer.changingInstr(MI); 3685 3686 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3687 3688 // The remaining operands were used to set fields in the MemOperand on 3689 // construction. 3690 for (int I = 6; I > 3; --I) 3691 MI.RemoveOperand(I); 3692 3693 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3694 Observer.changedInstr(MI); 3695 return true; 3696 } 3697 3698 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3699 MachineRegisterInfo &MRI, 3700 MachineIRBuilder &B) const { 3701 uint64_t Offset = 3702 ST.getTargetLowering()->getImplicitParameterOffset( 3703 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3704 LLT DstTy = MRI.getType(DstReg); 3705 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3706 3707 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3708 if (!loadInputValue(KernargPtrReg, B, 3709 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3710 return false; 3711 3712 // FIXME: This should be nuw 3713 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3714 return true; 3715 } 3716 3717 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3718 MachineRegisterInfo &MRI, 3719 MachineIRBuilder &B) const { 3720 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3721 if (!MFI->isEntryFunction()) { 3722 return legalizePreloadedArgIntrin(MI, MRI, B, 3723 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3724 } 3725 3726 Register DstReg = MI.getOperand(0).getReg(); 3727 if (!getImplicitArgPtr(DstReg, MRI, B)) 3728 return false; 3729 3730 MI.eraseFromParent(); 3731 return true; 3732 } 3733 3734 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3735 MachineRegisterInfo &MRI, 3736 MachineIRBuilder &B, 3737 unsigned AddrSpace) const { 3738 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3739 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3740 Register Hi32 = Unmerge.getReg(1); 3741 3742 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3743 MI.eraseFromParent(); 3744 return true; 3745 } 3746 3747 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3748 // offset (the offset that is included in bounds checking and swizzling, to be 3749 // split between the instruction's voffset and immoffset fields) and soffset 3750 // (the offset that is excluded from bounds checking and swizzling, to go in 3751 // the instruction's soffset field). This function takes the first kind of 3752 // offset and figures out how to split it between voffset and immoffset. 3753 std::pair<Register, unsigned> 3754 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3755 Register OrigOffset) const { 3756 const unsigned MaxImm = 4095; 3757 Register BaseReg; 3758 unsigned ImmOffset; 3759 const LLT S32 = LLT::scalar(32); 3760 MachineRegisterInfo &MRI = *B.getMRI(); 3761 3762 std::tie(BaseReg, ImmOffset) = 3763 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3764 3765 // If BaseReg is a pointer, convert it to int. 3766 if (MRI.getType(BaseReg).isPointer()) 3767 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3768 3769 // If the immediate value is too big for the immoffset field, put the value 3770 // and -4096 into the immoffset field so that the value that is copied/added 3771 // for the voffset field is a multiple of 4096, and it stands more chance 3772 // of being CSEd with the copy/add for another similar load/store. 3773 // However, do not do that rounding down to a multiple of 4096 if that is a 3774 // negative number, as it appears to be illegal to have a negative offset 3775 // in the vgpr, even if adding the immediate offset makes it positive. 
3776 unsigned Overflow = ImmOffset & ~MaxImm; 3777 ImmOffset -= Overflow; 3778 if ((int32_t)Overflow < 0) { 3779 Overflow += ImmOffset; 3780 ImmOffset = 0; 3781 } 3782 3783 if (Overflow != 0) { 3784 if (!BaseReg) { 3785 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3786 } else { 3787 auto OverflowVal = B.buildConstant(S32, Overflow); 3788 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3789 } 3790 } 3791 3792 if (!BaseReg) 3793 BaseReg = B.buildConstant(S32, 0).getReg(0); 3794 3795 return std::make_pair(BaseReg, ImmOffset); 3796 } 3797 3798 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3799 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3800 Register VOffset, Register SOffset, 3801 unsigned ImmOffset, Register VIndex, 3802 MachineRegisterInfo &MRI) const { 3803 Optional<ValueAndVReg> MaybeVOffsetVal = 3804 getIConstantVRegValWithLookThrough(VOffset, MRI); 3805 Optional<ValueAndVReg> MaybeSOffsetVal = 3806 getIConstantVRegValWithLookThrough(SOffset, MRI); 3807 Optional<ValueAndVReg> MaybeVIndexVal = 3808 getIConstantVRegValWithLookThrough(VIndex, MRI); 3809 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3810 // update the MMO with that offset. The stride is unknown so we can only do 3811 // this if VIndex is constant 0. 3812 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3813 MaybeVIndexVal->Value == 0) { 3814 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3815 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3816 MMO->setOffset(TotalOffset); 3817 } else { 3818 // We don't have a constant combined offset to use in the MMO. Give up. 3819 MMO->setValue((Value *)nullptr); 3820 } 3821 } 3822 3823 /// Handle register layout difference for f16 images for some subtargets. 
3824 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3825 MachineRegisterInfo &MRI, 3826 Register Reg, 3827 bool ImageStore) const { 3828 const LLT S16 = LLT::scalar(16); 3829 const LLT S32 = LLT::scalar(32); 3830 LLT StoreVT = MRI.getType(Reg); 3831 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3832 3833 if (ST.hasUnpackedD16VMem()) { 3834 auto Unmerge = B.buildUnmerge(S16, Reg); 3835 3836 SmallVector<Register, 4> WideRegs; 3837 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3838 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3839 3840 int NumElts = StoreVT.getNumElements(); 3841 3842 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3843 .getReg(0); 3844 } 3845 3846 if (ImageStore && ST.hasImageStoreD16Bug()) { 3847 if (StoreVT.getNumElements() == 2) { 3848 SmallVector<Register, 4> PackedRegs; 3849 Reg = B.buildBitcast(S32, Reg).getReg(0); 3850 PackedRegs.push_back(Reg); 3851 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3852 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3853 .getReg(0); 3854 } 3855 3856 if (StoreVT.getNumElements() == 3) { 3857 SmallVector<Register, 4> PackedRegs; 3858 auto Unmerge = B.buildUnmerge(S16, Reg); 3859 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3860 PackedRegs.push_back(Unmerge.getReg(I)); 3861 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3862 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3863 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3864 } 3865 3866 if (StoreVT.getNumElements() == 4) { 3867 SmallVector<Register, 4> PackedRegs; 3868 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3869 auto Unmerge = B.buildUnmerge(S32, Reg); 3870 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3871 PackedRegs.push_back(Unmerge.getReg(I)); 3872 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3873 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3874 .getReg(0); 3875 } 3876 3877 llvm_unreachable("invalid data type"); 3878 } 3879 3880 if (StoreVT == LLT::fixed_vector(3, S16)) { 3881 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3882 .getReg(0); 3883 } 3884 return Reg; 3885 } 3886 3887 Register AMDGPULegalizerInfo::fixStoreSourceType( 3888 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3889 MachineRegisterInfo *MRI = B.getMRI(); 3890 LLT Ty = MRI->getType(VData); 3891 3892 const LLT S16 = LLT::scalar(16); 3893 3894 // Fixup illegal register types for i8 stores. 
3895 if (Ty == LLT::scalar(8) || Ty == S16) { 3896 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3897 return AnyExt; 3898 } 3899 3900 if (Ty.isVector()) { 3901 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3902 if (IsFormat) 3903 return handleD16VData(B, *MRI, VData); 3904 } 3905 } 3906 3907 return VData; 3908 } 3909 3910 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3911 MachineRegisterInfo &MRI, 3912 MachineIRBuilder &B, 3913 bool IsTyped, 3914 bool IsFormat) const { 3915 Register VData = MI.getOperand(1).getReg(); 3916 LLT Ty = MRI.getType(VData); 3917 LLT EltTy = Ty.getScalarType(); 3918 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3919 const LLT S32 = LLT::scalar(32); 3920 3921 VData = fixStoreSourceType(B, VData, IsFormat); 3922 Register RSrc = MI.getOperand(2).getReg(); 3923 3924 MachineMemOperand *MMO = *MI.memoperands_begin(); 3925 const int MemSize = MMO->getSize(); 3926 3927 unsigned ImmOffset; 3928 3929 // The typed intrinsics add an immediate after the registers. 3930 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3931 3932 // The struct intrinsic variants add one additional operand over raw. 3933 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3934 Register VIndex; 3935 int OpOffset = 0; 3936 if (HasVIndex) { 3937 VIndex = MI.getOperand(3).getReg(); 3938 OpOffset = 1; 3939 } else { 3940 VIndex = B.buildConstant(S32, 0).getReg(0); 3941 } 3942 3943 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3944 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3945 3946 unsigned Format = 0; 3947 if (IsTyped) { 3948 Format = MI.getOperand(5 + OpOffset).getImm(); 3949 ++OpOffset; 3950 } 3951 3952 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3953 3954 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3955 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3956 3957 unsigned Opc; 3958 if (IsTyped) { 3959 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3960 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3961 } else if (IsFormat) { 3962 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3963 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3964 } else { 3965 switch (MemSize) { 3966 case 1: 3967 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3968 break; 3969 case 2: 3970 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3971 break; 3972 default: 3973 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3974 break; 3975 } 3976 } 3977 3978 auto MIB = B.buildInstr(Opc) 3979 .addUse(VData) // vdata 3980 .addUse(RSrc) // rsrc 3981 .addUse(VIndex) // vindex 3982 .addUse(VOffset) // voffset 3983 .addUse(SOffset) // soffset 3984 .addImm(ImmOffset); // offset(imm) 3985 3986 if (IsTyped) 3987 MIB.addImm(Format); 3988 3989 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3990 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3991 .addMemOperand(MMO); 3992 3993 MI.eraseFromParent(); 3994 return true; 3995 } 3996 3997 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3998 MachineRegisterInfo &MRI, 3999 MachineIRBuilder &B, 4000 bool IsFormat, 4001 bool IsTyped) const { 4002 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4003 MachineMemOperand *MMO = *MI.memoperands_begin(); 4004 const LLT MemTy = MMO->getMemoryType(); 4005 const LLT S32 = LLT::scalar(32); 4006 4007 Register Dst = MI.getOperand(0).getReg(); 4008 Register RSrc = MI.getOperand(2).getReg(); 4009 4010 // The typed intrinsics add an immediate after the registers. 
4011 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4012 4013 // The struct intrinsic variants add one additional operand over raw. 4014 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4015 Register VIndex; 4016 int OpOffset = 0; 4017 if (HasVIndex) { 4018 VIndex = MI.getOperand(3).getReg(); 4019 OpOffset = 1; 4020 } else { 4021 VIndex = B.buildConstant(S32, 0).getReg(0); 4022 } 4023 4024 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4025 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4026 4027 unsigned Format = 0; 4028 if (IsTyped) { 4029 Format = MI.getOperand(5 + OpOffset).getImm(); 4030 ++OpOffset; 4031 } 4032 4033 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4034 unsigned ImmOffset; 4035 4036 LLT Ty = MRI.getType(Dst); 4037 LLT EltTy = Ty.getScalarType(); 4038 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4039 const bool Unpacked = ST.hasUnpackedD16VMem(); 4040 4041 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4042 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4043 4044 unsigned Opc; 4045 4046 if (IsTyped) { 4047 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4048 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4049 } else if (IsFormat) { 4050 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4051 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4052 } else { 4053 switch (MemTy.getSizeInBits()) { 4054 case 8: 4055 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4056 break; 4057 case 16: 4058 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4059 break; 4060 default: 4061 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4062 break; 4063 } 4064 } 4065 4066 Register LoadDstReg; 4067 4068 bool IsExtLoad = 4069 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4070 LLT UnpackedTy = Ty.changeElementSize(32); 4071 4072 if (IsExtLoad) 4073 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4074 else if (Unpacked && IsD16 && Ty.isVector()) 4075 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4076 else 4077 LoadDstReg = Dst; 4078 4079 auto MIB = B.buildInstr(Opc) 4080 .addDef(LoadDstReg) // vdata 4081 .addUse(RSrc) // rsrc 4082 .addUse(VIndex) // vindex 4083 .addUse(VOffset) // voffset 4084 .addUse(SOffset) // soffset 4085 .addImm(ImmOffset); // offset(imm) 4086 4087 if (IsTyped) 4088 MIB.addImm(Format); 4089 4090 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4091 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4092 .addMemOperand(MMO); 4093 4094 if (LoadDstReg != Dst) { 4095 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4096 4097 // The result was widened for an extending load; truncate back to the original type. 4098 if (IsExtLoad) 4099 B.buildTrunc(Dst, LoadDstReg); 4100 else { 4101 // Repack to original 16-bit vector result 4102 // FIXME: G_TRUNC should work, but legalization currently fails 4103 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4104 SmallVector<Register, 4> Repack; 4105 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4106 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4107 B.buildMerge(Dst, Repack); 4108 } 4109 } 4110 4111 MI.eraseFromParent(); 4112 return true; 4113 } 4114 4115 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4116 MachineIRBuilder &B, 4117 bool IsInc) const { 4118 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4119 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4120 B.buildInstr(Opc) 4121 .addDef(MI.getOperand(0).getReg()) 4122 .addUse(MI.getOperand(2).getReg()) 4123 .addUse(MI.getOperand(3).getReg()) 4124 .cloneMemRefs(MI); 4125 MI.eraseFromParent(); 4126 return true; 4127 } 4128 4129 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4130 switch (IntrID) { 4131 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4132 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4133 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4134 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4135 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4136 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4137 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4138 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4140 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4141 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4143 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4144 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4145 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4146 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4147 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4148 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4149 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4150 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4151 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4152 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4153 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4154 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4155 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4156 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4158 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4159 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4160 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4161 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4162 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4163 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4164 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4165 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4166 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4167 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4168 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4169 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4170 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4171 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4172 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4173 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4174 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4175 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4176 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4177 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4178 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4179 default: 4180 llvm_unreachable("unhandled atomic opcode"); 4181 } 4182 } 4183 4184 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4185 MachineIRBuilder &B, 4186 Intrinsic::ID IID) const { 4187 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4188 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4189 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4190 4191 Register Dst; 4192 4193 int OpOffset = 0; 4194 if (HasReturn) { 4195 // A few FP atomics do not support return values. 
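// When there is no def, every source operand index used below shifts down by one, which is what OpOffset = -1 accounts for.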
4196 Dst = MI.getOperand(0).getReg(); 4197 } else { 4198 OpOffset = -1; 4199 } 4200 4201 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4202 Register CmpVal; 4203 4204 if (IsCmpSwap) { 4205 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4206 ++OpOffset; 4207 } 4208 4209 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4210 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4211 4212 // The struct intrinsic variants add one additional operand over raw. 4213 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4214 Register VIndex; 4215 if (HasVIndex) { 4216 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4217 ++OpOffset; 4218 } else { 4219 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4220 } 4221 4222 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4223 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4224 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4225 4226 MachineMemOperand *MMO = *MI.memoperands_begin(); 4227 4228 unsigned ImmOffset; 4229 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4230 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4231 4232 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4233 4234 if (HasReturn) 4235 MIB.addDef(Dst); 4236 4237 MIB.addUse(VData); // vdata 4238 4239 if (IsCmpSwap) 4240 MIB.addReg(CmpVal); 4241 4242 MIB.addUse(RSrc) // rsrc 4243 .addUse(VIndex) // vindex 4244 .addUse(VOffset) // voffset 4245 .addUse(SOffset) // soffset 4246 .addImm(ImmOffset) // offset(imm) 4247 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4248 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4249 .addMemOperand(MMO); 4250 4251 MI.eraseFromParent(); 4252 return true; 4253 } 4254 4255 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4256 /// vector with s16 typed elements. 4257 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4258 SmallVectorImpl<Register> &PackedAddrs, 4259 unsigned ArgOffset, 4260 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4261 bool IsA16, bool IsG16) { 4262 const LLT S16 = LLT::scalar(16); 4263 const LLT V2S16 = LLT::fixed_vector(2, 16); 4264 auto EndIdx = Intr->VAddrEnd; 4265 4266 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4267 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4268 if (!SrcOp.isReg()) 4269 continue; // _L to _LZ may have eliminated this. 4270 4271 Register AddrReg = SrcOp.getReg(); 4272 4273 if ((I < Intr->GradientStart) || 4274 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4275 (I >= Intr->CoordStart && !IsA16)) { 4276 // Handle any gradient or coordinate operands that should not be packed 4277 if ((I < Intr->GradientStart) && IsA16 && 4278 (B.getMRI()->getType(AddrReg) == S16)) { 4279 // Special handling of bias when A16 is on. Bias is of type half but 4280 // occupies full 32-bit. 4281 PackedAddrs.push_back( 4282 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4283 .getReg(0)); 4284 } else { 4285 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4286 PackedAddrs.push_back(AddrReg); 4287 } 4288 } else { 4289 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4290 // derivatives dx/dh and dx/dv are packed with undef. 
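// Concretely: pair this operand with undef when it is the last address operand, when it is the final derivative of an odd-length dh or dv gradient group, or when the following operand has already been removed (e.g. by the _L to _LZ fold); otherwise pack it together with the next operand.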
4291 if (((I + 1) >= EndIdx) || 4292 ((Intr->NumGradients / 2) % 2 == 1 && 4293 (I == static_cast<unsigned>(Intr->GradientStart + 4294 (Intr->NumGradients / 2) - 1) || 4295 I == static_cast<unsigned>(Intr->GradientStart + 4296 Intr->NumGradients - 1))) || 4297 // Check for _L to _LZ optimization 4298 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4299 PackedAddrs.push_back( 4300 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4301 .getReg(0)); 4302 } else { 4303 PackedAddrs.push_back( 4304 B.buildBuildVector( 4305 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4306 .getReg(0)); 4307 ++I; 4308 } 4309 } 4310 } 4311 } 4312 4313 /// Convert from separate vaddr components to a single vector address register, 4314 /// and replace the remaining operands with $noreg. 4315 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4316 int DimIdx, int NumVAddrs) { 4317 const LLT S32 = LLT::scalar(32); 4318 4319 SmallVector<Register, 8> AddrRegs; 4320 for (int I = 0; I != NumVAddrs; ++I) { 4321 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4322 if (SrcOp.isReg()) { 4323 AddrRegs.push_back(SrcOp.getReg()); 4324 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4325 } 4326 } 4327 4328 int NumAddrRegs = AddrRegs.size(); 4329 if (NumAddrRegs != 1) { 4330 // Above 8 elements round up to next power of 2 (i.e. 16). 4331 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4332 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4333 auto Undef = B.buildUndef(S32); 4334 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4335 NumAddrRegs = RoundedNumRegs; 4336 } 4337 4338 auto VAddr = 4339 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4340 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4341 } 4342 4343 for (int I = 1; I != NumVAddrs; ++I) { 4344 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4345 if (SrcOp.isReg()) 4346 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4347 } 4348 } 4349 4350 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4351 /// 4352 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 4353 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4354 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4355 /// registers. 4356 /// 4357 /// We don't want to directly select image instructions just yet, but also want 4358 /// to expose all register repacking to the legalizer/combiners. We also don't 4359 /// want a selected instruction entering RegBankSelect. In order to avoid 4360 /// defining a multitude of intermediate image instructions, directly hack on 4361 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4362 /// padding now-unnecessary arguments with $noreg. 4363 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4364 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4365 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4366 4367 const unsigned NumDefs = MI.getNumExplicitDefs(); 4368 const unsigned ArgOffset = NumDefs + 1; 4369 bool IsTFE = NumDefs == 2; 4370 // We are only processing the operands of d16 image operations on subtargets 4371 // that use the unpacked register layout, or need to repack the TFE result. 4372 4373 // TODO: Do we need to guard against already legalized intrinsics?
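// Roughly, the rewrite below: trims or fixes the dmask, packs X2 atomic data, folds _L/_mip forms with a zero LOD, packs 16-bit addresses (or converts to a single packed vector address when NSA is not used), and finally repacks the D16/TFE result registers.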
4374 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4375 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4376 4377 MachineRegisterInfo *MRI = B.getMRI(); 4378 const LLT S32 = LLT::scalar(32); 4379 const LLT S16 = LLT::scalar(16); 4380 const LLT V2S16 = LLT::fixed_vector(2, 16); 4381 4382 unsigned DMask = 0; 4383 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4384 LLT Ty = MRI->getType(VData); 4385 4386 // Check for 16 bit addresses and pack if true. 4387 LLT GradTy = 4388 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4389 LLT AddrTy = 4390 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4391 const bool IsG16 = GradTy == S16; 4392 const bool IsA16 = AddrTy == S16; 4393 const bool IsD16 = Ty.getScalarType() == S16; 4394 4395 int DMaskLanes = 0; 4396 if (!BaseOpcode->Atomic) { 4397 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4398 if (BaseOpcode->Gather4) { 4399 DMaskLanes = 4; 4400 } else if (DMask != 0) { 4401 DMaskLanes = countPopulation(DMask); 4402 } else if (!IsTFE && !BaseOpcode->Store) { 4403 // If dmask is 0, this is a no-op load. This can be eliminated. 4404 B.buildUndef(MI.getOperand(0)); 4405 MI.eraseFromParent(); 4406 return true; 4407 } 4408 } 4409 4410 Observer.changingInstr(MI); 4411 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4412 4413 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4414 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4415 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4416 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4417 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4418 4419 // Track that we legalized this 4420 MI.setDesc(B.getTII().get(NewOpcode)); 4421 4422 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4423 // dmask to be at least 1 otherwise the instruction will fail 4424 if (IsTFE && DMask == 0) { 4425 DMask = 0x1; 4426 DMaskLanes = 1; 4427 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4428 } 4429 4430 if (BaseOpcode->Atomic) { 4431 Register VData0 = MI.getOperand(2).getReg(); 4432 LLT Ty = MRI->getType(VData0); 4433 4434 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4435 if (Ty.isVector()) 4436 return false; 4437 4438 if (BaseOpcode->AtomicX2) { 4439 Register VData1 = MI.getOperand(3).getReg(); 4440 // The two values are packed in one register. 4441 LLT PackedTy = LLT::fixed_vector(2, Ty); 4442 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4443 MI.getOperand(2).setReg(Concat.getReg(0)); 4444 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4445 } 4446 } 4447 4448 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4449 4450 // Optimize _L to _LZ when _L is zero 4451 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4452 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4453 const ConstantFP *ConstantLod; 4454 4455 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4456 m_GFCst(ConstantLod))) { 4457 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4458 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4459 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4460 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, 4461 Intr->Dim); 4462 4463 // The starting indexes should remain in the same place. 
4464 --CorrectedNumVAddrs; 4465 4466 MI.getOperand(NumDefs).setIntrinsicID( 4467 static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4468 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4469 Intr = NewImageDimIntr; 4470 } 4471 } 4472 } 4473 4474 // Optimize _mip away, when 'lod' is zero 4475 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4476 int64_t ConstantLod; 4477 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4478 m_ICst(ConstantLod))) { 4479 if (ConstantLod == 0) { 4480 // TODO: Change intrinsic opcode and remove operand instead or replacing 4481 // it with 0, as the _L to _LZ handling is done above. 4482 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4483 --CorrectedNumVAddrs; 4484 } 4485 } 4486 } 4487 4488 // Rewrite the addressing register layout before doing anything else. 4489 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4490 // 16 bit gradients are supported, but are tied to the A16 control 4491 // so both gradients and addresses must be 16 bit 4492 return false; 4493 } 4494 4495 if (IsA16 && !ST.hasA16()) { 4496 // A16 not supported 4497 return false; 4498 } 4499 4500 if (IsA16 || IsG16) { 4501 if (Intr->NumVAddrs > 1) { 4502 SmallVector<Register, 4> PackedRegs; 4503 4504 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4505 IsG16); 4506 4507 // See also below in the non-a16 branch 4508 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4509 PackedRegs.size() <= ST.getNSAMaxSize(); 4510 4511 if (!UseNSA && PackedRegs.size() > 1) { 4512 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4513 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4514 PackedRegs[0] = Concat.getReg(0); 4515 PackedRegs.resize(1); 4516 } 4517 4518 const unsigned NumPacked = PackedRegs.size(); 4519 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4520 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4521 if (!SrcOp.isReg()) { 4522 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4523 continue; 4524 } 4525 4526 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4527 4528 if (I - Intr->VAddrStart < NumPacked) 4529 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4530 else 4531 SrcOp.setReg(AMDGPU::NoRegister); 4532 } 4533 } 4534 } else { 4535 // If the register allocator cannot place the address registers contiguously 4536 // without introducing moves, then using the non-sequential address encoding 4537 // is always preferable, since it saves VALU instructions and is usually a 4538 // wash in terms of code size or even better. 4539 // 4540 // However, we currently have no way of hinting to the register allocator 4541 // that MIMG addresses should be placed contiguously when it is possible to 4542 // do so, so force non-NSA for the common 2-address case as a heuristic. 4543 // 4544 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4545 // allocation when possible. 4546 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4547 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4548 4549 if (!UseNSA && Intr->NumVAddrs > 1) 4550 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4551 Intr->NumVAddrs); 4552 } 4553 4554 int Flags = 0; 4555 if (IsA16) 4556 Flags |= 1; 4557 if (IsG16) 4558 Flags |= 2; 4559 MI.addOperand(MachineOperand::CreateImm(Flags)); 4560 4561 if (BaseOpcode->Store) { // No TFE for stores? 
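// Stores only need the D16 data repacking below; everything after this block deals with the load result.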
4562 // TODO: Handle dmask trim 4563 if (!Ty.isVector() || !IsD16) 4564 return true; 4565 4566 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4567 if (RepackedReg != VData) { 4568 MI.getOperand(1).setReg(RepackedReg); 4569 } 4570 4571 return true; 4572 } 4573 4574 Register DstReg = MI.getOperand(0).getReg(); 4575 const LLT EltTy = Ty.getScalarType(); 4576 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4577 4578 // Confirm that the return type is large enough for the dmask specified 4579 if (NumElts < DMaskLanes) 4580 return false; 4581 4582 if (NumElts > 4 || DMaskLanes > 4) 4583 return false; 4584 4585 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4586 const LLT AdjustedTy = 4587 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4588 4589 // The raw dword aligned data component of the load. The only legal cases 4590 // where this matters should be when using the packed D16 format, for 4591 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4592 LLT RoundedTy; 4593 4594 // S32 vector to to cover all data, plus TFE result element. 4595 LLT TFETy; 4596 4597 // Register type to use for each loaded component. Will be S32 or V2S16. 4598 LLT RegTy; 4599 4600 if (IsD16 && ST.hasUnpackedD16VMem()) { 4601 RoundedTy = 4602 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4603 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4604 RegTy = S32; 4605 } else { 4606 unsigned EltSize = EltTy.getSizeInBits(); 4607 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4608 unsigned RoundedSize = 32 * RoundedElts; 4609 RoundedTy = LLT::scalarOrVector( 4610 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4611 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4612 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4613 } 4614 4615 // The return type does not need adjustment. 4616 // TODO: Should we change s16 case to s32 or <2 x s16>? 4617 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4618 return true; 4619 4620 Register Dst1Reg; 4621 4622 // Insert after the instruction. 4623 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4624 4625 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4626 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4627 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4628 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4629 4630 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4631 4632 MI.getOperand(0).setReg(NewResultReg); 4633 4634 // In the IR, TFE is supposed to be used with a 2 element struct return 4635 // type. The instruction really returns these two values in one contiguous 4636 // register, with one additional dword beyond the loaded data. Rewrite the 4637 // return type to use a single register result. 4638 4639 if (IsTFE) { 4640 Dst1Reg = MI.getOperand(1).getReg(); 4641 if (MRI->getType(Dst1Reg) != S32) 4642 return false; 4643 4644 // TODO: Make sure the TFE operand bit is set. 4645 MI.RemoveOperand(1); 4646 4647 // Handle the easy case that requires no repack instructions. 4648 if (Ty == S32) { 4649 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4650 return true; 4651 } 4652 } 4653 4654 // Now figure out how to copy the new result register back into the old 4655 // result. 4656 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4657 4658 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4659 4660 if (ResultNumRegs == 1) { 4661 assert(!IsTFE); 4662 ResultRegs[0] = NewResultReg; 4663 } else { 4664 // We have to repack into a new vector of some kind. 4665 for (int I = 0; I != NumDataRegs; ++I) 4666 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4667 B.buildUnmerge(ResultRegs, NewResultReg); 4668 4669 // Drop the final TFE element to get the data part. The TFE result is 4670 // directly written to the right place already. 4671 if (IsTFE) 4672 ResultRegs.resize(NumDataRegs); 4673 } 4674 4675 // For an s16 scalar result, we form an s32 result with a truncate regardless 4676 // of packed vs. unpacked. 4677 if (IsD16 && !Ty.isVector()) { 4678 B.buildTrunc(DstReg, ResultRegs[0]); 4679 return true; 4680 } 4681 4682 // Avoid a build/concat_vector of 1 entry. 4683 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4684 B.buildBitcast(DstReg, ResultRegs[0]); 4685 return true; 4686 } 4687 4688 assert(Ty.isVector()); 4689 4690 if (IsD16) { 4691 // For packed D16 results with TFE enabled, all the data components are 4692 // S32. Cast back to the expected type. 4693 // 4694 // TODO: We don't really need to use load s32 elements. We would only need one 4695 // cast for the TFE result if a multiple of v2s16 was used. 4696 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4697 for (Register &Reg : ResultRegs) 4698 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4699 } else if (ST.hasUnpackedD16VMem()) { 4700 for (Register &Reg : ResultRegs) 4701 Reg = B.buildTrunc(S16, Reg).getReg(0); 4702 } 4703 } 4704 4705 auto padWithUndef = [&](LLT Ty, int NumElts) { 4706 if (NumElts == 0) 4707 return; 4708 Register Undef = B.buildUndef(Ty).getReg(0); 4709 for (int I = 0; I != NumElts; ++I) 4710 ResultRegs.push_back(Undef); 4711 }; 4712 4713 // Pad out any elements eliminated due to the dmask. 4714 LLT ResTy = MRI->getType(ResultRegs[0]); 4715 if (!ResTy.isVector()) { 4716 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4717 B.buildBuildVector(DstReg, ResultRegs); 4718 return true; 4719 } 4720 4721 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4722 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4723 4724 // Deal with the one annoying legal case. 
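// That case is a <3 x s16> result: the repacked <2 x s16> pieces (concatenated to <4 x s16> when TFE is used) are trimmed or padded to match the destination element count.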
4725 const LLT V3S16 = LLT::fixed_vector(3, 16); 4726 if (Ty == V3S16) { 4727 if (IsTFE) { 4728 if (ResultRegs.size() == 1) { 4729 NewResultReg = ResultRegs[0]; 4730 } else if (ResultRegs.size() == 2) { 4731 LLT V4S16 = LLT::fixed_vector(4, 16); 4732 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4733 } else { 4734 return false; 4735 } 4736 } 4737 4738 if (MRI->getType(DstReg).getNumElements() < 4739 MRI->getType(NewResultReg).getNumElements()) { 4740 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4741 } else { 4742 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4743 } 4744 return true; 4745 } 4746 4747 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4748 B.buildConcatVectors(DstReg, ResultRegs); 4749 return true; 4750 } 4751 4752 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4753 LegalizerHelper &Helper, MachineInstr &MI) const { 4754 MachineIRBuilder &B = Helper.MIRBuilder; 4755 GISelChangeObserver &Observer = Helper.Observer; 4756 4757 Register Dst = MI.getOperand(0).getReg(); 4758 LLT Ty = B.getMRI()->getType(Dst); 4759 unsigned Size = Ty.getSizeInBits(); 4760 MachineFunction &MF = B.getMF(); 4761 4762 Observer.changingInstr(MI); 4763 4764 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4765 Ty = getBitcastRegisterType(Ty); 4766 Helper.bitcastDst(MI, Ty, 0); 4767 Dst = MI.getOperand(0).getReg(); 4768 B.setInsertPt(B.getMBB(), MI); 4769 } 4770 4771 // FIXME: We don't really need this intermediate instruction. The intrinsic 4772 // should be fixed to have a memory operand. Since it's readnone, we're not 4773 // allowed to add one. 4774 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4775 MI.RemoveOperand(1); // Remove intrinsic ID 4776 4777 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4778 // TODO: Should this use datalayout alignment? 4779 const unsigned MemSize = (Size + 7) / 8; 4780 const Align MemAlign(4); 4781 MachineMemOperand *MMO = MF.getMachineMemOperand( 4782 MachinePointerInfo(), 4783 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4784 MachineMemOperand::MOInvariant, 4785 MemSize, MemAlign); 4786 MI.addMemOperand(MF, MMO); 4787 4788 // There are no 96-bit result scalar loads, but widening to 128-bit should 4789 // always be legal. We may need to restore this to a 96-bit result if it turns 4790 // out this needs to be converted to a vector load during RegBankSelect. 4791 if (!isPowerOf2_32(Size)) { 4792 if (Ty.isVector()) 4793 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4794 else 4795 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4796 } 4797 4798 Observer.changedInstr(MI); 4799 return true; 4800 } 4801 4802 // TODO: Move to selection 4803 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4804 MachineRegisterInfo &MRI, 4805 MachineIRBuilder &B) const { 4806 if (!ST.isTrapHandlerEnabled() || 4807 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4808 return legalizeTrapEndpgm(MI, MRI, B); 4809 4810 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4811 switch (*HsaAbiVer) { 4812 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4813 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4814 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4815 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4816 return ST.supportsGetDoorbellID() ? 
4817 legalizeTrapHsa(MI, MRI, B) : 4818 legalizeTrapHsaQueuePtr(MI, MRI, B); 4819 } 4820 } 4821 4822 llvm_unreachable("Unknown trap handler"); 4823 } 4824 4825 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4826 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4827 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4828 MI.eraseFromParent(); 4829 return true; 4830 } 4831 4832 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4833 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4834 // Pass queue pointer to trap handler as input, and insert trap instruction 4835 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4836 Register LiveIn = 4837 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4838 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4839 return false; 4840 4841 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4842 B.buildCopy(SGPR01, LiveIn); 4843 B.buildInstr(AMDGPU::S_TRAP) 4844 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4845 .addReg(SGPR01, RegState::Implicit); 4846 4847 MI.eraseFromParent(); 4848 return true; 4849 } 4850 4851 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4852 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4853 B.buildInstr(AMDGPU::S_TRAP) 4854 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4855 MI.eraseFromParent(); 4856 return true; 4857 } 4858 4859 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4860 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4861 // Is non-HSA path or trap-handler disabled? Then, report a warning 4862 // accordingly 4863 if (!ST.isTrapHandlerEnabled() || 4864 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4865 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4866 "debugtrap handler not supported", 4867 MI.getDebugLoc(), DS_Warning); 4868 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4869 Ctx.diagnose(NoTrap); 4870 } else { 4871 // Insert debug-trap instruction 4872 B.buildInstr(AMDGPU::S_TRAP) 4873 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4874 } 4875 4876 MI.eraseFromParent(); 4877 return true; 4878 } 4879 4880 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4881 MachineIRBuilder &B) const { 4882 MachineRegisterInfo &MRI = *B.getMRI(); 4883 const LLT S16 = LLT::scalar(16); 4884 const LLT S32 = LLT::scalar(32); 4885 4886 Register DstReg = MI.getOperand(0).getReg(); 4887 Register NodePtr = MI.getOperand(2).getReg(); 4888 Register RayExtent = MI.getOperand(3).getReg(); 4889 Register RayOrigin = MI.getOperand(4).getReg(); 4890 Register RayDir = MI.getOperand(5).getReg(); 4891 Register RayInvDir = MI.getOperand(6).getReg(); 4892 Register TDescr = MI.getOperand(7).getReg(); 4893 4894 if (!ST.hasGFX10_AEncoding()) { 4895 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4896 "intrinsic not supported on subtarget", 4897 MI.getDebugLoc()); 4898 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4899 return false; 4900 } 4901 4902 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4903 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4904 const unsigned NumVDataDwords = 4; 4905 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); 4906 const bool UseNSA = 4907 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4908 const unsigned BaseOpcodes[2][2] = { 4909 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4910 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4911 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4912 int Opcode; 4913 if (UseNSA) { 4914 Opcode = 4915 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4916 NumVDataDwords, NumVAddrDwords); 4917 } else { 4918 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4919 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4920 PowerOf2Ceil(NumVAddrDwords)); 4921 } 4922 assert(Opcode != -1); 4923 4924 SmallVector<Register, 12> Ops; 4925 if (Is64) { 4926 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4927 Ops.push_back(Unmerge.getReg(0)); 4928 Ops.push_back(Unmerge.getReg(1)); 4929 } else { 4930 Ops.push_back(NodePtr); 4931 } 4932 Ops.push_back(RayExtent); 4933 4934 auto packLanes = [&Ops, &S32, &B](Register Src) { 4935 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4936 Ops.push_back(Unmerge.getReg(0)); 4937 Ops.push_back(Unmerge.getReg(1)); 4938 Ops.push_back(Unmerge.getReg(2)); 4939 }; 4940 4941 packLanes(RayOrigin); 4942 if (IsA16) { 4943 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4944 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4945 Register R1 = MRI.createGenericVirtualRegister(S32); 4946 Register R2 = MRI.createGenericVirtualRegister(S32); 4947 Register R3 = MRI.createGenericVirtualRegister(S32); 4948 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4949 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4950 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4951 Ops.push_back(R1); 4952 Ops.push_back(R2); 4953 Ops.push_back(R3); 4954 } else { 4955 packLanes(RayDir); 4956 packLanes(RayInvDir); 4957 } 4958 4959 if (!UseNSA) { 4960 // Build a single vector containing all the operands so far prepared. 4961 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4962 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4963 Ops.clear(); 4964 Ops.push_back(MergedOps); 4965 } 4966 4967 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4968 .addDef(DstReg) 4969 .addImm(Opcode); 4970 4971 for (Register R : Ops) { 4972 MIB.addUse(R); 4973 } 4974 4975 MIB.addUse(TDescr) 4976 .addImm(IsA16 ? 1 : 0) 4977 .cloneMemRefs(MI); 4978 4979 MI.eraseFromParent(); 4980 return true; 4981 } 4982 4983 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { 4984 B.buildConstant(MI.getOperand(0).getReg(), C); 4985 MI.eraseFromParent(); 4986 return true; 4987 } 4988 4989 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4990 MachineInstr &MI) const { 4991 MachineIRBuilder &B = Helper.MIRBuilder; 4992 MachineRegisterInfo &MRI = *B.getMRI(); 4993 4994 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
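// The exec-mask manipulation applies to the amdgcn_if/else/loop cases; most other intrinsics here lower to preloaded-argument copies or simple constants.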
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const bool UseNSA =
      ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode =
        AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
                              NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
                                   PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (Is64) {
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  auto packLanes = [&Ops, &S32, &B](Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

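// Replace MI's single result with the constant C and erase MI. Used below for
// intrinsics whose result is known to be constant on the current target, e.g.
// workitem IDs in dimensions that have only one workitem.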
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

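// Intrinsic-specific legalization, called from the generic LegalizerHelper for
// target intrinsics. For the structurizer control-flow intrinsics
// (amdgcn.if/else/loop) the matching G_BRCOND (and trailing G_BR, if any) is
// rewritten into the exec-manipulating SI_IF/SI_ELSE/SI_LOOP pseudos;
// schematically (illustrative, not exact MIR syntax):
//
//   %flag, %mask = llvm.amdgcn.if(%cond)
//   G_BRCOND %flag, %bb.then
//   G_BR %bb.flow
//
// becomes
//
//   %mask = SI_IF %cond, %bb.flow
//   G_BR %bb.then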
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}