1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 30 #define DEBUG_TYPE "amdgpu-legalinfo" 31 32 using namespace llvm; 33 using namespace LegalizeActions; 34 using namespace LegalizeMutations; 35 using namespace LegalityPredicates; 36 using namespace MIPatternMatch; 37 38 // Hack until load/store selection patterns support any tuple of legal types. 39 static cl::opt<bool> EnableNewLegality( 40 "amdgpu-global-isel-new-legality", 41 cl::desc("Use GlobalISel desired legality, rather than try to use" 42 "rules compatible with selection patterns"), 43 cl::init(false), 44 cl::ReallyHidden); 45 46 static constexpr unsigned MaxRegisterSize = 1024; 47 48 // Round the number of elements to the next power of two elements 49 static LLT getPow2VectorType(LLT Ty) { 50 unsigned NElts = Ty.getNumElements(); 51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 53 } 54 55 // Round the number of bits to the next power of two bits 56 static LLT getPow2ScalarType(LLT Ty) { 57 unsigned Bits = Ty.getSizeInBits(); 58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 59 return LLT::scalar(Pow2Bits); 60 } 61 62 /// \returs true if this is an odd sized vector which should widen by adding an 63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 64 /// excludes s1 vectors, which should always be scalarized. 65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT QueryTy = Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 unsigned AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .clampScalar(0, S16, S32) 536 .clampMaxNumElements(0, S16, 2) 537 .scalarize(0) 538 .widenScalarToNextPow2(0, 32); 539 540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 541 .legalFor({S32, S16, V2S16}) // Clamp modifier 542 .minScalarOrElt(0, S16) 543 .clampMaxNumElements(0, S16, 2) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 32) 546 .lower(); 547 } else if (ST.has16BitInsts()) { 548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 549 .legalFor({S32, S16}) 550 .clampScalar(0, S16, S32) 551 .scalarize(0) 552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 553 554 // Technically the saturating operations require clamp bit support, but this 555 // was introduced at the same time as 16-bit operations. 556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 557 .legalFor({S32, S16}) // Clamp modifier 558 .minScalar(0, S16) 559 .scalarize(0) 560 .widenScalarToNextPow2(0, 16) 561 .lower(); 562 563 // We're just lowering this, but it helps get a better result to try to 564 // coerce to the desired type first. 565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 566 .minScalar(0, S16) 567 .scalarize(0) 568 .lower(); 569 } else { 570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 571 .legalFor({S32}) 572 .clampScalar(0, S32, S32) 573 .scalarize(0); 574 575 if (ST.hasIntClamp()) { 576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 577 .legalFor({S32}) // Clamp modifier. 578 .scalarize(0) 579 .minScalarOrElt(0, S32) 580 .lower(); 581 } else { 582 // Clamp bit support was added in VI, along with 16-bit operations. 583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 // FIXME: DAG expansion gets better results. The widening uses the smaller 590 // range values and goes for the min/max lowering directly. 591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 592 .minScalar(0, S32) 593 .scalarize(0) 594 .lower(); 595 } 596 597 getActionDefinitionsBuilder( 598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 599 .customFor({S32, S64}) 600 .clampScalar(0, S32, S64) 601 .widenScalarToNextPow2(0, 32) 602 .scalarize(0); 603 604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 605 .legalFor({S32}) 606 .maxScalarOrElt(0, S32); 607 608 if (ST.hasVOP3PInsts()) { 609 Mulh 610 .clampMaxNumElements(0, S8, 2) 611 .lowerFor({V2S8}); 612 } 613 614 Mulh 615 .scalarize(0) 616 .lower(); 617 618 // Report legal for any types we can handle anywhere. For the cases only legal 619 // on the SALU, RegBankSelect will be able to re-legalize. 620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 622 .clampScalar(0, S32, S64) 623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 628 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 630 .legalFor({{S32, S1}, {S32, S32}}) 631 .minScalar(0, S32) 632 // TODO: .scalarize(0) 633 .lower(); 634 635 getActionDefinitionsBuilder(G_BITCAST) 636 // Don't worry about the size constraint. 637 .legalIf(all(isRegisterType(0), isRegisterType(1))) 638 .lower(); 639 640 641 getActionDefinitionsBuilder(G_CONSTANT) 642 .legalFor({S1, S32, S64, S16, GlobalPtr, 643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 644 .legalIf(isPointer(0)) 645 .clampScalar(0, S32, S64) 646 .widenScalarToNextPow2(0); 647 648 getActionDefinitionsBuilder(G_FCONSTANT) 649 .legalFor({S32, S64, S16}) 650 .clampScalar(0, S16, S64); 651 652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 653 .legalIf(isRegisterType(0)) 654 // s1 and s16 are special cases because they have legal operations on 655 // them, but don't really occupy registers in the normal way. 656 .legalFor({S1, S16}) 657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 658 .clampScalarOrElt(0, S32, MaxScalar) 659 .widenScalarToNextPow2(0, 32) 660 .clampMaxNumElements(0, S32, 16); 661 662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 663 664 // If the amount is divergent, we have to do a wave reduction to get the 665 // maximum value, so this is expanded during RegBankSelect. 666 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 667 .legalFor({{PrivatePtr, S32}}); 668 669 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 670 .customIf(typeIsNot(0, PrivatePtr)); 671 672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 673 674 auto &FPOpActions = getActionDefinitionsBuilder( 675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 676 .legalFor({S32, S64}); 677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 678 .customFor({S32, S64}); 679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 680 .customFor({S32, S64}); 681 682 if (ST.has16BitInsts()) { 683 if (ST.hasVOP3PInsts()) 684 FPOpActions.legalFor({S16, V2S16}); 685 else 686 FPOpActions.legalFor({S16}); 687 688 TrigActions.customFor({S16}); 689 FDIVActions.customFor({S16}); 690 } 691 692 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 694 695 if (ST.hasVOP3PInsts()) { 696 MinNumMaxNum.customFor(FPTypesPK16) 697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 698 .clampMaxNumElements(0, S16, 2) 699 .clampScalar(0, S16, S64) 700 .scalarize(0); 701 } else if (ST.has16BitInsts()) { 702 MinNumMaxNum.customFor(FPTypes16) 703 .clampScalar(0, S16, S64) 704 .scalarize(0); 705 } else { 706 MinNumMaxNum.customFor(FPTypesBase) 707 .clampScalar(0, S32, S64) 708 .scalarize(0); 709 } 710 711 if (ST.hasVOP3PInsts()) 712 FPOpActions.clampMaxNumElements(0, S16, 2); 713 714 FPOpActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 TrigActions 719 .scalarize(0) 720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 721 722 FDIVActions 723 .scalarize(0) 724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 725 726 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 727 .legalFor(FPTypesPK16) 728 .clampMaxNumElements(0, S16, 2) 729 .scalarize(0) 730 .clampScalar(0, S16, S64); 731 732 if (ST.has16BitInsts()) { 733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 734 .legalFor({S32, S64, S16}) 735 .scalarize(0) 736 .clampScalar(0, S16, S64); 737 } else { 738 getActionDefinitionsBuilder(G_FSQRT) 739 .legalFor({S32, S64}) 740 .scalarize(0) 741 .clampScalar(0, S32, S64); 742 743 if (ST.hasFractBug()) { 744 getActionDefinitionsBuilder(G_FFLOOR) 745 .customFor({S64}) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } else { 750 getActionDefinitionsBuilder(G_FFLOOR) 751 .legalFor({S32, S64}) 752 .scalarize(0) 753 .clampScalar(0, S32, S64); 754 } 755 } 756 757 getActionDefinitionsBuilder(G_FPTRUNC) 758 .legalFor({{S32, S64}, {S16, S32}}) 759 .scalarize(0) 760 .lower(); 761 762 getActionDefinitionsBuilder(G_FPEXT) 763 .legalFor({{S64, S32}, {S32, S16}}) 764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 765 .scalarize(0); 766 767 getActionDefinitionsBuilder(G_FSUB) 768 // Use actual fsub instruction 769 .legalFor({S32}) 770 // Must use fadd + fneg 771 .lowerFor({S64, S16, V2S16}) 772 .scalarize(0) 773 .clampScalar(0, S32, S64); 774 775 // Whether this is legal depends on the floating point mode for the function. 776 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 777 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 778 FMad.customFor({S32, S16}); 779 else if (ST.hasMadMacF32Insts()) 780 FMad.customFor({S32}); 781 else if (ST.hasMadF16()) 782 FMad.customFor({S16}); 783 FMad.scalarize(0) 784 .lower(); 785 786 auto &FRem = getActionDefinitionsBuilder(G_FREM); 787 if (ST.has16BitInsts()) { 788 FRem.customFor({S16, S32, S64}); 789 } else { 790 FRem.minScalar(0, S32) 791 .customFor({S32, S64}); 792 } 793 FRem.scalarize(0); 794 795 // TODO: Do we need to clamp maximum bitwidth? 796 getActionDefinitionsBuilder(G_TRUNC) 797 .legalIf(isScalar(0)) 798 .legalFor({{V2S16, V2S32}}) 799 .clampMaxNumElements(0, S16, 2) 800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 801 // situations (like an invalid implicit use), we don't want to infinite loop 802 // in the legalizer. 803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 804 .alwaysLegal(); 805 806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 808 {S32, S1}, {S64, S1}, {S16, S1}}) 809 .scalarize(0) 810 .clampScalar(0, S32, S64) 811 .widenScalarToNextPow2(1, 32); 812 813 // TODO: Split s1->s64 during regbankselect for VALU. 814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 816 .lowerIf(typeIs(1, S1)) 817 .customFor({{S32, S64}, {S64, S64}}); 818 if (ST.has16BitInsts()) 819 IToFP.legalFor({{S16, S16}}); 820 IToFP.clampScalar(1, S32, S64) 821 .minScalar(0, S32) 822 .scalarize(0) 823 .widenScalarToNextPow2(1); 824 825 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 826 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 827 .customFor({{S64, S32}, {S64, S64}}) 828 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 829 if (ST.has16BitInsts()) 830 FPToI.legalFor({{S16, S16}}); 831 else 832 FPToI.minScalar(1, S32); 833 834 FPToI.minScalar(0, S32) 835 .widenScalarToNextPow2(0, 32) 836 .scalarize(0) 837 .lower(); 838 839 // Lower roundeven into G_FRINT 840 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 841 .scalarize(0) 842 .lower(); 843 844 if (ST.has16BitInsts()) { 845 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 846 .legalFor({S16, S32, S64}) 847 .clampScalar(0, S16, S64) 848 .scalarize(0); 849 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 850 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 851 .legalFor({S32, S64}) 852 .clampScalar(0, S32, S64) 853 .scalarize(0); 854 } else { 855 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 856 .legalFor({S32}) 857 .customFor({S64}) 858 .clampScalar(0, S32, S64) 859 .scalarize(0); 860 } 861 862 getActionDefinitionsBuilder(G_PTR_ADD) 863 .legalIf(all(isPointer(0), sameSize(0, 1))) 864 .scalarize(0) 865 .scalarSameSizeAs(1, 0); 866 867 getActionDefinitionsBuilder(G_PTRMASK) 868 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 869 .scalarSameSizeAs(1, 0) 870 .scalarize(0); 871 872 auto &CmpBuilder = 873 getActionDefinitionsBuilder(G_ICMP) 874 // The compare output type differs based on the register bank of the output, 875 // so make both s1 and s32 legal. 876 // 877 // Scalar compares producing output in scc will be promoted to s32, as that 878 // is the allocatable register type that will be needed for the copy from 879 // scc. This will be promoted during RegBankSelect, and we assume something 880 // before that won't try to use s32 result types. 881 // 882 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 883 // bank. 884 .legalForCartesianProduct( 885 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 886 .legalForCartesianProduct( 887 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 888 if (ST.has16BitInsts()) { 889 CmpBuilder.legalFor({{S1, S16}}); 890 } 891 892 CmpBuilder 893 .widenScalarToNextPow2(1) 894 .clampScalar(1, S32, S64) 895 .scalarize(0) 896 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 897 898 getActionDefinitionsBuilder(G_FCMP) 899 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 900 .widenScalarToNextPow2(1) 901 .clampScalar(1, S32, S64) 902 .scalarize(0); 903 904 // FIXME: fpow has a selection pattern that should move to custom lowering. 905 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 906 if (ST.has16BitInsts()) 907 Exp2Ops.legalFor({S32, S16}); 908 else 909 Exp2Ops.legalFor({S32}); 910 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 911 Exp2Ops.scalarize(0); 912 913 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 914 if (ST.has16BitInsts()) 915 ExpOps.customFor({{S32}, {S16}}); 916 else 917 ExpOps.customFor({S32}); 918 ExpOps.clampScalar(0, MinScalarFPTy, S32) 919 .scalarize(0); 920 921 getActionDefinitionsBuilder(G_FPOWI) 922 .clampScalar(0, MinScalarFPTy, S32) 923 .lower(); 924 925 // The 64-bit versions produce 32-bit results, but only on the SALU. 926 getActionDefinitionsBuilder(G_CTPOP) 927 .legalFor({{S32, S32}, {S32, S64}}) 928 .clampScalar(0, S32, S32) 929 .clampScalar(1, S32, S64) 930 .scalarize(0) 931 .widenScalarToNextPow2(0, 32) 932 .widenScalarToNextPow2(1, 32); 933 934 // The hardware instructions return a different result on 0 than the generic 935 // instructions expect. The hardware produces -1, but these produce the 936 // bitwidth. 937 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 938 .scalarize(0) 939 .clampScalar(0, S32, S32) 940 .clampScalar(1, S32, S64) 941 .widenScalarToNextPow2(0, 32) 942 .widenScalarToNextPow2(1, 32) 943 .lower(); 944 945 // The 64-bit versions produce 32-bit results, but only on the SALU. 946 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 947 .legalFor({{S32, S32}, {S32, S64}}) 948 .clampScalar(0, S32, S32) 949 .clampScalar(1, S32, S64) 950 .scalarize(0) 951 .widenScalarToNextPow2(0, 32) 952 .widenScalarToNextPow2(1, 32); 953 954 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 955 // RegBankSelect. 956 getActionDefinitionsBuilder(G_BITREVERSE) 957 .legalFor({S32, S64}) 958 .clampScalar(0, S32, S64) 959 .scalarize(0) 960 .widenScalarToNextPow2(0); 961 962 if (ST.has16BitInsts()) { 963 getActionDefinitionsBuilder(G_BSWAP) 964 .legalFor({S16, S32, V2S16}) 965 .clampMaxNumElements(0, S16, 2) 966 // FIXME: Fixing non-power-of-2 before clamp is workaround for 967 // narrowScalar limitation. 968 .widenScalarToNextPow2(0) 969 .clampScalar(0, S16, S32) 970 .scalarize(0); 971 972 if (ST.hasVOP3PInsts()) { 973 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 974 .legalFor({S32, S16, V2S16}) 975 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 976 .clampMaxNumElements(0, S16, 2) 977 .minScalar(0, S16) 978 .widenScalarToNextPow2(0) 979 .scalarize(0) 980 .lower(); 981 } else { 982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 983 .legalFor({S32, S16}) 984 .widenScalarToNextPow2(0) 985 .minScalar(0, S16) 986 .scalarize(0) 987 .lower(); 988 } 989 } else { 990 // TODO: Should have same legality without v_perm_b32 991 getActionDefinitionsBuilder(G_BSWAP) 992 .legalFor({S32}) 993 .lowerIf(scalarNarrowerThan(0, 32)) 994 // FIXME: Fixing non-power-of-2 before clamp is workaround for 995 // narrowScalar limitation. 996 .widenScalarToNextPow2(0) 997 .maxScalar(0, S32) 998 .scalarize(0) 999 .lower(); 1000 1001 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1002 .legalFor({S32}) 1003 .minScalar(0, S32) 1004 .widenScalarToNextPow2(0) 1005 .scalarize(0) 1006 .lower(); 1007 } 1008 1009 getActionDefinitionsBuilder(G_INTTOPTR) 1010 // List the common cases 1011 .legalForCartesianProduct(AddrSpaces64, {S64}) 1012 .legalForCartesianProduct(AddrSpaces32, {S32}) 1013 .scalarize(0) 1014 // Accept any address space as long as the size matches 1015 .legalIf(sameSize(0, 1)) 1016 .widenScalarIf(smallerThan(1, 0), 1017 [](const LegalityQuery &Query) { 1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1019 }) 1020 .narrowScalarIf(largerThan(1, 0), 1021 [](const LegalityQuery &Query) { 1022 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1023 }); 1024 1025 getActionDefinitionsBuilder(G_PTRTOINT) 1026 // List the common cases 1027 .legalForCartesianProduct(AddrSpaces64, {S64}) 1028 .legalForCartesianProduct(AddrSpaces32, {S32}) 1029 .scalarize(0) 1030 // Accept any address space as long as the size matches 1031 .legalIf(sameSize(0, 1)) 1032 .widenScalarIf(smallerThan(0, 1), 1033 [](const LegalityQuery &Query) { 1034 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1035 }) 1036 .narrowScalarIf( 1037 largerThan(0, 1), 1038 [](const LegalityQuery &Query) { 1039 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1040 }); 1041 1042 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1043 .scalarize(0) 1044 .custom(); 1045 1046 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1047 bool IsLoad) -> bool { 1048 const LLT DstTy = Query.Types[0]; 1049 1050 // Split vector extloads. 1051 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1052 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1053 1054 if (MemSize < DstTy.getSizeInBits()) 1055 MemSize = std::max(MemSize, AlignBits); 1056 1057 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1058 return true; 1059 1060 const LLT PtrTy = Query.Types[1]; 1061 unsigned AS = PtrTy.getAddressSpace(); 1062 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1063 return true; 1064 1065 // Catch weird sized loads that don't evenly divide into the access sizes 1066 // TODO: May be able to widen depending on alignment etc. 1067 unsigned NumRegs = (MemSize + 31) / 32; 1068 if (NumRegs == 3) { 1069 if (!ST.hasDwordx3LoadStores()) 1070 return true; 1071 } else { 1072 // If the alignment allows, these should have been widened. 1073 if (!isPowerOf2_32(NumRegs)) 1074 return true; 1075 } 1076 1077 if (AlignBits < MemSize) { 1078 const SITargetLowering *TLI = ST.getTargetLowering(); 1079 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1080 Align(AlignBits / 8)); 1081 } 1082 1083 return false; 1084 }; 1085 1086 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1087 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1088 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1089 1090 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1091 // LDS 1092 // TODO: Unsupported flat for SI. 1093 1094 for (unsigned Op : {G_LOAD, G_STORE}) { 1095 const bool IsStore = Op == G_STORE; 1096 1097 auto &Actions = getActionDefinitionsBuilder(Op); 1098 // Explicitly list some common cases. 1099 // TODO: Does this help compile time at all? 1100 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1101 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1102 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1103 {S64, GlobalPtr, S64, GlobalAlign32}, 1104 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1105 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1106 {S32, GlobalPtr, S8, GlobalAlign8}, 1107 {S32, GlobalPtr, S16, GlobalAlign16}, 1108 1109 {S32, LocalPtr, S32, 32}, 1110 {S64, LocalPtr, S64, 32}, 1111 {V2S32, LocalPtr, V2S32, 32}, 1112 {S32, LocalPtr, S8, 8}, 1113 {S32, LocalPtr, S16, 16}, 1114 {V2S16, LocalPtr, S32, 32}, 1115 1116 {S32, PrivatePtr, S32, 32}, 1117 {S32, PrivatePtr, S8, 8}, 1118 {S32, PrivatePtr, S16, 16}, 1119 {V2S16, PrivatePtr, S32, 32}, 1120 1121 {S32, ConstantPtr, S32, GlobalAlign32}, 1122 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1123 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1124 {S64, ConstantPtr, S64, GlobalAlign32}, 1125 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1126 Actions.legalIf( 1127 [=](const LegalityQuery &Query) -> bool { 1128 return isLoadStoreLegal(ST, Query); 1129 }); 1130 1131 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1132 // 64-bits. 1133 // 1134 // TODO: Should generalize bitcast action into coerce, which will also cover 1135 // inserting addrspacecasts. 1136 Actions.customIf(typeIs(1, Constant32Ptr)); 1137 1138 // Turn any illegal element vectors into something easier to deal 1139 // with. These will ultimately produce 32-bit scalar shifts to extract the 1140 // parts anyway. 1141 // 1142 // For odd 16-bit element vectors, prefer to split those into pieces with 1143 // 16-bit vector parts. 1144 Actions.bitcastIf( 1145 [=](const LegalityQuery &Query) -> bool { 1146 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1147 Query.MMODescrs[0].MemoryTy); 1148 }, bitcastToRegisterType(0)); 1149 1150 if (!IsStore) { 1151 // Widen suitably aligned loads by loading extra bytes. The standard 1152 // legalization actions can't properly express widening memory operands. 1153 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1154 return shouldWidenLoad(ST, Query, G_LOAD); 1155 }); 1156 } 1157 1158 // FIXME: load/store narrowing should be moved to lower action 1159 Actions 1160 .narrowScalarIf( 1161 [=](const LegalityQuery &Query) -> bool { 1162 return !Query.Types[0].isVector() && 1163 needToSplitMemOp(Query, Op == G_LOAD); 1164 }, 1165 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1166 const LLT DstTy = Query.Types[0]; 1167 const LLT PtrTy = Query.Types[1]; 1168 1169 const unsigned DstSize = DstTy.getSizeInBits(); 1170 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1171 1172 // Split extloads. 1173 if (DstSize > MemSize) 1174 return std::make_pair(0, LLT::scalar(MemSize)); 1175 1176 if (!isPowerOf2_32(DstSize)) { 1177 // We're probably decomposing an odd sized store. Try to split 1178 // to the widest type. TODO: Account for alignment. As-is it 1179 // should be OK, since the new parts will be further legalized. 1180 unsigned FloorSize = PowerOf2Floor(DstSize); 1181 return std::make_pair(0, LLT::scalar(FloorSize)); 1182 } 1183 1184 if (DstSize > 32 && (DstSize % 32 != 0)) { 1185 // FIXME: Need a way to specify non-extload of larger size if 1186 // suitably aligned. 1187 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1188 } 1189 1190 unsigned MaxSize = maxSizeForAddrSpace(ST, 1191 PtrTy.getAddressSpace(), 1192 Op == G_LOAD); 1193 if (MemSize > MaxSize) 1194 return std::make_pair(0, LLT::scalar(MaxSize)); 1195 1196 unsigned Align = Query.MMODescrs[0].AlignInBits; 1197 return std::make_pair(0, LLT::scalar(Align)); 1198 }) 1199 .fewerElementsIf( 1200 [=](const LegalityQuery &Query) -> bool { 1201 return Query.Types[0].isVector() && 1202 needToSplitMemOp(Query, Op == G_LOAD); 1203 }, 1204 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1205 const LLT DstTy = Query.Types[0]; 1206 const LLT PtrTy = Query.Types[1]; 1207 1208 LLT EltTy = DstTy.getElementType(); 1209 unsigned MaxSize = maxSizeForAddrSpace(ST, 1210 PtrTy.getAddressSpace(), 1211 Op == G_LOAD); 1212 1213 // FIXME: Handle widened to power of 2 results better. This ends 1214 // up scalarizing. 1215 // FIXME: 3 element stores scalarized on SI 1216 1217 // Split if it's too large for the address space. 1218 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1219 if (MemSize > MaxSize) { 1220 unsigned NumElts = DstTy.getNumElements(); 1221 unsigned EltSize = EltTy.getSizeInBits(); 1222 1223 if (MaxSize % EltSize == 0) { 1224 return std::make_pair( 1225 0, LLT::scalarOrVector( 1226 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1227 } 1228 1229 unsigned NumPieces = MemSize / MaxSize; 1230 1231 // FIXME: Refine when odd breakdowns handled 1232 // The scalars will need to be re-legalized. 1233 if (NumPieces == 1 || NumPieces >= NumElts || 1234 NumElts % NumPieces != 0) 1235 return std::make_pair(0, EltTy); 1236 1237 return std::make_pair( 1238 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1239 } 1240 1241 // FIXME: We could probably handle weird extending loads better. 1242 if (DstTy.getSizeInBits() > MemSize) 1243 return std::make_pair(0, EltTy); 1244 1245 unsigned EltSize = EltTy.getSizeInBits(); 1246 unsigned DstSize = DstTy.getSizeInBits(); 1247 if (!isPowerOf2_32(DstSize)) { 1248 // We're probably decomposing an odd sized store. Try to split 1249 // to the widest type. TODO: Account for alignment. As-is it 1250 // should be OK, since the new parts will be further legalized. 1251 unsigned FloorSize = PowerOf2Floor(DstSize); 1252 return std::make_pair( 1253 0, LLT::scalarOrVector( 1254 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1255 } 1256 1257 // Need to split because of alignment. 1258 unsigned Align = Query.MMODescrs[0].AlignInBits; 1259 if (EltSize > Align && 1260 (EltSize / Align < DstTy.getNumElements())) { 1261 return std::make_pair( 1262 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1263 } 1264 1265 // May need relegalization for the scalars. 1266 return std::make_pair(0, EltTy); 1267 }) 1268 .minScalar(0, S32) 1269 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1270 .widenScalarToNextPow2(0) 1271 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1272 .lower(); 1273 } 1274 1275 // FIXME: Unaligned accesses not lowered. 1276 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1277 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1278 {S32, GlobalPtr, S16, 2 * 8}, 1279 {S32, LocalPtr, S8, 8}, 1280 {S32, LocalPtr, S16, 16}, 1281 {S32, PrivatePtr, S8, 8}, 1282 {S32, PrivatePtr, S16, 16}, 1283 {S32, ConstantPtr, S8, 8}, 1284 {S32, ConstantPtr, S16, 2 * 8}}) 1285 .legalIf( 1286 [=](const LegalityQuery &Query) -> bool { 1287 return isLoadStoreLegal(ST, Query); 1288 }); 1289 1290 if (ST.hasFlatAddressSpace()) { 1291 ExtLoads.legalForTypesWithMemDesc( 1292 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1293 } 1294 1295 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1296 // 64-bits. 1297 // 1298 // TODO: Should generalize bitcast action into coerce, which will also cover 1299 // inserting addrspacecasts. 1300 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1301 1302 ExtLoads.clampScalar(0, S32, S32) 1303 .widenScalarToNextPow2(0) 1304 .lower(); 1305 1306 auto &Atomics = getActionDefinitionsBuilder( 1307 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1308 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1309 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1310 G_ATOMICRMW_UMIN}) 1311 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1312 {S64, GlobalPtr}, {S64, LocalPtr}, 1313 {S32, RegionPtr}, {S64, RegionPtr}}); 1314 if (ST.hasFlatAddressSpace()) { 1315 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1316 } 1317 1318 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1319 if (ST.hasLDSFPAtomics()) { 1320 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1321 if (ST.hasGFX90AInsts()) 1322 Atomic.legalFor({{S64, LocalPtr}}); 1323 } 1324 if (ST.hasAtomicFaddInsts()) 1325 Atomic.legalFor({{S32, GlobalPtr}}); 1326 1327 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1328 // demarshalling 1329 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1330 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1331 {S32, FlatPtr}, {S64, FlatPtr}}) 1332 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1333 {S32, RegionPtr}, {S64, RegionPtr}}); 1334 // TODO: Pointer types, any 32-bit or 64-bit vector 1335 1336 // Condition should be s32 for scalar, s1 for vector. 1337 getActionDefinitionsBuilder(G_SELECT) 1338 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1339 LocalPtr, FlatPtr, PrivatePtr, 1340 LLT::fixed_vector(2, LocalPtr), 1341 LLT::fixed_vector(2, PrivatePtr)}, 1342 {S1, S32}) 1343 .clampScalar(0, S16, S64) 1344 .scalarize(1) 1345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1346 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1347 .clampMaxNumElements(0, S32, 2) 1348 .clampMaxNumElements(0, LocalPtr, 2) 1349 .clampMaxNumElements(0, PrivatePtr, 2) 1350 .scalarize(0) 1351 .widenScalarToNextPow2(0) 1352 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1353 1354 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1355 // be more flexible with the shift amount type. 1356 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1357 .legalFor({{S32, S32}, {S64, S32}}); 1358 if (ST.has16BitInsts()) { 1359 if (ST.hasVOP3PInsts()) { 1360 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1361 .clampMaxNumElements(0, S16, 2); 1362 } else 1363 Shifts.legalFor({{S16, S16}}); 1364 1365 // TODO: Support 16-bit shift amounts for all types 1366 Shifts.widenScalarIf( 1367 [=](const LegalityQuery &Query) { 1368 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1369 // 32-bit amount. 1370 const LLT ValTy = Query.Types[0]; 1371 const LLT AmountTy = Query.Types[1]; 1372 return ValTy.getSizeInBits() <= 16 && 1373 AmountTy.getSizeInBits() < 16; 1374 }, changeTo(1, S16)); 1375 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1376 Shifts.clampScalar(1, S32, S32); 1377 Shifts.clampScalar(0, S16, S64); 1378 Shifts.widenScalarToNextPow2(0, 16); 1379 1380 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1381 .minScalar(0, S16) 1382 .scalarize(0) 1383 .lower(); 1384 } else { 1385 // Make sure we legalize the shift amount type first, as the general 1386 // expansion for the shifted type will produce much worse code if it hasn't 1387 // been truncated already. 1388 Shifts.clampScalar(1, S32, S32); 1389 Shifts.clampScalar(0, S32, S64); 1390 Shifts.widenScalarToNextPow2(0, 32); 1391 1392 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1393 .minScalar(0, S32) 1394 .scalarize(0) 1395 .lower(); 1396 } 1397 Shifts.scalarize(0); 1398 1399 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1400 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1401 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1402 unsigned IdxTypeIdx = 2; 1403 1404 getActionDefinitionsBuilder(Op) 1405 .customIf([=](const LegalityQuery &Query) { 1406 const LLT EltTy = Query.Types[EltTypeIdx]; 1407 const LLT VecTy = Query.Types[VecTypeIdx]; 1408 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1409 const unsigned EltSize = EltTy.getSizeInBits(); 1410 return (EltSize == 32 || EltSize == 64) && 1411 VecTy.getSizeInBits() % 32 == 0 && 1412 VecTy.getSizeInBits() <= MaxRegisterSize && 1413 IdxTy.getSizeInBits() == 32; 1414 }) 1415 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1416 bitcastToVectorElement32(VecTypeIdx)) 1417 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1418 .bitcastIf( 1419 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1420 [=](const LegalityQuery &Query) { 1421 // For > 64-bit element types, try to turn this into a 64-bit 1422 // element vector since we may be able to do better indexing 1423 // if this is scalar. If not, fall back to 32. 1424 const LLT EltTy = Query.Types[EltTypeIdx]; 1425 const LLT VecTy = Query.Types[VecTypeIdx]; 1426 const unsigned DstEltSize = EltTy.getSizeInBits(); 1427 const unsigned VecSize = VecTy.getSizeInBits(); 1428 1429 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1430 return std::make_pair( 1431 VecTypeIdx, 1432 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1433 }) 1434 .clampScalar(EltTypeIdx, S32, S64) 1435 .clampScalar(VecTypeIdx, S32, S64) 1436 .clampScalar(IdxTypeIdx, S32, S32) 1437 .clampMaxNumElements(VecTypeIdx, S32, 32) 1438 // TODO: Clamp elements for 64-bit vectors? 1439 // It should only be necessary with variable indexes. 1440 // As a last resort, lower to the stack 1441 .lower(); 1442 } 1443 1444 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1445 .unsupportedIf([=](const LegalityQuery &Query) { 1446 const LLT &EltTy = Query.Types[1].getElementType(); 1447 return Query.Types[0] != EltTy; 1448 }); 1449 1450 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1451 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1452 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1453 1454 // FIXME: Doesn't handle extract of illegal sizes. 1455 getActionDefinitionsBuilder(Op) 1456 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1457 // FIXME: Multiples of 16 should not be legal. 1458 .legalIf([=](const LegalityQuery &Query) { 1459 const LLT BigTy = Query.Types[BigTyIdx]; 1460 const LLT LitTy = Query.Types[LitTyIdx]; 1461 return (BigTy.getSizeInBits() % 32 == 0) && 1462 (LitTy.getSizeInBits() % 16 == 0); 1463 }) 1464 .widenScalarIf( 1465 [=](const LegalityQuery &Query) { 1466 const LLT BigTy = Query.Types[BigTyIdx]; 1467 return (BigTy.getScalarSizeInBits() < 16); 1468 }, 1469 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1470 .widenScalarIf( 1471 [=](const LegalityQuery &Query) { 1472 const LLT LitTy = Query.Types[LitTyIdx]; 1473 return (LitTy.getScalarSizeInBits() < 16); 1474 }, 1475 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1476 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1477 .widenScalarToNextPow2(BigTyIdx, 32); 1478 1479 } 1480 1481 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1482 .legalForCartesianProduct(AllS32Vectors, {S32}) 1483 .legalForCartesianProduct(AllS64Vectors, {S64}) 1484 .clampNumElements(0, V16S32, V32S32) 1485 .clampNumElements(0, V2S64, V16S64) 1486 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1487 1488 if (ST.hasScalarPackInsts()) { 1489 BuildVector 1490 // FIXME: Should probably widen s1 vectors straight to s32 1491 .minScalarOrElt(0, S16) 1492 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1493 .minScalar(1, S32); 1494 1495 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1496 .legalFor({V2S16, S32}) 1497 .lower(); 1498 BuildVector.minScalarOrElt(0, S32); 1499 } else { 1500 BuildVector.customFor({V2S16, S16}); 1501 BuildVector.minScalarOrElt(0, S32); 1502 1503 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1504 .customFor({V2S16, S32}) 1505 .lower(); 1506 } 1507 1508 BuildVector.legalIf(isRegisterType(0)); 1509 1510 // FIXME: Clamp maximum size 1511 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1512 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1513 .clampMaxNumElements(0, S32, 32) 1514 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1515 .clampMaxNumElements(0, S16, 64); 1516 1517 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1518 // pre-legalize. 1519 if (ST.hasVOP3PInsts()) { 1520 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1521 .customFor({V2S16, V2S16}) 1522 .lower(); 1523 } else 1524 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1525 1526 // Merge/Unmerge 1527 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1528 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1529 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1530 1531 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1532 const LLT Ty = Query.Types[TypeIdx]; 1533 if (Ty.isVector()) { 1534 const LLT &EltTy = Ty.getElementType(); 1535 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1536 return true; 1537 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1538 return true; 1539 } 1540 return false; 1541 }; 1542 1543 auto &Builder = getActionDefinitionsBuilder(Op) 1544 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1545 .lowerFor({{S16, V2S16}}) 1546 .lowerIf([=](const LegalityQuery &Query) { 1547 const LLT BigTy = Query.Types[BigTyIdx]; 1548 return BigTy.getSizeInBits() == 32; 1549 }) 1550 // Try to widen to s16 first for small types. 1551 // TODO: Only do this on targets with legal s16 shifts 1552 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1554 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1555 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1556 elementTypeIs(1, S16)), 1557 changeTo(1, V2S16)) 1558 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1559 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1560 // valid. 1561 .clampScalar(LitTyIdx, S32, S512) 1562 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1563 // Break up vectors with weird elements into scalars 1564 .fewerElementsIf( 1565 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1566 scalarize(0)) 1567 .fewerElementsIf( 1568 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1569 scalarize(1)) 1570 .clampScalar(BigTyIdx, S32, MaxScalar); 1571 1572 if (Op == G_MERGE_VALUES) { 1573 Builder.widenScalarIf( 1574 // TODO: Use 16-bit shifts if legal for 8-bit values? 1575 [=](const LegalityQuery &Query) { 1576 const LLT Ty = Query.Types[LitTyIdx]; 1577 return Ty.getSizeInBits() < 32; 1578 }, 1579 changeTo(LitTyIdx, S32)); 1580 } 1581 1582 Builder.widenScalarIf( 1583 [=](const LegalityQuery &Query) { 1584 const LLT Ty = Query.Types[BigTyIdx]; 1585 return !isPowerOf2_32(Ty.getSizeInBits()) && 1586 Ty.getSizeInBits() % 16 != 0; 1587 }, 1588 [=](const LegalityQuery &Query) { 1589 // Pick the next power of 2, or a multiple of 64 over 128. 1590 // Whichever is smaller. 1591 const LLT &Ty = Query.Types[BigTyIdx]; 1592 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1593 if (NewSizeInBits >= 256) { 1594 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1595 if (RoundedTo < NewSizeInBits) 1596 NewSizeInBits = RoundedTo; 1597 } 1598 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1599 }) 1600 // Any vectors left are the wrong size. Scalarize them. 1601 .scalarize(0) 1602 .scalarize(1); 1603 } 1604 1605 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1606 // RegBankSelect. 1607 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1608 .legalFor({{S32}, {S64}}); 1609 1610 if (ST.hasVOP3PInsts()) { 1611 SextInReg.lowerFor({{V2S16}}) 1612 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1613 // get more vector shift opportunities, since we'll get those when 1614 // expanded. 1615 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1616 } else if (ST.has16BitInsts()) { 1617 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1618 } else { 1619 // Prefer to promote to s32 before lowering if we don't have 16-bit 1620 // shifts. This avoid a lot of intermediate truncate and extend operations. 1621 SextInReg.lowerFor({{S32}, {S64}}); 1622 } 1623 1624 SextInReg 1625 .scalarize(0) 1626 .clampScalar(0, S32, S64) 1627 .lower(); 1628 1629 // TODO: Only Try to form v2s16 with legal packed instructions. 1630 getActionDefinitionsBuilder(G_FSHR) 1631 .legalFor({{S32, S32}}) 1632 .lowerFor({{V2S16, V2S16}}) 1633 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1634 .scalarize(0) 1635 .lower(); 1636 1637 if (ST.hasVOP3PInsts()) { 1638 getActionDefinitionsBuilder(G_FSHL) 1639 .lowerFor({{V2S16, V2S16}}) 1640 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1641 .scalarize(0) 1642 .lower(); 1643 } else { 1644 getActionDefinitionsBuilder(G_FSHL) 1645 .scalarize(0) 1646 .lower(); 1647 } 1648 1649 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1650 .legalFor({S64}); 1651 1652 getActionDefinitionsBuilder(G_FENCE) 1653 .alwaysLegal(); 1654 1655 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1656 .scalarize(0) 1657 .minScalar(0, S32) 1658 .lower(); 1659 1660 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1661 .legalFor({{S32, S32}, {S64, S32}}) 1662 .clampScalar(1, S32, S32) 1663 .clampScalar(0, S32, S64) 1664 .widenScalarToNextPow2(0) 1665 .scalarize(0); 1666 1667 getActionDefinitionsBuilder({ 1668 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1669 G_FCOPYSIGN, 1670 1671 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1672 G_ATOMICRMW_NAND, 1673 G_ATOMICRMW_FSUB, 1674 G_READ_REGISTER, 1675 G_WRITE_REGISTER, 1676 1677 G_SADDO, G_SSUBO, 1678 1679 // TODO: Implement 1680 G_FMINIMUM, G_FMAXIMUM}).lower(); 1681 1682 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1683 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1684 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1685 .unsupported(); 1686 1687 getLegacyLegalizerInfo().computeTables(); 1688 verify(*ST.getInstrInfo()); 1689 } 1690 1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1692 MachineInstr &MI) const { 1693 MachineIRBuilder &B = Helper.MIRBuilder; 1694 MachineRegisterInfo &MRI = *B.getMRI(); 1695 1696 switch (MI.getOpcode()) { 1697 case TargetOpcode::G_ADDRSPACE_CAST: 1698 return legalizeAddrSpaceCast(MI, MRI, B); 1699 case TargetOpcode::G_FRINT: 1700 return legalizeFrint(MI, MRI, B); 1701 case TargetOpcode::G_FCEIL: 1702 return legalizeFceil(MI, MRI, B); 1703 case TargetOpcode::G_FREM: 1704 return legalizeFrem(MI, MRI, B); 1705 case TargetOpcode::G_INTRINSIC_TRUNC: 1706 return legalizeIntrinsicTrunc(MI, MRI, B); 1707 case TargetOpcode::G_SITOFP: 1708 return legalizeITOFP(MI, MRI, B, true); 1709 case TargetOpcode::G_UITOFP: 1710 return legalizeITOFP(MI, MRI, B, false); 1711 case TargetOpcode::G_FPTOSI: 1712 return legalizeFPTOI(MI, MRI, B, true); 1713 case TargetOpcode::G_FPTOUI: 1714 return legalizeFPTOI(MI, MRI, B, false); 1715 case TargetOpcode::G_FMINNUM: 1716 case TargetOpcode::G_FMAXNUM: 1717 case TargetOpcode::G_FMINNUM_IEEE: 1718 case TargetOpcode::G_FMAXNUM_IEEE: 1719 return legalizeMinNumMaxNum(Helper, MI); 1720 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1721 return legalizeExtractVectorElt(MI, MRI, B); 1722 case TargetOpcode::G_INSERT_VECTOR_ELT: 1723 return legalizeInsertVectorElt(MI, MRI, B); 1724 case TargetOpcode::G_SHUFFLE_VECTOR: 1725 return legalizeShuffleVector(MI, MRI, B); 1726 case TargetOpcode::G_FSIN: 1727 case TargetOpcode::G_FCOS: 1728 return legalizeSinCos(MI, MRI, B); 1729 case TargetOpcode::G_GLOBAL_VALUE: 1730 return legalizeGlobalValue(MI, MRI, B); 1731 case TargetOpcode::G_LOAD: 1732 case TargetOpcode::G_SEXTLOAD: 1733 case TargetOpcode::G_ZEXTLOAD: 1734 return legalizeLoad(Helper, MI); 1735 case TargetOpcode::G_FMAD: 1736 return legalizeFMad(MI, MRI, B); 1737 case TargetOpcode::G_FDIV: 1738 return legalizeFDIV(MI, MRI, B); 1739 case TargetOpcode::G_UDIV: 1740 case TargetOpcode::G_UREM: 1741 case TargetOpcode::G_UDIVREM: 1742 return legalizeUnsignedDIV_REM(MI, MRI, B); 1743 case TargetOpcode::G_SDIV: 1744 case TargetOpcode::G_SREM: 1745 case TargetOpcode::G_SDIVREM: 1746 return legalizeSignedDIV_REM(MI, MRI, B); 1747 case TargetOpcode::G_ATOMIC_CMPXCHG: 1748 return legalizeAtomicCmpXChg(MI, MRI, B); 1749 case TargetOpcode::G_FLOG: 1750 return legalizeFlog(MI, B, numbers::ln2f); 1751 case TargetOpcode::G_FLOG10: 1752 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1753 case TargetOpcode::G_FEXP: 1754 return legalizeFExp(MI, B); 1755 case TargetOpcode::G_FPOW: 1756 return legalizeFPow(MI, B); 1757 case TargetOpcode::G_FFLOOR: 1758 return legalizeFFloor(MI, MRI, B); 1759 case TargetOpcode::G_BUILD_VECTOR: 1760 return legalizeBuildVector(MI, MRI, B); 1761 default: 1762 return false; 1763 } 1764 1765 llvm_unreachable("expected switch to return"); 1766 } 1767 1768 Register AMDGPULegalizerInfo::getSegmentAperture( 1769 unsigned AS, 1770 MachineRegisterInfo &MRI, 1771 MachineIRBuilder &B) const { 1772 MachineFunction &MF = B.getMF(); 1773 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1774 const LLT S32 = LLT::scalar(32); 1775 1776 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1777 1778 if (ST.hasApertureRegs()) { 1779 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1780 // getreg. 1781 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1782 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1783 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1784 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1785 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1786 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1787 unsigned Encoding = 1788 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1789 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1790 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1791 1792 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1793 1794 B.buildInstr(AMDGPU::S_GETREG_B32) 1795 .addDef(GetReg) 1796 .addImm(Encoding); 1797 MRI.setType(GetReg, S32); 1798 1799 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1800 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1801 } 1802 1803 Register QueuePtr = MRI.createGenericVirtualRegister( 1804 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1805 1806 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1807 return Register(); 1808 1809 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1810 // private_segment_aperture_base_hi. 1811 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1812 1813 // TODO: can we be smarter about machine pointer info? 1814 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1815 MachineMemOperand *MMO = MF.getMachineMemOperand( 1816 PtrInfo, 1817 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1818 MachineMemOperand::MOInvariant, 1819 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1820 1821 Register LoadAddr; 1822 1823 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1824 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1825 } 1826 1827 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1828 MachineInstr &MI, MachineRegisterInfo &MRI, 1829 MachineIRBuilder &B) const { 1830 MachineFunction &MF = B.getMF(); 1831 1832 const LLT S32 = LLT::scalar(32); 1833 Register Dst = MI.getOperand(0).getReg(); 1834 Register Src = MI.getOperand(1).getReg(); 1835 1836 LLT DstTy = MRI.getType(Dst); 1837 LLT SrcTy = MRI.getType(Src); 1838 unsigned DestAS = DstTy.getAddressSpace(); 1839 unsigned SrcAS = SrcTy.getAddressSpace(); 1840 1841 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1842 // vector element. 1843 assert(!DstTy.isVector()); 1844 1845 const AMDGPUTargetMachine &TM 1846 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1847 1848 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1849 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1850 return true; 1851 } 1852 1853 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1854 // Truncate. 1855 B.buildExtract(Dst, Src, 0); 1856 MI.eraseFromParent(); 1857 return true; 1858 } 1859 1860 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1861 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1862 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1863 1864 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1865 // another. Merge operands are required to be the same type, but creating an 1866 // extra ptrtoint would be kind of pointless. 1867 auto HighAddr = B.buildConstant( 1868 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1869 B.buildMerge(Dst, {Src, HighAddr}); 1870 MI.eraseFromParent(); 1871 return true; 1872 } 1873 1874 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1875 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1876 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1877 unsigned NullVal = TM.getNullPointerValue(DestAS); 1878 1879 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1880 auto FlatNull = B.buildConstant(SrcTy, 0); 1881 1882 // Extract low 32-bits of the pointer. 1883 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1884 1885 auto CmpRes = 1886 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1887 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1888 1889 MI.eraseFromParent(); 1890 return true; 1891 } 1892 1893 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1894 return false; 1895 1896 if (!ST.hasFlatAddressSpace()) 1897 return false; 1898 1899 auto SegmentNull = 1900 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1901 auto FlatNull = 1902 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1903 1904 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1905 if (!ApertureReg.isValid()) 1906 return false; 1907 1908 auto CmpRes = 1909 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1910 1911 // Coerce the type of the low half of the result so we can use merge_values. 1912 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1913 1914 // TODO: Should we allow mismatched types but matching sizes in merges to 1915 // avoid the ptrtoint? 1916 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1917 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1918 1919 MI.eraseFromParent(); 1920 return true; 1921 } 1922 1923 bool AMDGPULegalizerInfo::legalizeFrint( 1924 MachineInstr &MI, MachineRegisterInfo &MRI, 1925 MachineIRBuilder &B) const { 1926 Register Src = MI.getOperand(1).getReg(); 1927 LLT Ty = MRI.getType(Src); 1928 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1929 1930 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1931 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1932 1933 auto C1 = B.buildFConstant(Ty, C1Val); 1934 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1935 1936 // TODO: Should this propagate fast-math-flags? 1937 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1938 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1939 1940 auto C2 = B.buildFConstant(Ty, C2Val); 1941 auto Fabs = B.buildFAbs(Ty, Src); 1942 1943 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1944 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1945 MI.eraseFromParent(); 1946 return true; 1947 } 1948 1949 bool AMDGPULegalizerInfo::legalizeFceil( 1950 MachineInstr &MI, MachineRegisterInfo &MRI, 1951 MachineIRBuilder &B) const { 1952 1953 const LLT S1 = LLT::scalar(1); 1954 const LLT S64 = LLT::scalar(64); 1955 1956 Register Src = MI.getOperand(1).getReg(); 1957 assert(MRI.getType(Src) == S64); 1958 1959 // result = trunc(src) 1960 // if (src > 0.0 && src != result) 1961 // result += 1.0 1962 1963 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1964 1965 const auto Zero = B.buildFConstant(S64, 0.0); 1966 const auto One = B.buildFConstant(S64, 1.0); 1967 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1968 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1969 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1970 auto Add = B.buildSelect(S64, And, One, Zero); 1971 1972 // TODO: Should this propagate fast-math-flags? 1973 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1974 return true; 1975 } 1976 1977 bool AMDGPULegalizerInfo::legalizeFrem( 1978 MachineInstr &MI, MachineRegisterInfo &MRI, 1979 MachineIRBuilder &B) const { 1980 Register DstReg = MI.getOperand(0).getReg(); 1981 Register Src0Reg = MI.getOperand(1).getReg(); 1982 Register Src1Reg = MI.getOperand(2).getReg(); 1983 auto Flags = MI.getFlags(); 1984 LLT Ty = MRI.getType(DstReg); 1985 1986 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1987 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1988 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1989 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1990 MI.eraseFromParent(); 1991 return true; 1992 } 1993 1994 static MachineInstrBuilder extractF64Exponent(Register Hi, 1995 MachineIRBuilder &B) { 1996 const unsigned FractBits = 52; 1997 const unsigned ExpBits = 11; 1998 LLT S32 = LLT::scalar(32); 1999 2000 auto Const0 = B.buildConstant(S32, FractBits - 32); 2001 auto Const1 = B.buildConstant(S32, ExpBits); 2002 2003 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2004 .addUse(Hi) 2005 .addUse(Const0.getReg(0)) 2006 .addUse(Const1.getReg(0)); 2007 2008 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2009 } 2010 2011 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2012 MachineInstr &MI, MachineRegisterInfo &MRI, 2013 MachineIRBuilder &B) const { 2014 const LLT S1 = LLT::scalar(1); 2015 const LLT S32 = LLT::scalar(32); 2016 const LLT S64 = LLT::scalar(64); 2017 2018 Register Src = MI.getOperand(1).getReg(); 2019 assert(MRI.getType(Src) == S64); 2020 2021 // TODO: Should this use extract since the low half is unused? 2022 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2023 Register Hi = Unmerge.getReg(1); 2024 2025 // Extract the upper half, since this is where we will find the sign and 2026 // exponent. 2027 auto Exp = extractF64Exponent(Hi, B); 2028 2029 const unsigned FractBits = 52; 2030 2031 // Extract the sign bit. 2032 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2033 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2034 2035 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2036 2037 const auto Zero32 = B.buildConstant(S32, 0); 2038 2039 // Extend back to 64-bits. 2040 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2041 2042 auto Shr = B.buildAShr(S64, FractMask, Exp); 2043 auto Not = B.buildNot(S64, Shr); 2044 auto Tmp0 = B.buildAnd(S64, Src, Not); 2045 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2046 2047 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2048 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2049 2050 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2051 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2052 MI.eraseFromParent(); 2053 return true; 2054 } 2055 2056 bool AMDGPULegalizerInfo::legalizeITOFP( 2057 MachineInstr &MI, MachineRegisterInfo &MRI, 2058 MachineIRBuilder &B, bool Signed) const { 2059 2060 Register Dst = MI.getOperand(0).getReg(); 2061 Register Src = MI.getOperand(1).getReg(); 2062 2063 const LLT S64 = LLT::scalar(64); 2064 const LLT S32 = LLT::scalar(32); 2065 const LLT S1 = LLT::scalar(1); 2066 2067 assert(MRI.getType(Src) == S64); 2068 2069 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2070 auto ThirtyTwo = B.buildConstant(S32, 32); 2071 2072 if (MRI.getType(Dst) == S64) { 2073 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2074 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2075 2076 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2077 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2078 .addUse(CvtHi.getReg(0)) 2079 .addUse(ThirtyTwo.getReg(0)); 2080 2081 // TODO: Should this propagate fast-math-flags? 2082 B.buildFAdd(Dst, LdExp, CvtLo); 2083 MI.eraseFromParent(); 2084 return true; 2085 } 2086 2087 assert(MRI.getType(Dst) == S32); 2088 2089 auto Zero = B.buildConstant(S32, 0); 2090 auto One = B.buildConstant(S32, 1); 2091 auto AllOnes = B.buildConstant(S32, -1); 2092 2093 MachineInstrBuilder ShAmt; 2094 if (Signed) { 2095 auto ThirtyThree = B.buildConstant(S32, 33); 2096 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2097 auto HasSameSign = B.buildICmp(CmpInst::ICMP_SGE, S1, X, Zero); 2098 auto MaxShAmt = B.buildSelect(S32, HasSameSign, ThirtyThree, ThirtyTwo); 2099 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2100 /*HasSideEffects=*/false) 2101 .addUse(Unmerge.getReg(1)); 2102 auto NotAllSameBits = B.buildICmp(CmpInst::ICMP_NE, S1, LS, AllOnes); 2103 auto LS2 = B.buildSelect(S32, NotAllSameBits, LS, MaxShAmt); 2104 ShAmt = B.buildSub(S32, LS2, One); 2105 } else 2106 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2107 auto Norm = B.buildShl(S64, Src, ShAmt); 2108 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2109 auto NotAllZeros = 2110 B.buildICmp(CmpInst::ICMP_NE, S1, Unmerge2.getReg(0), Zero); 2111 auto Adjust = B.buildSelect(S32, NotAllZeros, One, Zero); 2112 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2113 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2114 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2115 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2116 /*HasSideEffects=*/false) 2117 .addUse(FVal.getReg(0)) 2118 .addUse(Scale.getReg(0)); 2119 MI.eraseFromParent(); 2120 return true; 2121 } 2122 2123 // TODO: Copied from DAG implementation. Verify logic and document how this 2124 // actually works. 2125 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2126 MachineRegisterInfo &MRI, 2127 MachineIRBuilder &B, 2128 bool Signed) const { 2129 2130 Register Dst = MI.getOperand(0).getReg(); 2131 Register Src = MI.getOperand(1).getReg(); 2132 2133 const LLT S64 = LLT::scalar(64); 2134 const LLT S32 = LLT::scalar(32); 2135 2136 const LLT SrcLT = MRI.getType(Src); 2137 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2138 2139 unsigned Flags = MI.getFlags(); 2140 2141 // The basic idea of converting a floating point number into a pair of 32-bit 2142 // integers is illustrated as follows: 2143 // 2144 // tf := trunc(val); 2145 // hif := floor(tf * 2^-32); 2146 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2147 // hi := fptoi(hif); 2148 // lo := fptoi(lof); 2149 // 2150 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2151 MachineInstrBuilder Sign; 2152 if (Signed && SrcLT == S32) { 2153 // However, a 32-bit floating point number has only 23 bits mantissa and 2154 // it's not enough to hold all the significant bits of `lof` if val is 2155 // negative. To avoid the loss of precision, We need to take the absolute 2156 // value after truncating and flip the result back based on the original 2157 // signedness. 2158 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2159 Trunc = B.buildFAbs(S32, Trunc, Flags); 2160 } 2161 MachineInstrBuilder K0, K1; 2162 if (SrcLT == S64) { 2163 K0 = B.buildFConstant(S64, 2164 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2165 K1 = B.buildFConstant(S64, 2166 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2167 } else { 2168 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2169 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2170 } 2171 2172 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2173 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2174 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2175 2176 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2177 : B.buildFPTOUI(S32, FloorMul); 2178 auto Lo = B.buildFPTOUI(S32, Fma); 2179 2180 if (Signed && SrcLT == S32) { 2181 // Flip the result based on the signedness, which is either all 0s or 1s. 2182 Sign = B.buildMerge(S64, {Sign, Sign}); 2183 // r := xor({lo, hi}, sign) - sign; 2184 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2185 } else 2186 B.buildMerge(Dst, {Lo, Hi}); 2187 MI.eraseFromParent(); 2188 2189 return true; 2190 } 2191 2192 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2193 MachineInstr &MI) const { 2194 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2195 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2196 2197 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2198 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2199 2200 // With ieee_mode disabled, the instructions have the correct behavior 2201 // already for G_FMINNUM/G_FMAXNUM 2202 if (!MFI->getMode().IEEE) 2203 return !IsIEEEOp; 2204 2205 if (IsIEEEOp) 2206 return true; 2207 2208 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2209 } 2210 2211 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2212 MachineInstr &MI, MachineRegisterInfo &MRI, 2213 MachineIRBuilder &B) const { 2214 // TODO: Should move some of this into LegalizerHelper. 2215 2216 // TODO: Promote dynamic indexing of s16 to s32 2217 2218 // FIXME: Artifact combiner probably should have replaced the truncated 2219 // constant before this, so we shouldn't need 2220 // getConstantVRegValWithLookThrough. 2221 Optional<ValueAndVReg> MaybeIdxVal = 2222 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2223 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2224 return true; 2225 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2226 2227 Register Dst = MI.getOperand(0).getReg(); 2228 Register Vec = MI.getOperand(1).getReg(); 2229 2230 LLT VecTy = MRI.getType(Vec); 2231 LLT EltTy = VecTy.getElementType(); 2232 assert(EltTy == MRI.getType(Dst)); 2233 2234 if (IdxVal < VecTy.getNumElements()) 2235 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2236 else 2237 B.buildUndef(Dst); 2238 2239 MI.eraseFromParent(); 2240 return true; 2241 } 2242 2243 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2244 MachineInstr &MI, MachineRegisterInfo &MRI, 2245 MachineIRBuilder &B) const { 2246 // TODO: Should move some of this into LegalizerHelper. 2247 2248 // TODO: Promote dynamic indexing of s16 to s32 2249 2250 // FIXME: Artifact combiner probably should have replaced the truncated 2251 // constant before this, so we shouldn't need 2252 // getConstantVRegValWithLookThrough. 2253 Optional<ValueAndVReg> MaybeIdxVal = 2254 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2255 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2256 return true; 2257 2258 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2259 Register Dst = MI.getOperand(0).getReg(); 2260 Register Vec = MI.getOperand(1).getReg(); 2261 Register Ins = MI.getOperand(2).getReg(); 2262 2263 LLT VecTy = MRI.getType(Vec); 2264 LLT EltTy = VecTy.getElementType(); 2265 assert(EltTy == MRI.getType(Ins)); 2266 2267 if (IdxVal < VecTy.getNumElements()) 2268 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2269 else 2270 B.buildUndef(Dst); 2271 2272 MI.eraseFromParent(); 2273 return true; 2274 } 2275 2276 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2277 MachineInstr &MI, MachineRegisterInfo &MRI, 2278 MachineIRBuilder &B) const { 2279 const LLT V2S16 = LLT::fixed_vector(2, 16); 2280 2281 Register Dst = MI.getOperand(0).getReg(); 2282 Register Src0 = MI.getOperand(1).getReg(); 2283 LLT DstTy = MRI.getType(Dst); 2284 LLT SrcTy = MRI.getType(Src0); 2285 2286 if (SrcTy == V2S16 && DstTy == V2S16 && 2287 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2288 return true; 2289 2290 MachineIRBuilder HelperBuilder(MI); 2291 GISelObserverWrapper DummyObserver; 2292 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2293 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2294 } 2295 2296 bool AMDGPULegalizerInfo::legalizeSinCos( 2297 MachineInstr &MI, MachineRegisterInfo &MRI, 2298 MachineIRBuilder &B) const { 2299 2300 Register DstReg = MI.getOperand(0).getReg(); 2301 Register SrcReg = MI.getOperand(1).getReg(); 2302 LLT Ty = MRI.getType(DstReg); 2303 unsigned Flags = MI.getFlags(); 2304 2305 Register TrigVal; 2306 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2307 if (ST.hasTrigReducedRange()) { 2308 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2309 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2310 .addUse(MulVal.getReg(0)) 2311 .setMIFlags(Flags).getReg(0); 2312 } else 2313 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2314 2315 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2316 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2317 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2318 .addUse(TrigVal) 2319 .setMIFlags(Flags); 2320 MI.eraseFromParent(); 2321 return true; 2322 } 2323 2324 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2325 MachineIRBuilder &B, 2326 const GlobalValue *GV, 2327 int64_t Offset, 2328 unsigned GAFlags) const { 2329 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2330 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2331 // to the following code sequence: 2332 // 2333 // For constant address space: 2334 // s_getpc_b64 s[0:1] 2335 // s_add_u32 s0, s0, $symbol 2336 // s_addc_u32 s1, s1, 0 2337 // 2338 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2339 // a fixup or relocation is emitted to replace $symbol with a literal 2340 // constant, which is a pc-relative offset from the encoding of the $symbol 2341 // operand to the global variable. 2342 // 2343 // For global address space: 2344 // s_getpc_b64 s[0:1] 2345 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2346 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2347 // 2348 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2349 // fixups or relocations are emitted to replace $symbol@*@lo and 2350 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2351 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2352 // operand to the global variable. 2353 // 2354 // What we want here is an offset from the value returned by s_getpc 2355 // (which is the address of the s_add_u32 instruction) to the global 2356 // variable, but since the encoding of $symbol starts 4 bytes after the start 2357 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2358 // small. This requires us to add 4 to the global variable offset in order to 2359 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2360 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2361 // instruction. 2362 2363 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2364 2365 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2366 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2367 2368 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2369 .addDef(PCReg); 2370 2371 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2372 if (GAFlags == SIInstrInfo::MO_NONE) 2373 MIB.addImm(0); 2374 else 2375 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2376 2377 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2378 2379 if (PtrTy.getSizeInBits() == 32) 2380 B.buildExtract(DstReg, PCReg, 0); 2381 return true; 2382 } 2383 2384 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2385 MachineInstr &MI, MachineRegisterInfo &MRI, 2386 MachineIRBuilder &B) const { 2387 Register DstReg = MI.getOperand(0).getReg(); 2388 LLT Ty = MRI.getType(DstReg); 2389 unsigned AS = Ty.getAddressSpace(); 2390 2391 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2392 MachineFunction &MF = B.getMF(); 2393 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2394 2395 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2396 if (!MFI->isModuleEntryFunction() && 2397 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2398 const Function &Fn = MF.getFunction(); 2399 DiagnosticInfoUnsupported BadLDSDecl( 2400 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2401 DS_Warning); 2402 Fn.getContext().diagnose(BadLDSDecl); 2403 2404 // We currently don't have a way to correctly allocate LDS objects that 2405 // aren't directly associated with a kernel. We do force inlining of 2406 // functions that use local objects. However, if these dead functions are 2407 // not eliminated, we don't want a compile time error. Just emit a warning 2408 // and a trap, since there should be no callable path here. 2409 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2410 B.buildUndef(DstReg); 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 // TODO: We could emit code to handle the initialization somewhere. 2416 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2417 const SITargetLowering *TLI = ST.getTargetLowering(); 2418 if (!TLI->shouldUseLDSConstAddress(GV)) { 2419 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2420 return true; // Leave in place; 2421 } 2422 2423 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2424 Type *Ty = GV->getValueType(); 2425 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2426 // zero-sized type in other languages to declare the dynamic shared 2427 // memory which size is not known at the compile time. They will be 2428 // allocated by the runtime and placed directly after the static 2429 // allocated ones. They all share the same offset. 2430 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2431 // Adjust alignment for that dynamic shared memory array. 2432 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2433 LLT S32 = LLT::scalar(32); 2434 auto Sz = 2435 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2436 B.buildIntToPtr(DstReg, Sz); 2437 MI.eraseFromParent(); 2438 return true; 2439 } 2440 } 2441 2442 B.buildConstant( 2443 DstReg, 2444 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2445 MI.eraseFromParent(); 2446 return true; 2447 } 2448 2449 const Function &Fn = MF.getFunction(); 2450 DiagnosticInfoUnsupported BadInit( 2451 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2452 Fn.getContext().diagnose(BadInit); 2453 return true; 2454 } 2455 2456 const SITargetLowering *TLI = ST.getTargetLowering(); 2457 2458 if (TLI->shouldEmitFixup(GV)) { 2459 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2460 MI.eraseFromParent(); 2461 return true; 2462 } 2463 2464 if (TLI->shouldEmitPCReloc(GV)) { 2465 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2466 MI.eraseFromParent(); 2467 return true; 2468 } 2469 2470 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2471 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2472 2473 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2474 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2475 MachinePointerInfo::getGOT(MF), 2476 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2477 MachineMemOperand::MOInvariant, 2478 LoadTy, Align(8)); 2479 2480 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2481 2482 if (Ty.getSizeInBits() == 32) { 2483 // Truncate if this is a 32-bit constant adrdess. 2484 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2485 B.buildExtract(DstReg, Load, 0); 2486 } else 2487 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2488 2489 MI.eraseFromParent(); 2490 return true; 2491 } 2492 2493 static LLT widenToNextPowerOf2(LLT Ty) { 2494 if (Ty.isVector()) 2495 return Ty.changeElementCount( 2496 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2497 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2498 } 2499 2500 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2501 MachineInstr &MI) const { 2502 MachineIRBuilder &B = Helper.MIRBuilder; 2503 MachineRegisterInfo &MRI = *B.getMRI(); 2504 GISelChangeObserver &Observer = Helper.Observer; 2505 2506 Register PtrReg = MI.getOperand(1).getReg(); 2507 LLT PtrTy = MRI.getType(PtrReg); 2508 unsigned AddrSpace = PtrTy.getAddressSpace(); 2509 2510 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2511 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2512 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2513 Observer.changingInstr(MI); 2514 MI.getOperand(1).setReg(Cast.getReg(0)); 2515 Observer.changedInstr(MI); 2516 return true; 2517 } 2518 2519 if (MI.getOpcode() != AMDGPU::G_LOAD) 2520 return false; 2521 2522 Register ValReg = MI.getOperand(0).getReg(); 2523 LLT ValTy = MRI.getType(ValReg); 2524 2525 MachineMemOperand *MMO = *MI.memoperands_begin(); 2526 const unsigned ValSize = ValTy.getSizeInBits(); 2527 const LLT MemTy = MMO->getMemoryType(); 2528 const Align MemAlign = MMO->getAlign(); 2529 const unsigned MemSize = MemTy.getSizeInBits(); 2530 const unsigned AlignInBits = 8 * MemAlign.value(); 2531 2532 // Widen non-power-of-2 loads to the alignment if needed 2533 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2534 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2535 2536 // This was already the correct extending load result type, so just adjust 2537 // the memory type. 2538 if (WideMemSize == ValSize) { 2539 MachineFunction &MF = B.getMF(); 2540 2541 MachineMemOperand *WideMMO = 2542 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2543 Observer.changingInstr(MI); 2544 MI.setMemRefs(MF, {WideMMO}); 2545 Observer.changedInstr(MI); 2546 return true; 2547 } 2548 2549 // Don't bother handling edge case that should probably never be produced. 2550 if (ValSize > WideMemSize) 2551 return false; 2552 2553 LLT WideTy = widenToNextPowerOf2(ValTy); 2554 2555 Register WideLoad; 2556 if (!WideTy.isVector()) { 2557 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2558 B.buildTrunc(ValReg, WideLoad).getReg(0); 2559 } else { 2560 // Extract the subvector. 2561 2562 if (isRegisterType(ValTy)) { 2563 // If this a case where G_EXTRACT is legal, use it. 2564 // (e.g. <3 x s32> -> <4 x s32>) 2565 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2566 B.buildExtract(ValReg, WideLoad, 0); 2567 } else { 2568 // For cases where the widened type isn't a nice register value, unmerge 2569 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2570 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2571 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2572 B.setInsertPt(B.getMBB(), MI.getIterator()); 2573 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2574 } 2575 } 2576 2577 MI.eraseFromParent(); 2578 return true; 2579 } 2580 2581 return false; 2582 } 2583 2584 bool AMDGPULegalizerInfo::legalizeFMad( 2585 MachineInstr &MI, MachineRegisterInfo &MRI, 2586 MachineIRBuilder &B) const { 2587 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2588 assert(Ty.isScalar()); 2589 2590 MachineFunction &MF = B.getMF(); 2591 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2592 2593 // TODO: Always legal with future ftz flag. 2594 // FIXME: Do we need just output? 2595 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2596 return true; 2597 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2598 return true; 2599 2600 MachineIRBuilder HelperBuilder(MI); 2601 GISelObserverWrapper DummyObserver; 2602 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2603 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2604 } 2605 2606 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2607 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2608 Register DstReg = MI.getOperand(0).getReg(); 2609 Register PtrReg = MI.getOperand(1).getReg(); 2610 Register CmpVal = MI.getOperand(2).getReg(); 2611 Register NewVal = MI.getOperand(3).getReg(); 2612 2613 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2614 "this should not have been custom lowered"); 2615 2616 LLT ValTy = MRI.getType(CmpVal); 2617 LLT VecTy = LLT::fixed_vector(2, ValTy); 2618 2619 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2620 2621 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2622 .addDef(DstReg) 2623 .addUse(PtrReg) 2624 .addUse(PackedVal) 2625 .setMemRefs(MI.memoperands()); 2626 2627 MI.eraseFromParent(); 2628 return true; 2629 } 2630 2631 bool AMDGPULegalizerInfo::legalizeFlog( 2632 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2633 Register Dst = MI.getOperand(0).getReg(); 2634 Register Src = MI.getOperand(1).getReg(); 2635 LLT Ty = B.getMRI()->getType(Dst); 2636 unsigned Flags = MI.getFlags(); 2637 2638 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2639 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2640 2641 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2642 MI.eraseFromParent(); 2643 return true; 2644 } 2645 2646 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2647 MachineIRBuilder &B) const { 2648 Register Dst = MI.getOperand(0).getReg(); 2649 Register Src = MI.getOperand(1).getReg(); 2650 unsigned Flags = MI.getFlags(); 2651 LLT Ty = B.getMRI()->getType(Dst); 2652 2653 auto K = B.buildFConstant(Ty, numbers::log2e); 2654 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2655 B.buildFExp2(Dst, Mul, Flags); 2656 MI.eraseFromParent(); 2657 return true; 2658 } 2659 2660 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2661 MachineIRBuilder &B) const { 2662 Register Dst = MI.getOperand(0).getReg(); 2663 Register Src0 = MI.getOperand(1).getReg(); 2664 Register Src1 = MI.getOperand(2).getReg(); 2665 unsigned Flags = MI.getFlags(); 2666 LLT Ty = B.getMRI()->getType(Dst); 2667 const LLT S16 = LLT::scalar(16); 2668 const LLT S32 = LLT::scalar(32); 2669 2670 if (Ty == S32) { 2671 auto Log = B.buildFLog2(S32, Src0, Flags); 2672 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2673 .addUse(Log.getReg(0)) 2674 .addUse(Src1) 2675 .setMIFlags(Flags); 2676 B.buildFExp2(Dst, Mul, Flags); 2677 } else if (Ty == S16) { 2678 // There's no f16 fmul_legacy, so we need to convert for it. 2679 auto Log = B.buildFLog2(S16, Src0, Flags); 2680 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2681 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2682 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2683 .addUse(Ext0.getReg(0)) 2684 .addUse(Ext1.getReg(0)) 2685 .setMIFlags(Flags); 2686 2687 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2688 } else 2689 return false; 2690 2691 MI.eraseFromParent(); 2692 return true; 2693 } 2694 2695 // Find a source register, ignoring any possible source modifiers. 2696 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2697 Register ModSrc = OrigSrc; 2698 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2699 ModSrc = SrcFNeg->getOperand(1).getReg(); 2700 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2701 ModSrc = SrcFAbs->getOperand(1).getReg(); 2702 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2703 ModSrc = SrcFAbs->getOperand(1).getReg(); 2704 return ModSrc; 2705 } 2706 2707 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2708 MachineRegisterInfo &MRI, 2709 MachineIRBuilder &B) const { 2710 2711 const LLT S1 = LLT::scalar(1); 2712 const LLT S64 = LLT::scalar(64); 2713 Register Dst = MI.getOperand(0).getReg(); 2714 Register OrigSrc = MI.getOperand(1).getReg(); 2715 unsigned Flags = MI.getFlags(); 2716 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2717 "this should not have been custom lowered"); 2718 2719 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2720 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2721 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2722 // V_FRACT bug is: 2723 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2724 // 2725 // Convert floor(x) to (x - fract(x)) 2726 2727 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2728 .addUse(OrigSrc) 2729 .setMIFlags(Flags); 2730 2731 // Give source modifier matching some assistance before obscuring a foldable 2732 // pattern. 2733 2734 // TODO: We can avoid the neg on the fract? The input sign to fract 2735 // shouldn't matter? 2736 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2737 2738 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2739 2740 Register Min = MRI.createGenericVirtualRegister(S64); 2741 2742 // We don't need to concern ourselves with the snan handling difference, so 2743 // use the one which will directly select. 2744 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2745 if (MFI->getMode().IEEE) 2746 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2747 else 2748 B.buildFMinNum(Min, Fract, Const, Flags); 2749 2750 Register CorrectedFract = Min; 2751 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2752 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2753 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2754 } 2755 2756 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2757 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2758 2759 MI.eraseFromParent(); 2760 return true; 2761 } 2762 2763 // Turn an illegal packed v2s16 build vector into bit operations. 2764 // TODO: This should probably be a bitcast action in LegalizerHelper. 2765 bool AMDGPULegalizerInfo::legalizeBuildVector( 2766 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2767 Register Dst = MI.getOperand(0).getReg(); 2768 const LLT S32 = LLT::scalar(32); 2769 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2770 2771 Register Src0 = MI.getOperand(1).getReg(); 2772 Register Src1 = MI.getOperand(2).getReg(); 2773 assert(MRI.getType(Src0) == LLT::scalar(16)); 2774 2775 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2776 B.buildBitcast(Dst, Merge); 2777 2778 MI.eraseFromParent(); 2779 return true; 2780 } 2781 2782 // Check that this is a G_XOR x, -1 2783 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2784 if (MI.getOpcode() != TargetOpcode::G_XOR) 2785 return false; 2786 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2787 return ConstVal && *ConstVal == -1; 2788 } 2789 2790 // Return the use branch instruction, otherwise null if the usage is invalid. 2791 static MachineInstr * 2792 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2793 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2794 Register CondDef = MI.getOperand(0).getReg(); 2795 if (!MRI.hasOneNonDBGUse(CondDef)) 2796 return nullptr; 2797 2798 MachineBasicBlock *Parent = MI.getParent(); 2799 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2800 2801 if (isNot(MRI, *UseMI)) { 2802 Register NegatedCond = UseMI->getOperand(0).getReg(); 2803 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2804 return nullptr; 2805 2806 // We're deleting the def of this value, so we need to remove it. 2807 UseMI->eraseFromParent(); 2808 2809 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2810 Negated = true; 2811 } 2812 2813 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2814 return nullptr; 2815 2816 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2817 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2818 if (Next == Parent->end()) { 2819 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2820 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2821 return nullptr; 2822 UncondBrTarget = &*NextMBB; 2823 } else { 2824 if (Next->getOpcode() != AMDGPU::G_BR) 2825 return nullptr; 2826 Br = &*Next; 2827 UncondBrTarget = Br->getOperand(0).getMBB(); 2828 } 2829 2830 return UseMI; 2831 } 2832 2833 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2834 const ArgDescriptor *Arg, 2835 const TargetRegisterClass *ArgRC, 2836 LLT ArgTy) const { 2837 MCRegister SrcReg = Arg->getRegister(); 2838 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2839 assert(DstReg.isVirtual() && "Virtual register expected"); 2840 2841 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2842 ArgTy); 2843 if (Arg->isMasked()) { 2844 // TODO: Should we try to emit this once in the entry block? 2845 const LLT S32 = LLT::scalar(32); 2846 const unsigned Mask = Arg->getMask(); 2847 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2848 2849 Register AndMaskSrc = LiveIn; 2850 2851 if (Shift != 0) { 2852 auto ShiftAmt = B.buildConstant(S32, Shift); 2853 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2854 } 2855 2856 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2857 } else { 2858 B.buildCopy(DstReg, LiveIn); 2859 } 2860 2861 return true; 2862 } 2863 2864 bool AMDGPULegalizerInfo::loadInputValue( 2865 Register DstReg, MachineIRBuilder &B, 2866 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2867 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2868 const ArgDescriptor *Arg; 2869 const TargetRegisterClass *ArgRC; 2870 LLT ArgTy; 2871 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2872 2873 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2874 return false; // TODO: Handle these 2875 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2876 } 2877 2878 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2879 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2880 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2881 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2882 return false; 2883 2884 MI.eraseFromParent(); 2885 return true; 2886 } 2887 2888 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2889 MachineRegisterInfo &MRI, 2890 MachineIRBuilder &B) const { 2891 Register Dst = MI.getOperand(0).getReg(); 2892 LLT DstTy = MRI.getType(Dst); 2893 LLT S16 = LLT::scalar(16); 2894 LLT S32 = LLT::scalar(32); 2895 LLT S64 = LLT::scalar(64); 2896 2897 if (DstTy == S16) 2898 return legalizeFDIV16(MI, MRI, B); 2899 if (DstTy == S32) 2900 return legalizeFDIV32(MI, MRI, B); 2901 if (DstTy == S64) 2902 return legalizeFDIV64(MI, MRI, B); 2903 2904 return false; 2905 } 2906 2907 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2908 Register DstDivReg, 2909 Register DstRemReg, 2910 Register X, 2911 Register Y) const { 2912 const LLT S1 = LLT::scalar(1); 2913 const LLT S32 = LLT::scalar(32); 2914 2915 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2916 // algorithm used here. 2917 2918 // Initial estimate of inv(y). 2919 auto FloatY = B.buildUITOFP(S32, Y); 2920 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2921 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2922 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2923 auto Z = B.buildFPTOUI(S32, ScaledY); 2924 2925 // One round of UNR. 2926 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2927 auto NegYZ = B.buildMul(S32, NegY, Z); 2928 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2929 2930 // Quotient/remainder estimate. 2931 auto Q = B.buildUMulH(S32, X, Z); 2932 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2933 2934 // First quotient/remainder refinement. 2935 auto One = B.buildConstant(S32, 1); 2936 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2937 if (DstDivReg) 2938 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2939 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2940 2941 // Second quotient/remainder refinement. 2942 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2943 if (DstDivReg) 2944 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2945 2946 if (DstRemReg) 2947 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2948 } 2949 2950 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2951 // 2952 // Return lo, hi of result 2953 // 2954 // %cvt.lo = G_UITOFP Val.lo 2955 // %cvt.hi = G_UITOFP Val.hi 2956 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2957 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2958 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2959 // %mul2 = G_FMUL %mul1, 2**(-32) 2960 // %trunc = G_INTRINSIC_TRUNC %mul2 2961 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2962 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2963 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2964 Register Val) { 2965 const LLT S32 = LLT::scalar(32); 2966 auto Unmerge = B.buildUnmerge(S32, Val); 2967 2968 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2969 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2970 2971 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2972 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2973 2974 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2975 auto Mul1 = 2976 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2977 2978 // 2**(-32) 2979 auto Mul2 = 2980 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2981 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2982 2983 // -(2**32) 2984 auto Mad2 = B.buildFMAD(S32, Trunc, 2985 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2986 2987 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2988 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2989 2990 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2991 } 2992 2993 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2994 Register DstDivReg, 2995 Register DstRemReg, 2996 Register Numer, 2997 Register Denom) const { 2998 const LLT S32 = LLT::scalar(32); 2999 const LLT S64 = LLT::scalar(64); 3000 const LLT S1 = LLT::scalar(1); 3001 Register RcpLo, RcpHi; 3002 3003 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3004 3005 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3006 3007 auto Zero64 = B.buildConstant(S64, 0); 3008 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3009 3010 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3011 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3012 3013 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3014 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3015 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3016 3017 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3018 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3019 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 3020 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3021 3022 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3023 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3024 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3025 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3026 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3027 3028 auto Zero32 = B.buildConstant(S32, 0); 3029 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3030 auto Add2_HiC = 3031 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 3032 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 3033 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3034 3035 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3036 Register NumerLo = UnmergeNumer.getReg(0); 3037 Register NumerHi = UnmergeNumer.getReg(1); 3038 3039 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3040 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3041 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3042 Register Mul3_Lo = UnmergeMul3.getReg(0); 3043 Register Mul3_Hi = UnmergeMul3.getReg(1); 3044 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3045 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3046 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3047 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3048 3049 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3050 Register DenomLo = UnmergeDenom.getReg(0); 3051 Register DenomHi = UnmergeDenom.getReg(1); 3052 3053 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3054 auto C1 = B.buildSExt(S32, CmpHi); 3055 3056 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3057 auto C2 = B.buildSExt(S32, CmpLo); 3058 3059 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3060 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3061 3062 // TODO: Here and below portions of the code can be enclosed into if/endif. 3063 // Currently control flow is unconditional and we have 4 selects after 3064 // potential endif to substitute PHIs. 3065 3066 // if C3 != 0 ... 3067 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3068 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3069 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3070 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3071 3072 auto One64 = B.buildConstant(S64, 1); 3073 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3074 3075 auto C4 = 3076 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3077 auto C5 = 3078 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3079 auto C6 = B.buildSelect( 3080 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3081 3082 // if (C6 != 0) 3083 auto Add4 = B.buildAdd(S64, Add3, One64); 3084 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3085 3086 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3087 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3088 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3089 3090 // endif C6 3091 // endif C3 3092 3093 if (DstDivReg) { 3094 auto Sel1 = B.buildSelect( 3095 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3096 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3097 Sel1, MulHi3); 3098 } 3099 3100 if (DstRemReg) { 3101 auto Sel2 = B.buildSelect( 3102 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3103 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3104 Sel2, Sub1); 3105 } 3106 } 3107 3108 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3109 MachineRegisterInfo &MRI, 3110 MachineIRBuilder &B) const { 3111 Register DstDivReg, DstRemReg; 3112 switch (MI.getOpcode()) { 3113 default: 3114 llvm_unreachable("Unexpected opcode!"); 3115 case AMDGPU::G_UDIV: { 3116 DstDivReg = MI.getOperand(0).getReg(); 3117 break; 3118 } 3119 case AMDGPU::G_UREM: { 3120 DstRemReg = MI.getOperand(0).getReg(); 3121 break; 3122 } 3123 case AMDGPU::G_UDIVREM: { 3124 DstDivReg = MI.getOperand(0).getReg(); 3125 DstRemReg = MI.getOperand(1).getReg(); 3126 break; 3127 } 3128 } 3129 3130 const LLT S64 = LLT::scalar(64); 3131 const LLT S32 = LLT::scalar(32); 3132 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3133 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3134 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3135 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3136 3137 if (Ty == S32) 3138 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3139 else if (Ty == S64) 3140 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3141 else 3142 return false; 3143 3144 MI.eraseFromParent(); 3145 return true; 3146 } 3147 3148 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3149 MachineRegisterInfo &MRI, 3150 MachineIRBuilder &B) const { 3151 const LLT S64 = LLT::scalar(64); 3152 const LLT S32 = LLT::scalar(32); 3153 3154 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3155 if (Ty != S32 && Ty != S64) 3156 return false; 3157 3158 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3159 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3160 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3161 3162 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3163 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3164 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3165 3166 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3167 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3168 3169 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3170 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3171 3172 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3173 switch (MI.getOpcode()) { 3174 default: 3175 llvm_unreachable("Unexpected opcode!"); 3176 case AMDGPU::G_SDIV: { 3177 DstDivReg = MI.getOperand(0).getReg(); 3178 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3179 break; 3180 } 3181 case AMDGPU::G_SREM: { 3182 DstRemReg = MI.getOperand(0).getReg(); 3183 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3184 break; 3185 } 3186 case AMDGPU::G_SDIVREM: { 3187 DstDivReg = MI.getOperand(0).getReg(); 3188 DstRemReg = MI.getOperand(1).getReg(); 3189 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3190 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3191 break; 3192 } 3193 } 3194 3195 if (Ty == S32) 3196 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3197 else 3198 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3199 3200 if (DstDivReg) { 3201 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3202 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3203 B.buildSub(DstDivReg, SignXor, Sign); 3204 } 3205 3206 if (DstRemReg) { 3207 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3208 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3209 B.buildSub(DstRemReg, SignXor, Sign); 3210 } 3211 3212 MI.eraseFromParent(); 3213 return true; 3214 } 3215 3216 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3217 MachineRegisterInfo &MRI, 3218 MachineIRBuilder &B) const { 3219 Register Res = MI.getOperand(0).getReg(); 3220 Register LHS = MI.getOperand(1).getReg(); 3221 Register RHS = MI.getOperand(2).getReg(); 3222 uint16_t Flags = MI.getFlags(); 3223 LLT ResTy = MRI.getType(Res); 3224 3225 const MachineFunction &MF = B.getMF(); 3226 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3227 MI.getFlag(MachineInstr::FmAfn); 3228 3229 if (!AllowInaccurateRcp) 3230 return false; 3231 3232 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3233 // 1 / x -> RCP(x) 3234 if (CLHS->isExactlyValue(1.0)) { 3235 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3236 .addUse(RHS) 3237 .setMIFlags(Flags); 3238 3239 MI.eraseFromParent(); 3240 return true; 3241 } 3242 3243 // -1 / x -> RCP( FNEG(x) ) 3244 if (CLHS->isExactlyValue(-1.0)) { 3245 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3246 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3247 .addUse(FNeg.getReg(0)) 3248 .setMIFlags(Flags); 3249 3250 MI.eraseFromParent(); 3251 return true; 3252 } 3253 } 3254 3255 // x / y -> x * (1.0 / y) 3256 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3257 .addUse(RHS) 3258 .setMIFlags(Flags); 3259 B.buildFMul(Res, LHS, RCP, Flags); 3260 3261 MI.eraseFromParent(); 3262 return true; 3263 } 3264 3265 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3266 MachineRegisterInfo &MRI, 3267 MachineIRBuilder &B) const { 3268 Register Res = MI.getOperand(0).getReg(); 3269 Register X = MI.getOperand(1).getReg(); 3270 Register Y = MI.getOperand(2).getReg(); 3271 uint16_t Flags = MI.getFlags(); 3272 LLT ResTy = MRI.getType(Res); 3273 3274 const MachineFunction &MF = B.getMF(); 3275 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3276 MI.getFlag(MachineInstr::FmAfn); 3277 3278 if (!AllowInaccurateRcp) 3279 return false; 3280 3281 auto NegY = B.buildFNeg(ResTy, Y); 3282 auto One = B.buildFConstant(ResTy, 1.0); 3283 3284 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3285 .addUse(Y) 3286 .setMIFlags(Flags); 3287 3288 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3289 R = B.buildFMA(ResTy, Tmp0, R, R); 3290 3291 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3292 R = B.buildFMA(ResTy, Tmp1, R, R); 3293 3294 auto Ret = B.buildFMul(ResTy, X, R); 3295 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3296 3297 B.buildFMA(Res, Tmp2, R, Ret); 3298 MI.eraseFromParent(); 3299 return true; 3300 } 3301 3302 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3303 MachineRegisterInfo &MRI, 3304 MachineIRBuilder &B) const { 3305 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3306 return true; 3307 3308 Register Res = MI.getOperand(0).getReg(); 3309 Register LHS = MI.getOperand(1).getReg(); 3310 Register RHS = MI.getOperand(2).getReg(); 3311 3312 uint16_t Flags = MI.getFlags(); 3313 3314 LLT S16 = LLT::scalar(16); 3315 LLT S32 = LLT::scalar(32); 3316 3317 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3318 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3319 3320 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3321 .addUse(RHSExt.getReg(0)) 3322 .setMIFlags(Flags); 3323 3324 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3325 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3326 3327 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3328 .addUse(RDst.getReg(0)) 3329 .addUse(RHS) 3330 .addUse(LHS) 3331 .setMIFlags(Flags); 3332 3333 MI.eraseFromParent(); 3334 return true; 3335 } 3336 3337 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3338 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3339 static void toggleSPDenormMode(bool Enable, 3340 MachineIRBuilder &B, 3341 const GCNSubtarget &ST, 3342 AMDGPU::SIModeRegisterDefaults Mode) { 3343 // Set SP denorm mode to this value. 3344 unsigned SPDenormMode = 3345 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3346 3347 if (ST.hasDenormModeInst()) { 3348 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3349 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3350 3351 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3352 B.buildInstr(AMDGPU::S_DENORM_MODE) 3353 .addImm(NewDenormModeValue); 3354 3355 } else { 3356 // Select FP32 bit field in mode register. 3357 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3358 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3359 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3360 3361 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3362 .addImm(SPDenormMode) 3363 .addImm(SPDenormModeBitField); 3364 } 3365 } 3366 3367 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3368 MachineRegisterInfo &MRI, 3369 MachineIRBuilder &B) const { 3370 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3371 return true; 3372 3373 Register Res = MI.getOperand(0).getReg(); 3374 Register LHS = MI.getOperand(1).getReg(); 3375 Register RHS = MI.getOperand(2).getReg(); 3376 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3377 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3378 3379 uint16_t Flags = MI.getFlags(); 3380 3381 LLT S32 = LLT::scalar(32); 3382 LLT S1 = LLT::scalar(1); 3383 3384 auto One = B.buildFConstant(S32, 1.0f); 3385 3386 auto DenominatorScaled = 3387 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3388 .addUse(LHS) 3389 .addUse(RHS) 3390 .addImm(0) 3391 .setMIFlags(Flags); 3392 auto NumeratorScaled = 3393 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3394 .addUse(LHS) 3395 .addUse(RHS) 3396 .addImm(1) 3397 .setMIFlags(Flags); 3398 3399 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3400 .addUse(DenominatorScaled.getReg(0)) 3401 .setMIFlags(Flags); 3402 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3403 3404 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3405 // aren't modeled as reading it. 3406 if (!Mode.allFP32Denormals()) 3407 toggleSPDenormMode(true, B, ST, Mode); 3408 3409 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3410 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3411 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3412 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3413 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3414 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3415 3416 if (!Mode.allFP32Denormals()) 3417 toggleSPDenormMode(false, B, ST, Mode); 3418 3419 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3420 .addUse(Fma4.getReg(0)) 3421 .addUse(Fma1.getReg(0)) 3422 .addUse(Fma3.getReg(0)) 3423 .addUse(NumeratorScaled.getReg(1)) 3424 .setMIFlags(Flags); 3425 3426 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3427 .addUse(Fmas.getReg(0)) 3428 .addUse(RHS) 3429 .addUse(LHS) 3430 .setMIFlags(Flags); 3431 3432 MI.eraseFromParent(); 3433 return true; 3434 } 3435 3436 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3437 MachineRegisterInfo &MRI, 3438 MachineIRBuilder &B) const { 3439 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3440 return true; 3441 3442 Register Res = MI.getOperand(0).getReg(); 3443 Register LHS = MI.getOperand(1).getReg(); 3444 Register RHS = MI.getOperand(2).getReg(); 3445 3446 uint16_t Flags = MI.getFlags(); 3447 3448 LLT S64 = LLT::scalar(64); 3449 LLT S1 = LLT::scalar(1); 3450 3451 auto One = B.buildFConstant(S64, 1.0); 3452 3453 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3454 .addUse(LHS) 3455 .addUse(RHS) 3456 .addImm(0) 3457 .setMIFlags(Flags); 3458 3459 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3460 3461 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3462 .addUse(DivScale0.getReg(0)) 3463 .setMIFlags(Flags); 3464 3465 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3466 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3467 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3468 3469 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3470 .addUse(LHS) 3471 .addUse(RHS) 3472 .addImm(1) 3473 .setMIFlags(Flags); 3474 3475 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3476 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3477 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3478 3479 Register Scale; 3480 if (!ST.hasUsableDivScaleConditionOutput()) { 3481 // Workaround a hardware bug on SI where the condition output from div_scale 3482 // is not usable. 3483 3484 LLT S32 = LLT::scalar(32); 3485 3486 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3487 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3488 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3489 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3490 3491 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3492 Scale1Unmerge.getReg(1)); 3493 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3494 Scale0Unmerge.getReg(1)); 3495 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3496 } else { 3497 Scale = DivScale1.getReg(1); 3498 } 3499 3500 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3501 .addUse(Fma4.getReg(0)) 3502 .addUse(Fma3.getReg(0)) 3503 .addUse(Mul.getReg(0)) 3504 .addUse(Scale) 3505 .setMIFlags(Flags); 3506 3507 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3508 .addUse(Fmas.getReg(0)) 3509 .addUse(RHS) 3510 .addUse(LHS) 3511 .setMIFlags(Flags); 3512 3513 MI.eraseFromParent(); 3514 return true; 3515 } 3516 3517 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3518 MachineRegisterInfo &MRI, 3519 MachineIRBuilder &B) const { 3520 Register Res = MI.getOperand(0).getReg(); 3521 Register LHS = MI.getOperand(2).getReg(); 3522 Register RHS = MI.getOperand(3).getReg(); 3523 uint16_t Flags = MI.getFlags(); 3524 3525 LLT S32 = LLT::scalar(32); 3526 LLT S1 = LLT::scalar(1); 3527 3528 auto Abs = B.buildFAbs(S32, RHS, Flags); 3529 const APFloat C0Val(1.0f); 3530 3531 auto C0 = B.buildConstant(S32, 0x6f800000); 3532 auto C1 = B.buildConstant(S32, 0x2f800000); 3533 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3534 3535 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3536 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3537 3538 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3539 3540 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3541 .addUse(Mul0.getReg(0)) 3542 .setMIFlags(Flags); 3543 3544 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3545 3546 B.buildFMul(Res, Sel, Mul1, Flags); 3547 3548 MI.eraseFromParent(); 3549 return true; 3550 } 3551 3552 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3553 // FIXME: Why do we handle this one but not other removed instructions? 3554 // 3555 // Reciprocal square root. The clamp prevents infinite results, clamping 3556 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3557 // +-max_float. 3558 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3559 MachineRegisterInfo &MRI, 3560 MachineIRBuilder &B) const { 3561 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3562 return true; 3563 3564 Register Dst = MI.getOperand(0).getReg(); 3565 Register Src = MI.getOperand(2).getReg(); 3566 auto Flags = MI.getFlags(); 3567 3568 LLT Ty = MRI.getType(Dst); 3569 3570 const fltSemantics *FltSemantics; 3571 if (Ty == LLT::scalar(32)) 3572 FltSemantics = &APFloat::IEEEsingle(); 3573 else if (Ty == LLT::scalar(64)) 3574 FltSemantics = &APFloat::IEEEdouble(); 3575 else 3576 return false; 3577 3578 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3579 .addUse(Src) 3580 .setMIFlags(Flags); 3581 3582 // We don't need to concern ourselves with the snan handling difference, since 3583 // the rsq quieted (or not) so use the one which will directly select. 3584 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3585 const bool UseIEEE = MFI->getMode().IEEE; 3586 3587 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3588 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3589 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3590 3591 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3592 3593 if (UseIEEE) 3594 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3595 else 3596 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3597 MI.eraseFromParent(); 3598 return true; 3599 } 3600 3601 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3602 switch (IID) { 3603 case Intrinsic::amdgcn_ds_fadd: 3604 return AMDGPU::G_ATOMICRMW_FADD; 3605 case Intrinsic::amdgcn_ds_fmin: 3606 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3607 case Intrinsic::amdgcn_ds_fmax: 3608 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3609 default: 3610 llvm_unreachable("not a DS FP intrinsic"); 3611 } 3612 } 3613 3614 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3615 MachineInstr &MI, 3616 Intrinsic::ID IID) const { 3617 GISelChangeObserver &Observer = Helper.Observer; 3618 Observer.changingInstr(MI); 3619 3620 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3621 3622 // The remaining operands were used to set fields in the MemOperand on 3623 // construction. 3624 for (int I = 6; I > 3; --I) 3625 MI.RemoveOperand(I); 3626 3627 MI.RemoveOperand(1); // Remove the intrinsic ID. 3628 Observer.changedInstr(MI); 3629 return true; 3630 } 3631 3632 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3633 MachineRegisterInfo &MRI, 3634 MachineIRBuilder &B) const { 3635 uint64_t Offset = 3636 ST.getTargetLowering()->getImplicitParameterOffset( 3637 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3638 LLT DstTy = MRI.getType(DstReg); 3639 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3640 3641 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3642 if (!loadInputValue(KernargPtrReg, B, 3643 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3644 return false; 3645 3646 // FIXME: This should be nuw 3647 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3648 return true; 3649 } 3650 3651 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3652 MachineRegisterInfo &MRI, 3653 MachineIRBuilder &B) const { 3654 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3655 if (!MFI->isEntryFunction()) { 3656 return legalizePreloadedArgIntrin(MI, MRI, B, 3657 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3658 } 3659 3660 Register DstReg = MI.getOperand(0).getReg(); 3661 if (!getImplicitArgPtr(DstReg, MRI, B)) 3662 return false; 3663 3664 MI.eraseFromParent(); 3665 return true; 3666 } 3667 3668 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3669 MachineRegisterInfo &MRI, 3670 MachineIRBuilder &B, 3671 unsigned AddrSpace) const { 3672 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3673 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3674 Register Hi32 = Unmerge.getReg(1); 3675 3676 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3677 MI.eraseFromParent(); 3678 return true; 3679 } 3680 3681 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3682 // offset (the offset that is included in bounds checking and swizzling, to be 3683 // split between the instruction's voffset and immoffset fields) and soffset 3684 // (the offset that is excluded from bounds checking and swizzling, to go in 3685 // the instruction's soffset field). This function takes the first kind of 3686 // offset and figures out how to split it between voffset and immoffset. 3687 std::pair<Register, unsigned> 3688 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3689 Register OrigOffset) const { 3690 const unsigned MaxImm = 4095; 3691 Register BaseReg; 3692 unsigned ImmOffset; 3693 const LLT S32 = LLT::scalar(32); 3694 MachineRegisterInfo &MRI = *B.getMRI(); 3695 3696 std::tie(BaseReg, ImmOffset) = 3697 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3698 3699 // If BaseReg is a pointer, convert it to int. 3700 if (MRI.getType(BaseReg).isPointer()) 3701 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3702 3703 // If the immediate value is too big for the immoffset field, put the value 3704 // and -4096 into the immoffset field so that the value that is copied/added 3705 // for the voffset field is a multiple of 4096, and it stands more chance 3706 // of being CSEd with the copy/add for another similar load/store. 3707 // However, do not do that rounding down to a multiple of 4096 if that is a 3708 // negative number, as it appears to be illegal to have a negative offset 3709 // in the vgpr, even if adding the immediate offset makes it positive. 3710 unsigned Overflow = ImmOffset & ~MaxImm; 3711 ImmOffset -= Overflow; 3712 if ((int32_t)Overflow < 0) { 3713 Overflow += ImmOffset; 3714 ImmOffset = 0; 3715 } 3716 3717 if (Overflow != 0) { 3718 if (!BaseReg) { 3719 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3720 } else { 3721 auto OverflowVal = B.buildConstant(S32, Overflow); 3722 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3723 } 3724 } 3725 3726 if (!BaseReg) 3727 BaseReg = B.buildConstant(S32, 0).getReg(0); 3728 3729 return std::make_pair(BaseReg, ImmOffset); 3730 } 3731 3732 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3733 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3734 Register VOffset, Register SOffset, 3735 unsigned ImmOffset, Register VIndex, 3736 MachineRegisterInfo &MRI) const { 3737 Optional<ValueAndVReg> MaybeVOffsetVal = 3738 getConstantVRegValWithLookThrough(VOffset, MRI); 3739 Optional<ValueAndVReg> MaybeSOffsetVal = 3740 getConstantVRegValWithLookThrough(SOffset, MRI); 3741 Optional<ValueAndVReg> MaybeVIndexVal = 3742 getConstantVRegValWithLookThrough(VIndex, MRI); 3743 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3744 // update the MMO with that offset. The stride is unknown so we can only do 3745 // this if VIndex is constant 0. 3746 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3747 MaybeVIndexVal->Value == 0) { 3748 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3749 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3750 MMO->setOffset(TotalOffset); 3751 } else { 3752 // We don't have a constant combined offset to use in the MMO. Give up. 3753 MMO->setValue((Value *)nullptr); 3754 } 3755 } 3756 3757 /// Handle register layout difference for f16 images for some subtargets. 3758 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3759 MachineRegisterInfo &MRI, 3760 Register Reg, 3761 bool ImageStore) const { 3762 const LLT S16 = LLT::scalar(16); 3763 const LLT S32 = LLT::scalar(32); 3764 LLT StoreVT = MRI.getType(Reg); 3765 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3766 3767 if (ST.hasUnpackedD16VMem()) { 3768 auto Unmerge = B.buildUnmerge(S16, Reg); 3769 3770 SmallVector<Register, 4> WideRegs; 3771 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3772 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3773 3774 int NumElts = StoreVT.getNumElements(); 3775 3776 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3777 .getReg(0); 3778 } 3779 3780 if (ImageStore && ST.hasImageStoreD16Bug()) { 3781 if (StoreVT.getNumElements() == 2) { 3782 SmallVector<Register, 4> PackedRegs; 3783 Reg = B.buildBitcast(S32, Reg).getReg(0); 3784 PackedRegs.push_back(Reg); 3785 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3786 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3787 .getReg(0); 3788 } 3789 3790 if (StoreVT.getNumElements() == 3) { 3791 SmallVector<Register, 4> PackedRegs; 3792 auto Unmerge = B.buildUnmerge(S16, Reg); 3793 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3794 PackedRegs.push_back(Unmerge.getReg(I)); 3795 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3796 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3797 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3798 } 3799 3800 if (StoreVT.getNumElements() == 4) { 3801 SmallVector<Register, 4> PackedRegs; 3802 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3803 auto Unmerge = B.buildUnmerge(S32, Reg); 3804 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3805 PackedRegs.push_back(Unmerge.getReg(I)); 3806 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3807 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3808 .getReg(0); 3809 } 3810 3811 llvm_unreachable("invalid data type"); 3812 } 3813 3814 return Reg; 3815 } 3816 3817 Register AMDGPULegalizerInfo::fixStoreSourceType( 3818 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3819 MachineRegisterInfo *MRI = B.getMRI(); 3820 LLT Ty = MRI->getType(VData); 3821 3822 const LLT S16 = LLT::scalar(16); 3823 3824 // Fixup illegal register types for i8 stores. 3825 if (Ty == LLT::scalar(8) || Ty == S16) { 3826 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3827 return AnyExt; 3828 } 3829 3830 if (Ty.isVector()) { 3831 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3832 if (IsFormat) 3833 return handleD16VData(B, *MRI, VData); 3834 } 3835 } 3836 3837 return VData; 3838 } 3839 3840 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3841 MachineRegisterInfo &MRI, 3842 MachineIRBuilder &B, 3843 bool IsTyped, 3844 bool IsFormat) const { 3845 Register VData = MI.getOperand(1).getReg(); 3846 LLT Ty = MRI.getType(VData); 3847 LLT EltTy = Ty.getScalarType(); 3848 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3849 const LLT S32 = LLT::scalar(32); 3850 3851 VData = fixStoreSourceType(B, VData, IsFormat); 3852 Register RSrc = MI.getOperand(2).getReg(); 3853 3854 MachineMemOperand *MMO = *MI.memoperands_begin(); 3855 const int MemSize = MMO->getSize(); 3856 3857 unsigned ImmOffset; 3858 3859 // The typed intrinsics add an immediate after the registers. 3860 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3861 3862 // The struct intrinsic variants add one additional operand over raw. 3863 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3864 Register VIndex; 3865 int OpOffset = 0; 3866 if (HasVIndex) { 3867 VIndex = MI.getOperand(3).getReg(); 3868 OpOffset = 1; 3869 } else { 3870 VIndex = B.buildConstant(S32, 0).getReg(0); 3871 } 3872 3873 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3874 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3875 3876 unsigned Format = 0; 3877 if (IsTyped) { 3878 Format = MI.getOperand(5 + OpOffset).getImm(); 3879 ++OpOffset; 3880 } 3881 3882 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3883 3884 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3885 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3886 3887 unsigned Opc; 3888 if (IsTyped) { 3889 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3890 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3891 } else if (IsFormat) { 3892 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3893 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3894 } else { 3895 switch (MemSize) { 3896 case 1: 3897 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3898 break; 3899 case 2: 3900 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3901 break; 3902 default: 3903 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3904 break; 3905 } 3906 } 3907 3908 auto MIB = B.buildInstr(Opc) 3909 .addUse(VData) // vdata 3910 .addUse(RSrc) // rsrc 3911 .addUse(VIndex) // vindex 3912 .addUse(VOffset) // voffset 3913 .addUse(SOffset) // soffset 3914 .addImm(ImmOffset); // offset(imm) 3915 3916 if (IsTyped) 3917 MIB.addImm(Format); 3918 3919 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3920 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3921 .addMemOperand(MMO); 3922 3923 MI.eraseFromParent(); 3924 return true; 3925 } 3926 3927 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3928 MachineRegisterInfo &MRI, 3929 MachineIRBuilder &B, 3930 bool IsFormat, 3931 bool IsTyped) const { 3932 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3933 MachineMemOperand *MMO = *MI.memoperands_begin(); 3934 const LLT MemTy = MMO->getMemoryType(); 3935 const LLT S32 = LLT::scalar(32); 3936 3937 Register Dst = MI.getOperand(0).getReg(); 3938 Register RSrc = MI.getOperand(2).getReg(); 3939 3940 // The typed intrinsics add an immediate after the registers. 3941 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3942 3943 // The struct intrinsic variants add one additional operand over raw. 3944 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3945 Register VIndex; 3946 int OpOffset = 0; 3947 if (HasVIndex) { 3948 VIndex = MI.getOperand(3).getReg(); 3949 OpOffset = 1; 3950 } else { 3951 VIndex = B.buildConstant(S32, 0).getReg(0); 3952 } 3953 3954 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3955 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3956 3957 unsigned Format = 0; 3958 if (IsTyped) { 3959 Format = MI.getOperand(5 + OpOffset).getImm(); 3960 ++OpOffset; 3961 } 3962 3963 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3964 unsigned ImmOffset; 3965 3966 LLT Ty = MRI.getType(Dst); 3967 LLT EltTy = Ty.getScalarType(); 3968 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3969 const bool Unpacked = ST.hasUnpackedD16VMem(); 3970 3971 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3972 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3973 3974 unsigned Opc; 3975 3976 if (IsTyped) { 3977 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3978 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3979 } else if (IsFormat) { 3980 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3981 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3982 } else { 3983 switch (MemTy.getSizeInBits()) { 3984 case 8: 3985 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3986 break; 3987 case 16: 3988 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3989 break; 3990 default: 3991 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3992 break; 3993 } 3994 } 3995 3996 Register LoadDstReg; 3997 3998 bool IsExtLoad = 3999 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4000 LLT UnpackedTy = Ty.changeElementSize(32); 4001 4002 if (IsExtLoad) 4003 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4004 else if (Unpacked && IsD16 && Ty.isVector()) 4005 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4006 else 4007 LoadDstReg = Dst; 4008 4009 auto MIB = B.buildInstr(Opc) 4010 .addDef(LoadDstReg) // vdata 4011 .addUse(RSrc) // rsrc 4012 .addUse(VIndex) // vindex 4013 .addUse(VOffset) // voffset 4014 .addUse(SOffset) // soffset 4015 .addImm(ImmOffset); // offset(imm) 4016 4017 if (IsTyped) 4018 MIB.addImm(Format); 4019 4020 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4021 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4022 .addMemOperand(MMO); 4023 4024 if (LoadDstReg != Dst) { 4025 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4026 4027 // Widen result for extending loads was widened. 4028 if (IsExtLoad) 4029 B.buildTrunc(Dst, LoadDstReg); 4030 else { 4031 // Repack to original 16-bit vector result 4032 // FIXME: G_TRUNC should work, but legalization currently fails 4033 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4034 SmallVector<Register, 4> Repack; 4035 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4036 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4037 B.buildMerge(Dst, Repack); 4038 } 4039 } 4040 4041 MI.eraseFromParent(); 4042 return true; 4043 } 4044 4045 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4046 MachineIRBuilder &B, 4047 bool IsInc) const { 4048 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 4049 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4050 B.buildInstr(Opc) 4051 .addDef(MI.getOperand(0).getReg()) 4052 .addUse(MI.getOperand(2).getReg()) 4053 .addUse(MI.getOperand(3).getReg()) 4054 .cloneMemRefs(MI); 4055 MI.eraseFromParent(); 4056 return true; 4057 } 4058 4059 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4060 switch (IntrID) { 4061 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4062 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4063 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4064 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4065 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4066 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4067 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4068 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4070 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4071 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4072 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4073 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4074 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4075 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4076 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4077 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4078 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4079 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4080 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4081 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4082 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4083 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4085 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4086 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4087 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4088 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4089 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4090 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4091 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4092 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4093 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4094 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4095 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4096 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4097 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4098 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4100 case Intrinsic::amdgcn_buffer_atomic_fadd: 4101 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4102 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4103 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4104 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4105 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4106 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4107 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4108 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4110 default: 4111 llvm_unreachable("unhandled atomic opcode"); 4112 } 4113 } 4114 4115 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4116 MachineIRBuilder &B, 4117 Intrinsic::ID IID) const { 4118 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4119 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4120 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4121 4122 Register Dst; 4123 4124 int OpOffset = 0; 4125 if (HasReturn) { 4126 // A few FP atomics do not support return values. 4127 Dst = MI.getOperand(0).getReg(); 4128 } else { 4129 OpOffset = -1; 4130 } 4131 4132 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4133 Register CmpVal; 4134 4135 if (IsCmpSwap) { 4136 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4137 ++OpOffset; 4138 } 4139 4140 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4141 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4142 4143 // The struct intrinsic variants add one additional operand over raw. 4144 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4145 Register VIndex; 4146 if (HasVIndex) { 4147 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4148 ++OpOffset; 4149 } else { 4150 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4151 } 4152 4153 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4154 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4155 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4156 4157 MachineMemOperand *MMO = *MI.memoperands_begin(); 4158 4159 unsigned ImmOffset; 4160 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4161 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4162 4163 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4164 4165 if (HasReturn) 4166 MIB.addDef(Dst); 4167 4168 MIB.addUse(VData); // vdata 4169 4170 if (IsCmpSwap) 4171 MIB.addReg(CmpVal); 4172 4173 MIB.addUse(RSrc) // rsrc 4174 .addUse(VIndex) // vindex 4175 .addUse(VOffset) // voffset 4176 .addUse(SOffset) // soffset 4177 .addImm(ImmOffset) // offset(imm) 4178 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4179 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4180 .addMemOperand(MMO); 4181 4182 MI.eraseFromParent(); 4183 return true; 4184 } 4185 4186 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4187 /// vector with s16 typed elements. 4188 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4189 SmallVectorImpl<Register> &PackedAddrs, 4190 unsigned ArgOffset, 4191 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4192 bool IsA16, bool IsG16) { 4193 const LLT S16 = LLT::scalar(16); 4194 const LLT V2S16 = LLT::fixed_vector(2, 16); 4195 auto EndIdx = Intr->VAddrEnd; 4196 4197 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4198 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4199 if (!SrcOp.isReg()) 4200 continue; // _L to _LZ may have eliminated this. 4201 4202 Register AddrReg = SrcOp.getReg(); 4203 4204 if ((I < Intr->GradientStart) || 4205 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4206 (I >= Intr->CoordStart && !IsA16)) { 4207 // Handle any gradient or coordinate operands that should not be packed 4208 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4209 PackedAddrs.push_back(AddrReg); 4210 } else { 4211 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4212 // derivatives dx/dh and dx/dv are packed with undef. 4213 if (((I + 1) >= EndIdx) || 4214 ((Intr->NumGradients / 2) % 2 == 1 && 4215 (I == static_cast<unsigned>(Intr->GradientStart + 4216 (Intr->NumGradients / 2) - 1) || 4217 I == static_cast<unsigned>(Intr->GradientStart + 4218 Intr->NumGradients - 1))) || 4219 // Check for _L to _LZ optimization 4220 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4221 PackedAddrs.push_back( 4222 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4223 .getReg(0)); 4224 } else { 4225 PackedAddrs.push_back( 4226 B.buildBuildVector( 4227 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4228 .getReg(0)); 4229 ++I; 4230 } 4231 } 4232 } 4233 } 4234 4235 /// Convert from separate vaddr components to a single vector address register, 4236 /// and replace the remaining operands with $noreg. 4237 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4238 int DimIdx, int NumVAddrs) { 4239 const LLT S32 = LLT::scalar(32); 4240 4241 SmallVector<Register, 8> AddrRegs; 4242 for (int I = 0; I != NumVAddrs; ++I) { 4243 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4244 if (SrcOp.isReg()) { 4245 AddrRegs.push_back(SrcOp.getReg()); 4246 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4247 } 4248 } 4249 4250 int NumAddrRegs = AddrRegs.size(); 4251 if (NumAddrRegs != 1) { 4252 // Above 8 elements round up to next power of 2 (i.e. 16). 4253 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4254 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4255 auto Undef = B.buildUndef(S32); 4256 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4257 NumAddrRegs = RoundedNumRegs; 4258 } 4259 4260 auto VAddr = 4261 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4262 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4263 } 4264 4265 for (int I = 1; I != NumVAddrs; ++I) { 4266 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4267 if (SrcOp.isReg()) 4268 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4269 } 4270 } 4271 4272 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4273 /// 4274 /// Depending on the subtarget, load/store with 16-bit element data need to be 4275 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4276 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4277 /// registers. 4278 /// 4279 /// We don't want to directly select image instructions just yet, but also want 4280 /// to exposes all register repacking to the legalizer/combiners. We also don't 4281 /// want a selected instrution entering RegBankSelect. In order to avoid 4282 /// defining a multitude of intermediate image instructions, directly hack on 4283 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 4284 /// now unnecessary arguments with $noreg. 4285 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4286 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4287 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4288 4289 const unsigned NumDefs = MI.getNumExplicitDefs(); 4290 const unsigned ArgOffset = NumDefs + 1; 4291 bool IsTFE = NumDefs == 2; 4292 // We are only processing the operands of d16 image operations on subtargets 4293 // that use the unpacked register layout, or need to repack the TFE result. 4294 4295 // TODO: Do we need to guard against already legalized intrinsics? 4296 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4297 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4298 4299 MachineRegisterInfo *MRI = B.getMRI(); 4300 const LLT S32 = LLT::scalar(32); 4301 const LLT S16 = LLT::scalar(16); 4302 const LLT V2S16 = LLT::fixed_vector(2, 16); 4303 4304 unsigned DMask = 0; 4305 4306 // Check for 16 bit addresses and pack if true. 4307 LLT GradTy = 4308 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4309 LLT AddrTy = 4310 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4311 const bool IsG16 = GradTy == S16; 4312 const bool IsA16 = AddrTy == S16; 4313 4314 int DMaskLanes = 0; 4315 if (!BaseOpcode->Atomic) { 4316 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4317 if (BaseOpcode->Gather4) { 4318 DMaskLanes = 4; 4319 } else if (DMask != 0) { 4320 DMaskLanes = countPopulation(DMask); 4321 } else if (!IsTFE && !BaseOpcode->Store) { 4322 // If dmask is 0, this is a no-op load. This can be eliminated. 4323 B.buildUndef(MI.getOperand(0)); 4324 MI.eraseFromParent(); 4325 return true; 4326 } 4327 } 4328 4329 Observer.changingInstr(MI); 4330 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4331 4332 unsigned NewOpcode = NumDefs == 0 ? 4333 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4334 4335 // Track that we legalized this 4336 MI.setDesc(B.getTII().get(NewOpcode)); 4337 4338 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4339 // dmask to be at least 1 otherwise the instruction will fail 4340 if (IsTFE && DMask == 0) { 4341 DMask = 0x1; 4342 DMaskLanes = 1; 4343 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4344 } 4345 4346 if (BaseOpcode->Atomic) { 4347 Register VData0 = MI.getOperand(2).getReg(); 4348 LLT Ty = MRI->getType(VData0); 4349 4350 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4351 if (Ty.isVector()) 4352 return false; 4353 4354 if (BaseOpcode->AtomicX2) { 4355 Register VData1 = MI.getOperand(3).getReg(); 4356 // The two values are packed in one register. 4357 LLT PackedTy = LLT::fixed_vector(2, Ty); 4358 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4359 MI.getOperand(2).setReg(Concat.getReg(0)); 4360 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4361 } 4362 } 4363 4364 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4365 4366 // Optimize _L to _LZ when _L is zero 4367 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4368 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4369 const ConstantFP *ConstantLod; 4370 4371 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4372 m_GFCst(ConstantLod))) { 4373 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4374 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4375 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4376 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4377 Intr->Dim); 4378 4379 // The starting indexes should remain in the same place. 4380 --CorrectedNumVAddrs; 4381 4382 MI.getOperand(MI.getNumExplicitDefs()) 4383 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4384 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4385 Intr = NewImageDimIntr; 4386 } 4387 } 4388 } 4389 4390 // Optimize _mip away, when 'lod' is zero 4391 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4392 int64_t ConstantLod; 4393 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4394 m_ICst(ConstantLod))) { 4395 if (ConstantLod == 0) { 4396 // TODO: Change intrinsic opcode and remove operand instead or replacing 4397 // it with 0, as the _L to _LZ handling is done above. 4398 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4399 --CorrectedNumVAddrs; 4400 } 4401 } 4402 } 4403 4404 // Rewrite the addressing register layout before doing anything else. 4405 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4406 // 16 bit gradients are supported, but are tied to the A16 control 4407 // so both gradients and addresses must be 16 bit 4408 return false; 4409 } 4410 4411 if (IsA16 && !ST.hasA16()) { 4412 // A16 not supported 4413 return false; 4414 } 4415 4416 if (IsA16 || IsG16) { 4417 if (Intr->NumVAddrs > 1) { 4418 SmallVector<Register, 4> PackedRegs; 4419 4420 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4421 IsG16); 4422 4423 // See also below in the non-a16 branch 4424 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4425 PackedRegs.size() <= ST.getNSAMaxSize(); 4426 4427 if (!UseNSA && PackedRegs.size() > 1) { 4428 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4429 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4430 PackedRegs[0] = Concat.getReg(0); 4431 PackedRegs.resize(1); 4432 } 4433 4434 const unsigned NumPacked = PackedRegs.size(); 4435 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4436 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4437 if (!SrcOp.isReg()) { 4438 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4439 continue; 4440 } 4441 4442 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4443 4444 if (I - Intr->VAddrStart < NumPacked) 4445 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4446 else 4447 SrcOp.setReg(AMDGPU::NoRegister); 4448 } 4449 } 4450 } else { 4451 // If the register allocator cannot place the address registers contiguously 4452 // without introducing moves, then using the non-sequential address encoding 4453 // is always preferable, since it saves VALU instructions and is usually a 4454 // wash in terms of code size or even better. 4455 // 4456 // However, we currently have no way of hinting to the register allocator 4457 // that MIMG addresses should be placed contiguously when it is possible to 4458 // do so, so force non-NSA for the common 2-address case as a heuristic. 4459 // 4460 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4461 // allocation when possible. 4462 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4463 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4464 4465 if (!UseNSA && Intr->NumVAddrs > 1) 4466 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4467 Intr->NumVAddrs); 4468 } 4469 4470 int Flags = 0; 4471 if (IsA16) 4472 Flags |= 1; 4473 if (IsG16) 4474 Flags |= 2; 4475 MI.addOperand(MachineOperand::CreateImm(Flags)); 4476 4477 if (BaseOpcode->Store) { // No TFE for stores? 4478 // TODO: Handle dmask trim 4479 Register VData = MI.getOperand(1).getReg(); 4480 LLT Ty = MRI->getType(VData); 4481 if (!Ty.isVector() || Ty.getElementType() != S16) 4482 return true; 4483 4484 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4485 if (RepackedReg != VData) { 4486 MI.getOperand(1).setReg(RepackedReg); 4487 } 4488 4489 return true; 4490 } 4491 4492 Register DstReg = MI.getOperand(0).getReg(); 4493 LLT Ty = MRI->getType(DstReg); 4494 const LLT EltTy = Ty.getScalarType(); 4495 const bool IsD16 = Ty.getScalarType() == S16; 4496 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4497 4498 // Confirm that the return type is large enough for the dmask specified 4499 if (NumElts < DMaskLanes) 4500 return false; 4501 4502 if (NumElts > 4 || DMaskLanes > 4) 4503 return false; 4504 4505 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4506 const LLT AdjustedTy = 4507 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4508 4509 // The raw dword aligned data component of the load. The only legal cases 4510 // where this matters should be when using the packed D16 format, for 4511 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4512 LLT RoundedTy; 4513 4514 // S32 vector to to cover all data, plus TFE result element. 4515 LLT TFETy; 4516 4517 // Register type to use for each loaded component. Will be S32 or V2S16. 4518 LLT RegTy; 4519 4520 if (IsD16 && ST.hasUnpackedD16VMem()) { 4521 RoundedTy = 4522 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4523 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4524 RegTy = S32; 4525 } else { 4526 unsigned EltSize = EltTy.getSizeInBits(); 4527 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4528 unsigned RoundedSize = 32 * RoundedElts; 4529 RoundedTy = LLT::scalarOrVector( 4530 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4531 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4532 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4533 } 4534 4535 // The return type does not need adjustment. 4536 // TODO: Should we change s16 case to s32 or <2 x s16>? 4537 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4538 return true; 4539 4540 Register Dst1Reg; 4541 4542 // Insert after the instruction. 4543 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4544 4545 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4546 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4547 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4548 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4549 4550 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4551 4552 MI.getOperand(0).setReg(NewResultReg); 4553 4554 // In the IR, TFE is supposed to be used with a 2 element struct return 4555 // type. The intruction really returns these two values in one contiguous 4556 // register, with one additional dword beyond the loaded data. Rewrite the 4557 // return type to use a single register result. 4558 4559 if (IsTFE) { 4560 Dst1Reg = MI.getOperand(1).getReg(); 4561 if (MRI->getType(Dst1Reg) != S32) 4562 return false; 4563 4564 // TODO: Make sure the TFE operand bit is set. 4565 MI.RemoveOperand(1); 4566 4567 // Handle the easy case that requires no repack instructions. 4568 if (Ty == S32) { 4569 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4570 return true; 4571 } 4572 } 4573 4574 // Now figure out how to copy the new result register back into the old 4575 // result. 4576 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4577 4578 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4579 4580 if (ResultNumRegs == 1) { 4581 assert(!IsTFE); 4582 ResultRegs[0] = NewResultReg; 4583 } else { 4584 // We have to repack into a new vector of some kind. 4585 for (int I = 0; I != NumDataRegs; ++I) 4586 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4587 B.buildUnmerge(ResultRegs, NewResultReg); 4588 4589 // Drop the final TFE element to get the data part. The TFE result is 4590 // directly written to the right place already. 4591 if (IsTFE) 4592 ResultRegs.resize(NumDataRegs); 4593 } 4594 4595 // For an s16 scalar result, we form an s32 result with a truncate regardless 4596 // of packed vs. unpacked. 4597 if (IsD16 && !Ty.isVector()) { 4598 B.buildTrunc(DstReg, ResultRegs[0]); 4599 return true; 4600 } 4601 4602 // Avoid a build/concat_vector of 1 entry. 4603 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4604 B.buildBitcast(DstReg, ResultRegs[0]); 4605 return true; 4606 } 4607 4608 assert(Ty.isVector()); 4609 4610 if (IsD16) { 4611 // For packed D16 results with TFE enabled, all the data components are 4612 // S32. Cast back to the expected type. 4613 // 4614 // TODO: We don't really need to use load s32 elements. We would only need one 4615 // cast for the TFE result if a multiple of v2s16 was used. 4616 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4617 for (Register &Reg : ResultRegs) 4618 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4619 } else if (ST.hasUnpackedD16VMem()) { 4620 for (Register &Reg : ResultRegs) 4621 Reg = B.buildTrunc(S16, Reg).getReg(0); 4622 } 4623 } 4624 4625 auto padWithUndef = [&](LLT Ty, int NumElts) { 4626 if (NumElts == 0) 4627 return; 4628 Register Undef = B.buildUndef(Ty).getReg(0); 4629 for (int I = 0; I != NumElts; ++I) 4630 ResultRegs.push_back(Undef); 4631 }; 4632 4633 // Pad out any elements eliminated due to the dmask. 4634 LLT ResTy = MRI->getType(ResultRegs[0]); 4635 if (!ResTy.isVector()) { 4636 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4637 B.buildBuildVector(DstReg, ResultRegs); 4638 return true; 4639 } 4640 4641 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4642 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4643 4644 // Deal with the one annoying legal case. 4645 const LLT V3S16 = LLT::fixed_vector(3, 16); 4646 if (Ty == V3S16) { 4647 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4648 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4649 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4650 return true; 4651 } 4652 4653 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4654 B.buildConcatVectors(DstReg, ResultRegs); 4655 return true; 4656 } 4657 4658 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4659 LegalizerHelper &Helper, MachineInstr &MI) const { 4660 MachineIRBuilder &B = Helper.MIRBuilder; 4661 GISelChangeObserver &Observer = Helper.Observer; 4662 4663 Register Dst = MI.getOperand(0).getReg(); 4664 LLT Ty = B.getMRI()->getType(Dst); 4665 unsigned Size = Ty.getSizeInBits(); 4666 MachineFunction &MF = B.getMF(); 4667 4668 Observer.changingInstr(MI); 4669 4670 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4671 Ty = getBitcastRegisterType(Ty); 4672 Helper.bitcastDst(MI, Ty, 0); 4673 Dst = MI.getOperand(0).getReg(); 4674 B.setInsertPt(B.getMBB(), MI); 4675 } 4676 4677 // FIXME: We don't really need this intermediate instruction. The intrinsic 4678 // should be fixed to have a memory operand. Since it's readnone, we're not 4679 // allowed to add one. 4680 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4681 MI.RemoveOperand(1); // Remove intrinsic ID 4682 4683 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4684 // TODO: Should this use datalayout alignment? 4685 const unsigned MemSize = (Size + 7) / 8; 4686 const Align MemAlign(4); 4687 MachineMemOperand *MMO = MF.getMachineMemOperand( 4688 MachinePointerInfo(), 4689 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4690 MachineMemOperand::MOInvariant, 4691 MemSize, MemAlign); 4692 MI.addMemOperand(MF, MMO); 4693 4694 // There are no 96-bit result scalar loads, but widening to 128-bit should 4695 // always be legal. We may need to restore this to a 96-bit result if it turns 4696 // out this needs to be converted to a vector load during RegBankSelect. 4697 if (!isPowerOf2_32(Size)) { 4698 if (Ty.isVector()) 4699 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4700 else 4701 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4702 } 4703 4704 Observer.changedInstr(MI); 4705 return true; 4706 } 4707 4708 // TODO: Move to selection 4709 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4710 MachineRegisterInfo &MRI, 4711 MachineIRBuilder &B) const { 4712 if (!ST.isTrapHandlerEnabled() || 4713 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4714 return legalizeTrapEndpgm(MI, MRI, B); 4715 4716 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4717 switch (*HsaAbiVer) { 4718 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4719 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4720 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4721 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4722 return ST.supportsGetDoorbellID() ? 4723 legalizeTrapHsa(MI, MRI, B) : 4724 legalizeTrapHsaQueuePtr(MI, MRI, B); 4725 } 4726 } 4727 4728 llvm_unreachable("Unknown trap handler"); 4729 } 4730 4731 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4732 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4733 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4734 MI.eraseFromParent(); 4735 return true; 4736 } 4737 4738 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4739 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4740 // Pass queue pointer to trap handler as input, and insert trap instruction 4741 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4742 Register LiveIn = 4743 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4744 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4745 return false; 4746 4747 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4748 B.buildCopy(SGPR01, LiveIn); 4749 B.buildInstr(AMDGPU::S_TRAP) 4750 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4751 .addReg(SGPR01, RegState::Implicit); 4752 4753 MI.eraseFromParent(); 4754 return true; 4755 } 4756 4757 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4758 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4759 B.buildInstr(AMDGPU::S_TRAP) 4760 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4761 MI.eraseFromParent(); 4762 return true; 4763 } 4764 4765 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4766 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4767 // Is non-HSA path or trap-handler disabled? then, report a warning 4768 // accordingly 4769 if (!ST.isTrapHandlerEnabled() || 4770 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4771 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4772 "debugtrap handler not supported", 4773 MI.getDebugLoc(), DS_Warning); 4774 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4775 Ctx.diagnose(NoTrap); 4776 } else { 4777 // Insert debug-trap instruction 4778 B.buildInstr(AMDGPU::S_TRAP) 4779 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4780 } 4781 4782 MI.eraseFromParent(); 4783 return true; 4784 } 4785 4786 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4787 MachineIRBuilder &B) const { 4788 MachineRegisterInfo &MRI = *B.getMRI(); 4789 const LLT S16 = LLT::scalar(16); 4790 const LLT S32 = LLT::scalar(32); 4791 4792 Register DstReg = MI.getOperand(0).getReg(); 4793 Register NodePtr = MI.getOperand(2).getReg(); 4794 Register RayExtent = MI.getOperand(3).getReg(); 4795 Register RayOrigin = MI.getOperand(4).getReg(); 4796 Register RayDir = MI.getOperand(5).getReg(); 4797 Register RayInvDir = MI.getOperand(6).getReg(); 4798 Register TDescr = MI.getOperand(7).getReg(); 4799 4800 if (!ST.hasGFX10_AEncoding()) { 4801 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4802 "intrinsic not supported on subtarget", 4803 MI.getDebugLoc()); 4804 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4805 return false; 4806 } 4807 4808 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4809 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4810 const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4811 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 4812 const unsigned Opcodes[2][2][2] = { 4813 {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa, 4814 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa}, 4815 {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa, 4816 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}}, 4817 {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa, 4818 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa}, 4819 {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa, 4820 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}}; 4821 const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64]; 4822 4823 SmallVector<Register, 12> Ops; 4824 if (Is64) { 4825 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4826 Ops.push_back(Unmerge.getReg(0)); 4827 Ops.push_back(Unmerge.getReg(1)); 4828 } else { 4829 Ops.push_back(NodePtr); 4830 } 4831 Ops.push_back(RayExtent); 4832 4833 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4834 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4835 Ops.push_back(Unmerge.getReg(0)); 4836 Ops.push_back(Unmerge.getReg(1)); 4837 Ops.push_back(Unmerge.getReg(2)); 4838 }; 4839 4840 packLanes(RayOrigin); 4841 if (IsA16) { 4842 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4843 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4844 Register R1 = MRI.createGenericVirtualRegister(S32); 4845 Register R2 = MRI.createGenericVirtualRegister(S32); 4846 Register R3 = MRI.createGenericVirtualRegister(S32); 4847 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4848 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4849 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4850 Ops.push_back(R1); 4851 Ops.push_back(R2); 4852 Ops.push_back(R3); 4853 } else { 4854 packLanes(RayDir); 4855 packLanes(RayInvDir); 4856 } 4857 4858 if (!UseNSA) { 4859 // Build a single vector containing all the operands so far prepared. 4860 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4861 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4862 Ops.clear(); 4863 Ops.push_back(MergedOps); 4864 } 4865 4866 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4867 .addDef(DstReg) 4868 .addImm(Opcode); 4869 4870 for (Register R : Ops) { 4871 MIB.addUse(R); 4872 } 4873 4874 MIB.addUse(TDescr) 4875 .addImm(IsA16 ? 1 : 0) 4876 .cloneMemRefs(MI); 4877 4878 MI.eraseFromParent(); 4879 return true; 4880 } 4881 4882 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4883 MachineInstr &MI) const { 4884 MachineIRBuilder &B = Helper.MIRBuilder; 4885 MachineRegisterInfo &MRI = *B.getMRI(); 4886 4887 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4888 auto IntrID = MI.getIntrinsicID(); 4889 switch (IntrID) { 4890 case Intrinsic::amdgcn_if: 4891 case Intrinsic::amdgcn_else: { 4892 MachineInstr *Br = nullptr; 4893 MachineBasicBlock *UncondBrTarget = nullptr; 4894 bool Negated = false; 4895 if (MachineInstr *BrCond = 4896 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4897 const SIRegisterInfo *TRI 4898 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4899 4900 Register Def = MI.getOperand(1).getReg(); 4901 Register Use = MI.getOperand(3).getReg(); 4902 4903 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4904 4905 if (Negated) 4906 std::swap(CondBrTarget, UncondBrTarget); 4907 4908 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4909 if (IntrID == Intrinsic::amdgcn_if) { 4910 B.buildInstr(AMDGPU::SI_IF) 4911 .addDef(Def) 4912 .addUse(Use) 4913 .addMBB(UncondBrTarget); 4914 } else { 4915 B.buildInstr(AMDGPU::SI_ELSE) 4916 .addDef(Def) 4917 .addUse(Use) 4918 .addMBB(UncondBrTarget); 4919 } 4920 4921 if (Br) { 4922 Br->getOperand(0).setMBB(CondBrTarget); 4923 } else { 4924 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4925 // since we're swapping branch targets it needs to be reinserted. 4926 // FIXME: IRTranslator should probably not do this 4927 B.buildBr(*CondBrTarget); 4928 } 4929 4930 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4931 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4932 MI.eraseFromParent(); 4933 BrCond->eraseFromParent(); 4934 return true; 4935 } 4936 4937 return false; 4938 } 4939 case Intrinsic::amdgcn_loop: { 4940 MachineInstr *Br = nullptr; 4941 MachineBasicBlock *UncondBrTarget = nullptr; 4942 bool Negated = false; 4943 if (MachineInstr *BrCond = 4944 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4945 const SIRegisterInfo *TRI 4946 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4947 4948 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4949 Register Reg = MI.getOperand(2).getReg(); 4950 4951 if (Negated) 4952 std::swap(CondBrTarget, UncondBrTarget); 4953 4954 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4955 B.buildInstr(AMDGPU::SI_LOOP) 4956 .addUse(Reg) 4957 .addMBB(UncondBrTarget); 4958 4959 if (Br) 4960 Br->getOperand(0).setMBB(CondBrTarget); 4961 else 4962 B.buildBr(*CondBrTarget); 4963 4964 MI.eraseFromParent(); 4965 BrCond->eraseFromParent(); 4966 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4967 return true; 4968 } 4969 4970 return false; 4971 } 4972 case Intrinsic::amdgcn_kernarg_segment_ptr: 4973 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4974 // This only makes sense to call in a kernel, so just lower to null. 4975 B.buildConstant(MI.getOperand(0).getReg(), 0); 4976 MI.eraseFromParent(); 4977 return true; 4978 } 4979 4980 return legalizePreloadedArgIntrin( 4981 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4982 case Intrinsic::amdgcn_implicitarg_ptr: 4983 return legalizeImplicitArgPtr(MI, MRI, B); 4984 case Intrinsic::amdgcn_workitem_id_x: 4985 return legalizePreloadedArgIntrin(MI, MRI, B, 4986 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4987 case Intrinsic::amdgcn_workitem_id_y: 4988 return legalizePreloadedArgIntrin(MI, MRI, B, 4989 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4990 case Intrinsic::amdgcn_workitem_id_z: 4991 return legalizePreloadedArgIntrin(MI, MRI, B, 4992 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4993 case Intrinsic::amdgcn_workgroup_id_x: 4994 return legalizePreloadedArgIntrin(MI, MRI, B, 4995 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4996 case Intrinsic::amdgcn_workgroup_id_y: 4997 return legalizePreloadedArgIntrin(MI, MRI, B, 4998 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4999 case Intrinsic::amdgcn_workgroup_id_z: 5000 return legalizePreloadedArgIntrin(MI, MRI, B, 5001 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 5002 case Intrinsic::amdgcn_dispatch_ptr: 5003 return legalizePreloadedArgIntrin(MI, MRI, B, 5004 AMDGPUFunctionArgInfo::DISPATCH_PTR); 5005 case Intrinsic::amdgcn_queue_ptr: 5006 return legalizePreloadedArgIntrin(MI, MRI, B, 5007 AMDGPUFunctionArgInfo::QUEUE_PTR); 5008 case Intrinsic::amdgcn_implicit_buffer_ptr: 5009 return legalizePreloadedArgIntrin( 5010 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 5011 case Intrinsic::amdgcn_dispatch_id: 5012 return legalizePreloadedArgIntrin(MI, MRI, B, 5013 AMDGPUFunctionArgInfo::DISPATCH_ID); 5014 case Intrinsic::amdgcn_fdiv_fast: 5015 return legalizeFDIVFastIntrin(MI, MRI, B); 5016 case Intrinsic::amdgcn_is_shared: 5017 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 5018 case Intrinsic::amdgcn_is_private: 5019 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 5020 case Intrinsic::amdgcn_wavefrontsize: { 5021 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 5022 MI.eraseFromParent(); 5023 return true; 5024 } 5025 case Intrinsic::amdgcn_s_buffer_load: 5026 return legalizeSBufferLoad(Helper, MI); 5027 case Intrinsic::amdgcn_raw_buffer_store: 5028 case Intrinsic::amdgcn_struct_buffer_store: 5029 return legalizeBufferStore(MI, MRI, B, false, false); 5030 case Intrinsic::amdgcn_raw_buffer_store_format: 5031 case Intrinsic::amdgcn_struct_buffer_store_format: 5032 return legalizeBufferStore(MI, MRI, B, false, true); 5033 case Intrinsic::amdgcn_raw_tbuffer_store: 5034 case Intrinsic::amdgcn_struct_tbuffer_store: 5035 return legalizeBufferStore(MI, MRI, B, true, true); 5036 case Intrinsic::amdgcn_raw_buffer_load: 5037 case Intrinsic::amdgcn_struct_buffer_load: 5038 return legalizeBufferLoad(MI, MRI, B, false, false); 5039 case Intrinsic::amdgcn_raw_buffer_load_format: 5040 case Intrinsic::amdgcn_struct_buffer_load_format: 5041 return legalizeBufferLoad(MI, MRI, B, true, false); 5042 case Intrinsic::amdgcn_raw_tbuffer_load: 5043 case Intrinsic::amdgcn_struct_tbuffer_load: 5044 return legalizeBufferLoad(MI, MRI, B, true, true); 5045 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 5046 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 5047 case Intrinsic::amdgcn_raw_buffer_atomic_add: 5048 case Intrinsic::amdgcn_struct_buffer_atomic_add: 5049 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 5050 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 5051 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 5052 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 5053 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 5054 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 5055 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 5056 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 5057 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 5058 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 5059 case Intrinsic::amdgcn_raw_buffer_atomic_and: 5060 case Intrinsic::amdgcn_struct_buffer_atomic_and: 5061 case Intrinsic::amdgcn_raw_buffer_atomic_or: 5062 case Intrinsic::amdgcn_struct_buffer_atomic_or: 5063 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 5064 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 5065 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 5066 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 5067 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 5068 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 5069 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 5070 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 5071 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 5072 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5073 case Intrinsic::amdgcn_buffer_atomic_fadd: 5074 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5075 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5076 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5077 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 5078 return legalizeBufferAtomic(MI, B, IntrID); 5079 case Intrinsic::amdgcn_atomic_inc: 5080 return legalizeAtomicIncDec(MI, B, true); 5081 case Intrinsic::amdgcn_atomic_dec: 5082 return legalizeAtomicIncDec(MI, B, false); 5083 case Intrinsic::trap: 5084 return legalizeTrapIntrinsic(MI, MRI, B); 5085 case Intrinsic::debugtrap: 5086 return legalizeDebugTrapIntrinsic(MI, MRI, B); 5087 case Intrinsic::amdgcn_rsq_clamp: 5088 return legalizeRsqClampIntrinsic(MI, MRI, B); 5089 case Intrinsic::amdgcn_ds_fadd: 5090 case Intrinsic::amdgcn_ds_fmin: 5091 case Intrinsic::amdgcn_ds_fmax: 5092 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 5093 case Intrinsic::amdgcn_image_bvh_intersect_ray: 5094 return legalizeBVHIntrinsic(MI, B); 5095 default: { 5096 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 5097 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 5098 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 5099 return true; 5100 } 5101 } 5102 5103 return true; 5104 } 5105