1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45
46 static constexpr unsigned MaxRegisterSize = 1024;
47
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61
62 /// \returns true if this is an odd sized vector which should be widened by
63 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
64 /// This excludes s1 vectors, which should always be scalarized.
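/// For example, <5 x s16> matches (and is widened to <6 x s16> by the
/// accompanying oneMoreElement mutation), while <4 x s16> and <3 x s32> do not.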
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
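  // For example, a non-extending <8 x s8> load (64 bits, with an element type
  // that can't be used directly in registers) returns true here and is coerced
  // to <2 x s32> by getBitcastRegisterType().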
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 unsigned AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .minScalar(0, S16) 536 .clampMaxNumElementsStrict(0, S16, 2) 537 .widenScalarToNextMultipleOf(0, 32) 538 .maxScalar(0, S32) 539 .scalarize(0); 540 541 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 542 .legalFor({S32, S16, V2S16}) // Clamp modifier 543 .minScalarOrElt(0, S16) 544 .clampMaxNumElementsStrict(0, S16, 2) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32) 547 .lower(); 548 } else if (ST.has16BitInsts()) { 549 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 550 .legalFor({S32, S16}) 551 .minScalar(0, S16) 552 .widenScalarToNextMultipleOf(0, 32) 553 .maxScalar(0, S32) 554 .scalarize(0); 555 556 // Technically the saturating operations require clamp bit support, but this 557 // was introduced at the same time as 16-bit operations. 
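    // That is, has16BitInsts() implies the clamp bit is available, so these can
    // be marked legal here without separately checking hasIntClamp().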
558 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 559 .legalFor({S32, S16}) // Clamp modifier 560 .minScalar(0, S16) 561 .scalarize(0) 562 .widenScalarToNextPow2(0, 16) 563 .lower(); 564 565 // We're just lowering this, but it helps get a better result to try to 566 // coerce to the desired type first. 567 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 568 .minScalar(0, S16) 569 .scalarize(0) 570 .lower(); 571 } else { 572 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 573 .legalFor({S32}) 574 .widenScalarToNextMultipleOf(0, 32) 575 .clampScalar(0, S32, S32) 576 .scalarize(0); 577 578 if (ST.hasIntClamp()) { 579 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 580 .legalFor({S32}) // Clamp modifier. 581 .scalarize(0) 582 .minScalarOrElt(0, S32) 583 .lower(); 584 } else { 585 // Clamp bit support was added in VI, along with 16-bit operations. 586 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 587 .minScalar(0, S32) 588 .scalarize(0) 589 .lower(); 590 } 591 592 // FIXME: DAG expansion gets better results. The widening uses the smaller 593 // range values and goes for the min/max lowering directly. 594 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 595 .minScalar(0, S32) 596 .scalarize(0) 597 .lower(); 598 } 599 600 getActionDefinitionsBuilder( 601 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 602 .customFor({S32, S64}) 603 .clampScalar(0, S32, S64) 604 .widenScalarToNextPow2(0, 32) 605 .scalarize(0); 606 607 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 608 .legalFor({S32}) 609 .maxScalar(0, S32); 610 611 if (ST.hasVOP3PInsts()) { 612 Mulh 613 .clampMaxNumElements(0, S8, 2) 614 .lowerFor({V2S8}); 615 } 616 617 Mulh 618 .scalarize(0) 619 .lower(); 620 621 // Report legal for any types we can handle anywhere. For the cases only legal 622 // on the SALU, RegBankSelect will be able to re-legalize. 623 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 624 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 625 .clampScalar(0, S32, S64) 626 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 627 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 628 .widenScalarToNextPow2(0) 629 .scalarize(0); 630 631 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 632 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 633 .legalFor({{S32, S1}, {S32, S32}}) 634 .minScalar(0, S32) 635 // TODO: .scalarize(0) 636 .lower(); 637 638 getActionDefinitionsBuilder(G_BITCAST) 639 // Don't worry about the size constraint. 640 .legalIf(all(isRegisterType(0), isRegisterType(1))) 641 .lower(); 642 643 644 getActionDefinitionsBuilder(G_CONSTANT) 645 .legalFor({S1, S32, S64, S16, GlobalPtr, 646 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 647 .legalIf(isPointer(0)) 648 .clampScalar(0, S32, S64) 649 .widenScalarToNextPow2(0); 650 651 getActionDefinitionsBuilder(G_FCONSTANT) 652 .legalFor({S32, S64, S16}) 653 .clampScalar(0, S16, S64); 654 655 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 656 .legalIf(isRegisterType(0)) 657 // s1 and s16 are special cases because they have legal operations on 658 // them, but don't really occupy registers in the normal way. 
659 .legalFor({S1, S16}) 660 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 661 .clampScalarOrElt(0, S32, MaxScalar) 662 .widenScalarToNextPow2(0, 32) 663 .clampMaxNumElements(0, S32, 16); 664 665 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 666 667 // If the amount is divergent, we have to do a wave reduction to get the 668 // maximum value, so this is expanded during RegBankSelect. 669 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 670 .legalFor({{PrivatePtr, S32}}); 671 672 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 673 .customIf(typeIsNot(0, PrivatePtr)); 674 675 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 676 677 auto &FPOpActions = getActionDefinitionsBuilder( 678 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 679 .legalFor({S32, S64}); 680 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 681 .customFor({S32, S64}); 682 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 683 .customFor({S32, S64}); 684 685 if (ST.has16BitInsts()) { 686 if (ST.hasVOP3PInsts()) 687 FPOpActions.legalFor({S16, V2S16}); 688 else 689 FPOpActions.legalFor({S16}); 690 691 TrigActions.customFor({S16}); 692 FDIVActions.customFor({S16}); 693 } 694 695 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 696 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 697 698 if (ST.hasVOP3PInsts()) { 699 MinNumMaxNum.customFor(FPTypesPK16) 700 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 701 .clampMaxNumElements(0, S16, 2) 702 .clampScalar(0, S16, S64) 703 .scalarize(0); 704 } else if (ST.has16BitInsts()) { 705 MinNumMaxNum.customFor(FPTypes16) 706 .clampScalar(0, S16, S64) 707 .scalarize(0); 708 } else { 709 MinNumMaxNum.customFor(FPTypesBase) 710 .clampScalar(0, S32, S64) 711 .scalarize(0); 712 } 713 714 if (ST.hasVOP3PInsts()) 715 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 716 717 FPOpActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 TrigActions 722 .scalarize(0) 723 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 724 725 FDIVActions 726 .scalarize(0) 727 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 728 729 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 730 .legalFor(FPTypesPK16) 731 .clampMaxNumElementsStrict(0, S16, 2) 732 .scalarize(0) 733 .clampScalar(0, S16, S64); 734 735 if (ST.has16BitInsts()) { 736 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 737 .legalFor({S32, S64, S16}) 738 .scalarize(0) 739 .clampScalar(0, S16, S64); 740 } else { 741 getActionDefinitionsBuilder(G_FSQRT) 742 .legalFor({S32, S64}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 if (ST.hasFractBug()) { 747 getActionDefinitionsBuilder(G_FFLOOR) 748 .customFor({S64}) 749 .legalFor({S32, S64}) 750 .scalarize(0) 751 .clampScalar(0, S32, S64); 752 } else { 753 getActionDefinitionsBuilder(G_FFLOOR) 754 .legalFor({S32, S64}) 755 .scalarize(0) 756 .clampScalar(0, S32, S64); 757 } 758 } 759 760 getActionDefinitionsBuilder(G_FPTRUNC) 761 .legalFor({{S32, S64}, {S16, S32}}) 762 .scalarize(0) 763 .lower(); 764 765 getActionDefinitionsBuilder(G_FPEXT) 766 .legalFor({{S64, S32}, {S32, S16}}) 767 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 768 .scalarize(0); 769 770 getActionDefinitionsBuilder(G_FSUB) 771 // Use actual fsub instruction 772 .legalFor({S32}) 773 // Must use fadd + fneg 774 .lowerFor({S64, S16, V2S16}) 775 .scalarize(0) 776 .clampScalar(0, S32, S64); 777 778 // Whether this is legal depends on the floating point mode for the function. 
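  // customFor defers that decision to legalizeFMad(), which can inspect the
  // function's floating-point mode when the instruction is actually legalized.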
779 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 780 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 781 FMad.customFor({S32, S16}); 782 else if (ST.hasMadMacF32Insts()) 783 FMad.customFor({S32}); 784 else if (ST.hasMadF16()) 785 FMad.customFor({S16}); 786 FMad.scalarize(0) 787 .lower(); 788 789 auto &FRem = getActionDefinitionsBuilder(G_FREM); 790 if (ST.has16BitInsts()) { 791 FRem.customFor({S16, S32, S64}); 792 } else { 793 FRem.minScalar(0, S32) 794 .customFor({S32, S64}); 795 } 796 FRem.scalarize(0); 797 798 // TODO: Do we need to clamp maximum bitwidth? 799 getActionDefinitionsBuilder(G_TRUNC) 800 .legalIf(isScalar(0)) 801 .legalFor({{V2S16, V2S32}}) 802 .clampMaxNumElements(0, S16, 2) 803 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 804 // situations (like an invalid implicit use), we don't want to infinite loop 805 // in the legalizer. 806 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 807 .alwaysLegal(); 808 809 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 810 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 811 {S32, S1}, {S64, S1}, {S16, S1}}) 812 .scalarize(0) 813 .clampScalar(0, S32, S64) 814 .widenScalarToNextPow2(1, 32); 815 816 // TODO: Split s1->s64 during regbankselect for VALU. 817 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 818 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 819 .lowerIf(typeIs(1, S1)) 820 .customFor({{S32, S64}, {S64, S64}}); 821 if (ST.has16BitInsts()) 822 IToFP.legalFor({{S16, S16}}); 823 IToFP.clampScalar(1, S32, S64) 824 .minScalar(0, S32) 825 .scalarize(0) 826 .widenScalarToNextPow2(1); 827 828 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 829 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 830 .customFor({{S64, S32}, {S64, S64}}) 831 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 832 if (ST.has16BitInsts()) 833 FPToI.legalFor({{S16, S16}}); 834 else 835 FPToI.minScalar(1, S32); 836 837 FPToI.minScalar(0, S32) 838 .widenScalarToNextPow2(0, 32) 839 .scalarize(0) 840 .lower(); 841 842 // Lower roundeven into G_FRINT 843 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 844 .scalarize(0) 845 .lower(); 846 847 if (ST.has16BitInsts()) { 848 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 849 .legalFor({S16, S32, S64}) 850 .clampScalar(0, S16, S64) 851 .scalarize(0); 852 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 853 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 854 .legalFor({S32, S64}) 855 .clampScalar(0, S32, S64) 856 .scalarize(0); 857 } else { 858 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 859 .legalFor({S32}) 860 .customFor({S64}) 861 .clampScalar(0, S32, S64) 862 .scalarize(0); 863 } 864 865 getActionDefinitionsBuilder(G_PTR_ADD) 866 .legalIf(all(isPointer(0), sameSize(0, 1))) 867 .scalarize(0) 868 .scalarSameSizeAs(1, 0); 869 870 getActionDefinitionsBuilder(G_PTRMASK) 871 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 872 .scalarSameSizeAs(1, 0) 873 .scalarize(0); 874 875 auto &CmpBuilder = 876 getActionDefinitionsBuilder(G_ICMP) 877 // The compare output type differs based on the register bank of the output, 878 // so make both s1 and s32 legal. 879 // 880 // Scalar compares producing output in scc will be promoted to s32, as that 881 // is the allocatable register type that will be needed for the copy from 882 // scc. 
This will be promoted during RegBankSelect, and we assume something 883 // before that won't try to use s32 result types. 884 // 885 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 886 // bank. 887 .legalForCartesianProduct( 888 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 889 .legalForCartesianProduct( 890 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 891 if (ST.has16BitInsts()) { 892 CmpBuilder.legalFor({{S1, S16}}); 893 } 894 895 CmpBuilder 896 .widenScalarToNextPow2(1) 897 .clampScalar(1, S32, S64) 898 .scalarize(0) 899 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 900 901 getActionDefinitionsBuilder(G_FCMP) 902 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 903 .widenScalarToNextPow2(1) 904 .clampScalar(1, S32, S64) 905 .scalarize(0); 906 907 // FIXME: fpow has a selection pattern that should move to custom lowering. 908 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 909 if (ST.has16BitInsts()) 910 Exp2Ops.legalFor({S32, S16}); 911 else 912 Exp2Ops.legalFor({S32}); 913 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 914 Exp2Ops.scalarize(0); 915 916 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 917 if (ST.has16BitInsts()) 918 ExpOps.customFor({{S32}, {S16}}); 919 else 920 ExpOps.customFor({S32}); 921 ExpOps.clampScalar(0, MinScalarFPTy, S32) 922 .scalarize(0); 923 924 getActionDefinitionsBuilder(G_FPOWI) 925 .clampScalar(0, MinScalarFPTy, S32) 926 .lower(); 927 928 // The 64-bit versions produce 32-bit results, but only on the SALU. 929 getActionDefinitionsBuilder(G_CTPOP) 930 .legalFor({{S32, S32}, {S32, S64}}) 931 .clampScalar(0, S32, S32) 932 .clampScalar(1, S32, S64) 933 .scalarize(0) 934 .widenScalarToNextPow2(0, 32) 935 .widenScalarToNextPow2(1, 32); 936 937 // The hardware instructions return a different result on 0 than the generic 938 // instructions expect. The hardware produces -1, but these produce the 939 // bitwidth. 940 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 941 .scalarize(0) 942 .clampScalar(0, S32, S32) 943 .clampScalar(1, S32, S64) 944 .widenScalarToNextPow2(0, 32) 945 .widenScalarToNextPow2(1, 32) 946 .custom(); 947 948 // The 64-bit versions produce 32-bit results, but only on the SALU. 949 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 950 .legalFor({{S32, S32}, {S32, S64}}) 951 .clampScalar(0, S32, S32) 952 .clampScalar(1, S32, S64) 953 .scalarize(0) 954 .widenScalarToNextPow2(0, 32) 955 .widenScalarToNextPow2(1, 32); 956 957 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 958 // RegBankSelect. 959 getActionDefinitionsBuilder(G_BITREVERSE) 960 .legalFor({S32, S64}) 961 .clampScalar(0, S32, S64) 962 .scalarize(0) 963 .widenScalarToNextPow2(0); 964 965 if (ST.has16BitInsts()) { 966 getActionDefinitionsBuilder(G_BSWAP) 967 .legalFor({S16, S32, V2S16}) 968 .clampMaxNumElementsStrict(0, S16, 2) 969 // FIXME: Fixing non-power-of-2 before clamp is workaround for 970 // narrowScalar limitation. 
971 .widenScalarToNextPow2(0) 972 .clampScalar(0, S16, S32) 973 .scalarize(0); 974 975 if (ST.hasVOP3PInsts()) { 976 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 977 .legalFor({S32, S16, V2S16}) 978 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 979 .clampMaxNumElements(0, S16, 2) 980 .minScalar(0, S16) 981 .widenScalarToNextPow2(0) 982 .scalarize(0) 983 .lower(); 984 } else { 985 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 986 .legalFor({S32, S16}) 987 .widenScalarToNextPow2(0) 988 .minScalar(0, S16) 989 .scalarize(0) 990 .lower(); 991 } 992 } else { 993 // TODO: Should have same legality without v_perm_b32 994 getActionDefinitionsBuilder(G_BSWAP) 995 .legalFor({S32}) 996 .lowerIf(scalarNarrowerThan(0, 32)) 997 // FIXME: Fixing non-power-of-2 before clamp is workaround for 998 // narrowScalar limitation. 999 .widenScalarToNextPow2(0) 1000 .maxScalar(0, S32) 1001 .scalarize(0) 1002 .lower(); 1003 1004 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1005 .legalFor({S32}) 1006 .minScalar(0, S32) 1007 .widenScalarToNextPow2(0) 1008 .scalarize(0) 1009 .lower(); 1010 } 1011 1012 getActionDefinitionsBuilder(G_INTTOPTR) 1013 // List the common cases 1014 .legalForCartesianProduct(AddrSpaces64, {S64}) 1015 .legalForCartesianProduct(AddrSpaces32, {S32}) 1016 .scalarize(0) 1017 // Accept any address space as long as the size matches 1018 .legalIf(sameSize(0, 1)) 1019 .widenScalarIf(smallerThan(1, 0), 1020 [](const LegalityQuery &Query) { 1021 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1022 }) 1023 .narrowScalarIf(largerThan(1, 0), 1024 [](const LegalityQuery &Query) { 1025 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1026 }); 1027 1028 getActionDefinitionsBuilder(G_PTRTOINT) 1029 // List the common cases 1030 .legalForCartesianProduct(AddrSpaces64, {S64}) 1031 .legalForCartesianProduct(AddrSpaces32, {S32}) 1032 .scalarize(0) 1033 // Accept any address space as long as the size matches 1034 .legalIf(sameSize(0, 1)) 1035 .widenScalarIf(smallerThan(0, 1), 1036 [](const LegalityQuery &Query) { 1037 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1038 }) 1039 .narrowScalarIf( 1040 largerThan(0, 1), 1041 [](const LegalityQuery &Query) { 1042 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1043 }); 1044 1045 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1046 .scalarize(0) 1047 .custom(); 1048 1049 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1050 bool IsLoad) -> bool { 1051 const LLT DstTy = Query.Types[0]; 1052 1053 // Split vector extloads. 1054 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1055 1056 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1057 return true; 1058 1059 const LLT PtrTy = Query.Types[1]; 1060 unsigned AS = PtrTy.getAddressSpace(); 1061 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1062 return true; 1063 1064 // Catch weird sized loads that don't evenly divide into the access sizes 1065 // TODO: May be able to widen depending on alignment etc. 1066 unsigned NumRegs = (MemSize + 31) / 32; 1067 if (NumRegs == 3) { 1068 if (!ST.hasDwordx3LoadStores()) 1069 return true; 1070 } else { 1071 // If the alignment allows, these should have been widened. 1072 if (!isPowerOf2_32(NumRegs)) 1073 return true; 1074 } 1075 1076 return false; 1077 }; 1078 1079 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 32; 1080 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1081 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1082 1083 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1084 // LDS 1085 // TODO: Unsupported flat for SI. 1086 1087 for (unsigned Op : {G_LOAD, G_STORE}) { 1088 const bool IsStore = Op == G_STORE; 1089 1090 auto &Actions = getActionDefinitionsBuilder(Op); 1091 // Explicitly list some common cases. 1092 // TODO: Does this help compile time at all? 1093 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1094 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1095 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1096 {S64, GlobalPtr, S64, GlobalAlign32}, 1097 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1098 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1099 {S32, GlobalPtr, S8, GlobalAlign8}, 1100 {S32, GlobalPtr, S16, GlobalAlign16}, 1101 1102 {S32, LocalPtr, S32, 32}, 1103 {S64, LocalPtr, S64, 32}, 1104 {V2S32, LocalPtr, V2S32, 32}, 1105 {S32, LocalPtr, S8, 8}, 1106 {S32, LocalPtr, S16, 16}, 1107 {V2S16, LocalPtr, S32, 32}, 1108 1109 {S32, PrivatePtr, S32, 32}, 1110 {S32, PrivatePtr, S8, 8}, 1111 {S32, PrivatePtr, S16, 16}, 1112 {V2S16, PrivatePtr, S32, 32}, 1113 1114 {S32, ConstantPtr, S32, GlobalAlign32}, 1115 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1116 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1117 {S64, ConstantPtr, S64, GlobalAlign32}, 1118 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1119 Actions.legalIf( 1120 [=](const LegalityQuery &Query) -> bool { 1121 return isLoadStoreLegal(ST, Query); 1122 }); 1123 1124 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1125 // 64-bits. 1126 // 1127 // TODO: Should generalize bitcast action into coerce, which will also cover 1128 // inserting addrspacecasts. 1129 Actions.customIf(typeIs(1, Constant32Ptr)); 1130 1131 // Turn any illegal element vectors into something easier to deal 1132 // with. These will ultimately produce 32-bit scalar shifts to extract the 1133 // parts anyway. 1134 // 1135 // For odd 16-bit element vectors, prefer to split those into pieces with 1136 // 16-bit vector parts. 1137 Actions.bitcastIf( 1138 [=](const LegalityQuery &Query) -> bool { 1139 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1140 Query.MMODescrs[0].MemoryTy); 1141 }, bitcastToRegisterType(0)); 1142 1143 if (!IsStore) { 1144 // Widen suitably aligned loads by loading extra bytes. The standard 1145 // legalization actions can't properly express widening memory operands. 1146 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1147 return shouldWidenLoad(ST, Query, G_LOAD); 1148 }); 1149 } 1150 1151 // FIXME: load/store narrowing should be moved to lower action 1152 Actions 1153 .narrowScalarIf( 1154 [=](const LegalityQuery &Query) -> bool { 1155 return !Query.Types[0].isVector() && 1156 needToSplitMemOp(Query, Op == G_LOAD); 1157 }, 1158 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1159 const LLT DstTy = Query.Types[0]; 1160 const LLT PtrTy = Query.Types[1]; 1161 1162 const unsigned DstSize = DstTy.getSizeInBits(); 1163 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1164 1165 // Split extloads. 
1166 if (DstSize > MemSize) 1167 return std::make_pair(0, LLT::scalar(MemSize)); 1168 1169 unsigned MaxSize = maxSizeForAddrSpace(ST, 1170 PtrTy.getAddressSpace(), 1171 Op == G_LOAD); 1172 if (MemSize > MaxSize) 1173 return std::make_pair(0, LLT::scalar(MaxSize)); 1174 1175 unsigned Align = Query.MMODescrs[0].AlignInBits; 1176 return std::make_pair(0, LLT::scalar(Align)); 1177 }) 1178 .fewerElementsIf( 1179 [=](const LegalityQuery &Query) -> bool { 1180 return Query.Types[0].isVector() && 1181 needToSplitMemOp(Query, Op == G_LOAD); 1182 }, 1183 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1184 const LLT DstTy = Query.Types[0]; 1185 const LLT PtrTy = Query.Types[1]; 1186 1187 LLT EltTy = DstTy.getElementType(); 1188 unsigned MaxSize = maxSizeForAddrSpace(ST, 1189 PtrTy.getAddressSpace(), 1190 Op == G_LOAD); 1191 1192 // FIXME: Handle widened to power of 2 results better. This ends 1193 // up scalarizing. 1194 // FIXME: 3 element stores scalarized on SI 1195 1196 // Split if it's too large for the address space. 1197 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1198 if (MemSize > MaxSize) { 1199 unsigned NumElts = DstTy.getNumElements(); 1200 unsigned EltSize = EltTy.getSizeInBits(); 1201 1202 if (MaxSize % EltSize == 0) { 1203 return std::make_pair( 1204 0, LLT::scalarOrVector( 1205 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1206 } 1207 1208 unsigned NumPieces = MemSize / MaxSize; 1209 1210 // FIXME: Refine when odd breakdowns handled 1211 // The scalars will need to be re-legalized. 1212 if (NumPieces == 1 || NumPieces >= NumElts || 1213 NumElts % NumPieces != 0) 1214 return std::make_pair(0, EltTy); 1215 1216 return std::make_pair( 1217 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1218 } 1219 1220 // FIXME: We could probably handle weird extending loads better. 1221 if (DstTy.getSizeInBits() > MemSize) 1222 return std::make_pair(0, EltTy); 1223 1224 unsigned EltSize = EltTy.getSizeInBits(); 1225 unsigned DstSize = DstTy.getSizeInBits(); 1226 if (!isPowerOf2_32(DstSize)) { 1227 // We're probably decomposing an odd sized store. Try to split 1228 // to the widest type. TODO: Account for alignment. As-is it 1229 // should be OK, since the new parts will be further legalized. 1230 unsigned FloorSize = PowerOf2Floor(DstSize); 1231 return std::make_pair( 1232 0, LLT::scalarOrVector( 1233 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1234 } 1235 1236 // May need relegalization for the scalars. 1237 return std::make_pair(0, EltTy); 1238 }) 1239 .minScalar(0, S32) 1240 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1241 .widenScalarToNextPow2(0) 1242 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1243 .lower(); 1244 } 1245 1246 // FIXME: Unaligned accesses not lowered. 1247 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1248 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1249 {S32, GlobalPtr, S16, 2 * 8}, 1250 {S32, LocalPtr, S8, 8}, 1251 {S32, LocalPtr, S16, 16}, 1252 {S32, PrivatePtr, S8, 8}, 1253 {S32, PrivatePtr, S16, 16}, 1254 {S32, ConstantPtr, S8, 8}, 1255 {S32, ConstantPtr, S16, 2 * 8}}) 1256 .legalIf( 1257 [=](const LegalityQuery &Query) -> bool { 1258 return isLoadStoreLegal(ST, Query); 1259 }); 1260 1261 if (ST.hasFlatAddressSpace()) { 1262 ExtLoads.legalForTypesWithMemDesc( 1263 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1264 } 1265 1266 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1267 // 64-bits. 
1268 // 1269 // TODO: Should generalize bitcast action into coerce, which will also cover 1270 // inserting addrspacecasts. 1271 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1272 1273 ExtLoads.clampScalar(0, S32, S32) 1274 .widenScalarToNextPow2(0) 1275 .lower(); 1276 1277 auto &Atomics = getActionDefinitionsBuilder( 1278 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1279 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1280 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1281 G_ATOMICRMW_UMIN}) 1282 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1283 {S64, GlobalPtr}, {S64, LocalPtr}, 1284 {S32, RegionPtr}, {S64, RegionPtr}}); 1285 if (ST.hasFlatAddressSpace()) { 1286 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1287 } 1288 1289 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1290 if (ST.hasLDSFPAtomicAdd()) { 1291 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1292 if (ST.hasGFX90AInsts()) 1293 Atomic.legalFor({{S64, LocalPtr}}); 1294 } 1295 if (ST.hasAtomicFaddInsts()) 1296 Atomic.legalFor({{S32, GlobalPtr}}); 1297 1298 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1299 // demarshalling 1300 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1301 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1302 {S32, FlatPtr}, {S64, FlatPtr}}) 1303 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1304 {S32, RegionPtr}, {S64, RegionPtr}}); 1305 // TODO: Pointer types, any 32-bit or 64-bit vector 1306 1307 // Condition should be s32 for scalar, s1 for vector. 1308 getActionDefinitionsBuilder(G_SELECT) 1309 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1310 LocalPtr, FlatPtr, PrivatePtr, 1311 LLT::fixed_vector(2, LocalPtr), 1312 LLT::fixed_vector(2, PrivatePtr)}, 1313 {S1, S32}) 1314 .clampScalar(0, S16, S64) 1315 .scalarize(1) 1316 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1317 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1318 .clampMaxNumElements(0, S32, 2) 1319 .clampMaxNumElements(0, LocalPtr, 2) 1320 .clampMaxNumElements(0, PrivatePtr, 2) 1321 .scalarize(0) 1322 .widenScalarToNextPow2(0) 1323 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1324 1325 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1326 // be more flexible with the shift amount type. 1327 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1328 .legalFor({{S32, S32}, {S64, S32}}); 1329 if (ST.has16BitInsts()) { 1330 if (ST.hasVOP3PInsts()) { 1331 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1332 .clampMaxNumElements(0, S16, 2); 1333 } else 1334 Shifts.legalFor({{S16, S16}}); 1335 1336 // TODO: Support 16-bit shift amounts for all types 1337 Shifts.widenScalarIf( 1338 [=](const LegalityQuery &Query) { 1339 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1340 // 32-bit amount. 1341 const LLT ValTy = Query.Types[0]; 1342 const LLT AmountTy = Query.Types[1]; 1343 return ValTy.getSizeInBits() <= 16 && 1344 AmountTy.getSizeInBits() < 16; 1345 }, changeTo(1, S16)); 1346 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1347 Shifts.clampScalar(1, S32, S32); 1348 Shifts.clampScalar(0, S16, S64); 1349 Shifts.widenScalarToNextPow2(0, 16); 1350 1351 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1352 .minScalar(0, S16) 1353 .scalarize(0) 1354 .lower(); 1355 } else { 1356 // Make sure we legalize the shift amount type first, as the general 1357 // expansion for the shifted type will produce much worse code if it hasn't 1358 // been truncated already. 
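    // e.g. for an s64 value shifted by an s64 amount, clamp the amount to s32
    // first so the 64-bit shift expansion only ever sees a 32-bit amount.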
1359 Shifts.clampScalar(1, S32, S32); 1360 Shifts.clampScalar(0, S32, S64); 1361 Shifts.widenScalarToNextPow2(0, 32); 1362 1363 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1364 .minScalar(0, S32) 1365 .scalarize(0) 1366 .lower(); 1367 } 1368 Shifts.scalarize(0); 1369 1370 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1371 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1372 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1373 unsigned IdxTypeIdx = 2; 1374 1375 getActionDefinitionsBuilder(Op) 1376 .customIf([=](const LegalityQuery &Query) { 1377 const LLT EltTy = Query.Types[EltTypeIdx]; 1378 const LLT VecTy = Query.Types[VecTypeIdx]; 1379 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1380 const unsigned EltSize = EltTy.getSizeInBits(); 1381 return (EltSize == 32 || EltSize == 64) && 1382 VecTy.getSizeInBits() % 32 == 0 && 1383 VecTy.getSizeInBits() <= MaxRegisterSize && 1384 IdxTy.getSizeInBits() == 32; 1385 }) 1386 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1387 bitcastToVectorElement32(VecTypeIdx)) 1388 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1389 .bitcastIf( 1390 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1391 [=](const LegalityQuery &Query) { 1392 // For > 64-bit element types, try to turn this into a 64-bit 1393 // element vector since we may be able to do better indexing 1394 // if this is scalar. If not, fall back to 32. 1395 const LLT EltTy = Query.Types[EltTypeIdx]; 1396 const LLT VecTy = Query.Types[VecTypeIdx]; 1397 const unsigned DstEltSize = EltTy.getSizeInBits(); 1398 const unsigned VecSize = VecTy.getSizeInBits(); 1399 1400 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1401 return std::make_pair( 1402 VecTypeIdx, 1403 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1404 }) 1405 .clampScalar(EltTypeIdx, S32, S64) 1406 .clampScalar(VecTypeIdx, S32, S64) 1407 .clampScalar(IdxTypeIdx, S32, S32) 1408 .clampMaxNumElements(VecTypeIdx, S32, 32) 1409 // TODO: Clamp elements for 64-bit vectors? 1410 // It should only be necessary with variable indexes. 1411 // As a last resort, lower to the stack 1412 .lower(); 1413 } 1414 1415 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1416 .unsupportedIf([=](const LegalityQuery &Query) { 1417 const LLT &EltTy = Query.Types[1].getElementType(); 1418 return Query.Types[0] != EltTy; 1419 }); 1420 1421 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1422 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1423 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1424 1425 // FIXME: Doesn't handle extract of illegal sizes. 1426 getActionDefinitionsBuilder(Op) 1427 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1428 .lowerIf([=](const LegalityQuery &Query) { 1429 // Sub-vector(or single element) insert and extract. 1430 // TODO: verify immediate offset here since lower only works with 1431 // whole elements. 1432 const LLT BigTy = Query.Types[BigTyIdx]; 1433 return BigTy.isVector(); 1434 }) 1435 // FIXME: Multiples of 16 should not be legal. 
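      // (e.g. this currently accepts extracting an s48 from an s96, since 48 is
      // a multiple of 16 even though it is not a multiple of 32.)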
1436       .legalIf([=](const LegalityQuery &Query) {
1437         const LLT BigTy = Query.Types[BigTyIdx];
1438         const LLT LitTy = Query.Types[LitTyIdx];
1439         return (BigTy.getSizeInBits() % 32 == 0) &&
1440                (LitTy.getSizeInBits() % 16 == 0);
1441       })
1442       .widenScalarIf(
1443         [=](const LegalityQuery &Query) {
1444           const LLT BigTy = Query.Types[BigTyIdx];
1445           return (BigTy.getScalarSizeInBits() < 16);
1446         },
1447         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448       .widenScalarIf(
1449         [=](const LegalityQuery &Query) {
1450           const LLT LitTy = Query.Types[LitTyIdx];
1451           return (LitTy.getScalarSizeInBits() < 16);
1452         },
1453         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455       .widenScalarToNextPow2(BigTyIdx, 32);
1456
1457   }
1458
1459   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460     .legalForCartesianProduct(AllS32Vectors, {S32})
1461     .legalForCartesianProduct(AllS64Vectors, {S64})
1462     .clampNumElements(0, V16S32, V32S32)
1463     .clampNumElements(0, V2S64, V16S64)
1464     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465
1466   if (ST.hasScalarPackInsts()) {
1467     BuildVector
1468       // FIXME: Should probably widen s1 vectors straight to s32
1469       .minScalarOrElt(0, S16)
1470       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471       .minScalar(1, S32);
1472
1473     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474       .legalFor({V2S16, S32})
1475       .lower();
1476     BuildVector.minScalarOrElt(0, S32);
1477   } else {
1478     BuildVector.customFor({V2S16, S16});
1479     BuildVector.minScalarOrElt(0, S32);
1480
1481     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482       .customFor({V2S16, S32})
1483       .lower();
1484   }
1485
1486   BuildVector.legalIf(isRegisterType(0));
1487
1488   // FIXME: Clamp maximum size
1489   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491     .clampMaxNumElements(0, S32, 32)
1492     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493     .clampMaxNumElements(0, S16, 64);
1494
1495   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496   // pre-legalize.
1497   if (ST.hasVOP3PInsts()) {
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499       .customFor({V2S16, V2S16})
1500       .lower();
1501   } else
1502     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503
1504   // Merge/Unmerge
1505   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508
1509     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510       const LLT Ty = Query.Types[TypeIdx];
1511       if (Ty.isVector()) {
1512         const LLT &EltTy = Ty.getElementType();
1513         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514           return true;
1515         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516           return true;
1517       }
1518       return false;
1519     };
1520
1521     auto &Builder = getActionDefinitionsBuilder(Op)
1522       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523       .lowerFor({{S16, V2S16}})
1524       .lowerIf([=](const LegalityQuery &Query) {
1525         const LLT BigTy = Query.Types[BigTyIdx];
1526         return BigTy.getSizeInBits() == 32;
1527       })
1528       // Try to widen to s16 first for small types.
1529 // TODO: Only do this on targets with legal s16 shifts 1530 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1531 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1532 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1533 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1534 elementTypeIs(1, S16)), 1535 changeTo(1, V2S16)) 1536 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1537 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1538 // valid. 1539 .clampScalar(LitTyIdx, S32, S512) 1540 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1541 // Break up vectors with weird elements into scalars 1542 .fewerElementsIf( 1543 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1544 scalarize(0)) 1545 .fewerElementsIf( 1546 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1547 scalarize(1)) 1548 .clampScalar(BigTyIdx, S32, MaxScalar); 1549 1550 if (Op == G_MERGE_VALUES) { 1551 Builder.widenScalarIf( 1552 // TODO: Use 16-bit shifts if legal for 8-bit values? 1553 [=](const LegalityQuery &Query) { 1554 const LLT Ty = Query.Types[LitTyIdx]; 1555 return Ty.getSizeInBits() < 32; 1556 }, 1557 changeTo(LitTyIdx, S32)); 1558 } 1559 1560 Builder.widenScalarIf( 1561 [=](const LegalityQuery &Query) { 1562 const LLT Ty = Query.Types[BigTyIdx]; 1563 return !isPowerOf2_32(Ty.getSizeInBits()) && 1564 Ty.getSizeInBits() % 16 != 0; 1565 }, 1566 [=](const LegalityQuery &Query) { 1567 // Pick the next power of 2, or a multiple of 64 over 128. 1568 // Whichever is smaller. 1569 const LLT &Ty = Query.Types[BigTyIdx]; 1570 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1571 if (NewSizeInBits >= 256) { 1572 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1573 if (RoundedTo < NewSizeInBits) 1574 NewSizeInBits = RoundedTo; 1575 } 1576 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1577 }) 1578 // Any vectors left are the wrong size. Scalarize them. 1579 .scalarize(0) 1580 .scalarize(1); 1581 } 1582 1583 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1584 // RegBankSelect. 1585 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1586 .legalFor({{S32}, {S64}}); 1587 1588 if (ST.hasVOP3PInsts()) { 1589 SextInReg.lowerFor({{V2S16}}) 1590 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1591 // get more vector shift opportunities, since we'll get those when 1592 // expanded. 1593 .clampMaxNumElementsStrict(0, S16, 2); 1594 } else if (ST.has16BitInsts()) { 1595 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1596 } else { 1597 // Prefer to promote to s32 before lowering if we don't have 16-bit 1598 // shifts. This avoid a lot of intermediate truncate and extend operations. 1599 SextInReg.lowerFor({{S32}, {S64}}); 1600 } 1601 1602 SextInReg 1603 .scalarize(0) 1604 .clampScalar(0, S32, S64) 1605 .lower(); 1606 1607 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1608 .scalarize(0) 1609 .lower(); 1610 1611 // TODO: Only Try to form v2s16 with legal packed instructions. 
1612 getActionDefinitionsBuilder(G_FSHR) 1613 .legalFor({{S32, S32}}) 1614 .lowerFor({{V2S16, V2S16}}) 1615 .clampMaxNumElementsStrict(0, S16, 2) 1616 .scalarize(0) 1617 .lower(); 1618 1619 if (ST.hasVOP3PInsts()) { 1620 getActionDefinitionsBuilder(G_FSHL) 1621 .lowerFor({{V2S16, V2S16}}) 1622 .clampMaxNumElementsStrict(0, S16, 2) 1623 .scalarize(0) 1624 .lower(); 1625 } else { 1626 getActionDefinitionsBuilder(G_FSHL) 1627 .scalarize(0) 1628 .lower(); 1629 } 1630 1631 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1632 .legalFor({S64}); 1633 1634 getActionDefinitionsBuilder(G_FENCE) 1635 .alwaysLegal(); 1636 1637 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1638 .scalarize(0) 1639 .minScalar(0, S32) 1640 .lower(); 1641 1642 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1643 .legalFor({{S32, S32}, {S64, S32}}) 1644 .clampScalar(1, S32, S32) 1645 .clampScalar(0, S32, S64) 1646 .widenScalarToNextPow2(0) 1647 .scalarize(0); 1648 1649 getActionDefinitionsBuilder({ 1650 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1651 G_FCOPYSIGN, 1652 1653 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1654 G_ATOMICRMW_NAND, 1655 G_ATOMICRMW_FSUB, 1656 G_READ_REGISTER, 1657 G_WRITE_REGISTER, 1658 1659 G_SADDO, G_SSUBO, 1660 1661 // TODO: Implement 1662 G_FMINIMUM, G_FMAXIMUM}).lower(); 1663 1664 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1665 .lower(); 1666 1667 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1668 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1669 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1670 .unsupported(); 1671 1672 getLegacyLegalizerInfo().computeTables(); 1673 verify(*ST.getInstrInfo()); 1674 } 1675 1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1677 MachineInstr &MI) const { 1678 MachineIRBuilder &B = Helper.MIRBuilder; 1679 MachineRegisterInfo &MRI = *B.getMRI(); 1680 1681 switch (MI.getOpcode()) { 1682 case TargetOpcode::G_ADDRSPACE_CAST: 1683 return legalizeAddrSpaceCast(MI, MRI, B); 1684 case TargetOpcode::G_FRINT: 1685 return legalizeFrint(MI, MRI, B); 1686 case TargetOpcode::G_FCEIL: 1687 return legalizeFceil(MI, MRI, B); 1688 case TargetOpcode::G_FREM: 1689 return legalizeFrem(MI, MRI, B); 1690 case TargetOpcode::G_INTRINSIC_TRUNC: 1691 return legalizeIntrinsicTrunc(MI, MRI, B); 1692 case TargetOpcode::G_SITOFP: 1693 return legalizeITOFP(MI, MRI, B, true); 1694 case TargetOpcode::G_UITOFP: 1695 return legalizeITOFP(MI, MRI, B, false); 1696 case TargetOpcode::G_FPTOSI: 1697 return legalizeFPTOI(MI, MRI, B, true); 1698 case TargetOpcode::G_FPTOUI: 1699 return legalizeFPTOI(MI, MRI, B, false); 1700 case TargetOpcode::G_FMINNUM: 1701 case TargetOpcode::G_FMAXNUM: 1702 case TargetOpcode::G_FMINNUM_IEEE: 1703 case TargetOpcode::G_FMAXNUM_IEEE: 1704 return legalizeMinNumMaxNum(Helper, MI); 1705 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1706 return legalizeExtractVectorElt(MI, MRI, B); 1707 case TargetOpcode::G_INSERT_VECTOR_ELT: 1708 return legalizeInsertVectorElt(MI, MRI, B); 1709 case TargetOpcode::G_SHUFFLE_VECTOR: 1710 return legalizeShuffleVector(MI, MRI, B); 1711 case TargetOpcode::G_FSIN: 1712 case TargetOpcode::G_FCOS: 1713 return legalizeSinCos(MI, MRI, B); 1714 case TargetOpcode::G_GLOBAL_VALUE: 1715 return legalizeGlobalValue(MI, MRI, B); 1716 case TargetOpcode::G_LOAD: 1717 case TargetOpcode::G_SEXTLOAD: 1718 case TargetOpcode::G_ZEXTLOAD: 1719 return legalizeLoad(Helper, MI); 1720 case TargetOpcode::G_FMAD: 1721 return legalizeFMad(MI, MRI, B); 1722 case TargetOpcode::G_FDIV: 1723 return legalizeFDIV(MI, 
MRI, B); 1724 case TargetOpcode::G_UDIV: 1725 case TargetOpcode::G_UREM: 1726 case TargetOpcode::G_UDIVREM: 1727 return legalizeUnsignedDIV_REM(MI, MRI, B); 1728 case TargetOpcode::G_SDIV: 1729 case TargetOpcode::G_SREM: 1730 case TargetOpcode::G_SDIVREM: 1731 return legalizeSignedDIV_REM(MI, MRI, B); 1732 case TargetOpcode::G_ATOMIC_CMPXCHG: 1733 return legalizeAtomicCmpXChg(MI, MRI, B); 1734 case TargetOpcode::G_FLOG: 1735 return legalizeFlog(MI, B, numbers::ln2f); 1736 case TargetOpcode::G_FLOG10: 1737 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1738 case TargetOpcode::G_FEXP: 1739 return legalizeFExp(MI, B); 1740 case TargetOpcode::G_FPOW: 1741 return legalizeFPow(MI, B); 1742 case TargetOpcode::G_FFLOOR: 1743 return legalizeFFloor(MI, MRI, B); 1744 case TargetOpcode::G_BUILD_VECTOR: 1745 return legalizeBuildVector(MI, MRI, B); 1746 case TargetOpcode::G_CTLZ: 1747 case TargetOpcode::G_CTTZ: 1748 return legalizeCTLZ_CTTZ(MI, MRI, B); 1749 default: 1750 return false; 1751 } 1752 1753 llvm_unreachable("expected switch to return"); 1754 } 1755 1756 Register AMDGPULegalizerInfo::getSegmentAperture( 1757 unsigned AS, 1758 MachineRegisterInfo &MRI, 1759 MachineIRBuilder &B) const { 1760 MachineFunction &MF = B.getMF(); 1761 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1762 const LLT S32 = LLT::scalar(32); 1763 1764 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1765 1766 if (ST.hasApertureRegs()) { 1767 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1768 // getreg. 1769 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1770 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1771 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1772 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1773 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1774 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1775 unsigned Encoding = 1776 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1777 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1778 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1779 1780 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1781 1782 B.buildInstr(AMDGPU::S_GETREG_B32) 1783 .addDef(GetReg) 1784 .addImm(Encoding); 1785 MRI.setType(GetReg, S32); 1786 1787 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1788 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1789 } 1790 1791 Register QueuePtr = MRI.createGenericVirtualRegister( 1792 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1793 1794 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1795 return Register(); 1796 1797 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1798 // private_segment_aperture_base_hi. 1799 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1800 1801 // TODO: can we be smarter about machine pointer info? 1802 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1803 MachineMemOperand *MMO = MF.getMachineMemOperand( 1804 PtrInfo, 1805 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1806 MachineMemOperand::MOInvariant, 1807 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1808 1809 Register LoadAddr; 1810 1811 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1812 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1813 } 1814 1815 /// Return true if the value is a known valid address, such that a null check is 1816 /// not necessary. 
1817 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1818 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1819 MachineInstr *Def = MRI.getVRegDef(Val); 1820 switch (Def->getOpcode()) { 1821 case AMDGPU::G_FRAME_INDEX: 1822 case AMDGPU::G_GLOBAL_VALUE: 1823 case AMDGPU::G_BLOCK_ADDR: 1824 return true; 1825 case AMDGPU::G_CONSTANT: { 1826 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1827 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1828 } 1829 default: 1830 return false; 1831 } 1832 1833 return false; 1834 } 1835 1836 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1837 MachineInstr &MI, MachineRegisterInfo &MRI, 1838 MachineIRBuilder &B) const { 1839 MachineFunction &MF = B.getMF(); 1840 1841 const LLT S32 = LLT::scalar(32); 1842 Register Dst = MI.getOperand(0).getReg(); 1843 Register Src = MI.getOperand(1).getReg(); 1844 1845 LLT DstTy = MRI.getType(Dst); 1846 LLT SrcTy = MRI.getType(Src); 1847 unsigned DestAS = DstTy.getAddressSpace(); 1848 unsigned SrcAS = SrcTy.getAddressSpace(); 1849 1850 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1851 // vector element. 1852 assert(!DstTy.isVector()); 1853 1854 const AMDGPUTargetMachine &TM 1855 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1856 1857 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1858 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1859 return true; 1860 } 1861 1862 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1863 // Truncate. 1864 B.buildExtract(Dst, Src, 0); 1865 MI.eraseFromParent(); 1866 return true; 1867 } 1868 1869 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1870 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1871 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1872 1873 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1874 // another. Merge operands are required to be the same type, but creating an 1875 // extra ptrtoint would be kind of pointless. 1876 auto HighAddr = B.buildConstant( 1877 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1878 B.buildMerge(Dst, {Src, HighAddr}); 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1884 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1885 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1886 1887 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1888 // Extract low 32-bits of the pointer. 1889 B.buildExtract(Dst, Src, 0); 1890 MI.eraseFromParent(); 1891 return true; 1892 } 1893 1894 unsigned NullVal = TM.getNullPointerValue(DestAS); 1895 1896 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1897 auto FlatNull = B.buildConstant(SrcTy, 0); 1898 1899 // Extract low 32-bits of the pointer. 1900 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1901 1902 auto CmpRes = 1903 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1904 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1905 1906 MI.eraseFromParent(); 1907 return true; 1908 } 1909 1910 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1911 return false; 1912 1913 if (!ST.hasFlatAddressSpace()) 1914 return false; 1915 1916 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1917 if (!ApertureReg.isValid()) 1918 return false; 1919 1920 // Coerce the type of the low half of the result so we can use merge_values. 
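  // The aperture register supplies the high 32 bits of the flat address, so
  // the result is effectively built as {32-bit segment offset, aperture base}.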
1921 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1922 1923 // TODO: Should we allow mismatched types but matching sizes in merges to 1924 // avoid the ptrtoint? 1925 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1926 1927 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1928 B.buildCopy(Dst, BuildPtr); 1929 MI.eraseFromParent(); 1930 return true; 1931 } 1932 1933 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1934 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1935 1936 auto CmpRes = 1937 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1938 1939 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1940 1941 MI.eraseFromParent(); 1942 return true; 1943 } 1944 1945 bool AMDGPULegalizerInfo::legalizeFrint( 1946 MachineInstr &MI, MachineRegisterInfo &MRI, 1947 MachineIRBuilder &B) const { 1948 Register Src = MI.getOperand(1).getReg(); 1949 LLT Ty = MRI.getType(Src); 1950 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1951 1952 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1953 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1954 1955 auto C1 = B.buildFConstant(Ty, C1Val); 1956 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1957 1958 // TODO: Should this propagate fast-math-flags? 1959 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1960 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1961 1962 auto C2 = B.buildFConstant(Ty, C2Val); 1963 auto Fabs = B.buildFAbs(Ty, Src); 1964 1965 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1966 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1967 MI.eraseFromParent(); 1968 return true; 1969 } 1970 1971 bool AMDGPULegalizerInfo::legalizeFceil( 1972 MachineInstr &MI, MachineRegisterInfo &MRI, 1973 MachineIRBuilder &B) const { 1974 1975 const LLT S1 = LLT::scalar(1); 1976 const LLT S64 = LLT::scalar(64); 1977 1978 Register Src = MI.getOperand(1).getReg(); 1979 assert(MRI.getType(Src) == S64); 1980 1981 // result = trunc(src) 1982 // if (src > 0.0 && src != result) 1983 // result += 1.0 1984 1985 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1986 1987 const auto Zero = B.buildFConstant(S64, 0.0); 1988 const auto One = B.buildFConstant(S64, 1.0); 1989 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1990 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1991 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1992 auto Add = B.buildSelect(S64, And, One, Zero); 1993 1994 // TODO: Should this propagate fast-math-flags? 
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ?
B.buildSITOFP(S64, Unmerge.getReg(1)) 2095 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2096 2097 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2098 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2099 .addUse(CvtHi.getReg(0)) 2100 .addUse(ThirtyTwo.getReg(0)); 2101 2102 // TODO: Should this propagate fast-math-flags? 2103 B.buildFAdd(Dst, LdExp, CvtLo); 2104 MI.eraseFromParent(); 2105 return true; 2106 } 2107 2108 assert(MRI.getType(Dst) == S32); 2109 2110 auto One = B.buildConstant(S32, 1); 2111 2112 MachineInstrBuilder ShAmt; 2113 if (Signed) { 2114 auto ThirtyOne = B.buildConstant(S32, 31); 2115 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2116 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2117 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2118 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2119 /*HasSideEffects=*/false) 2120 .addUse(Unmerge.getReg(1)); 2121 auto LS2 = B.buildSub(S32, LS, One); 2122 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2123 } else 2124 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2125 auto Norm = B.buildShl(S64, Src, ShAmt); 2126 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2127 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2128 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2129 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2130 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2131 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2132 /*HasSideEffects=*/false) 2133 .addUse(FVal.getReg(0)) 2134 .addUse(Scale.getReg(0)); 2135 MI.eraseFromParent(); 2136 return true; 2137 } 2138 2139 // TODO: Copied from DAG implementation. Verify logic and document how this 2140 // actually works. 2141 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2142 MachineRegisterInfo &MRI, 2143 MachineIRBuilder &B, 2144 bool Signed) const { 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src = MI.getOperand(1).getReg(); 2148 2149 const LLT S64 = LLT::scalar(64); 2150 const LLT S32 = LLT::scalar(32); 2151 2152 const LLT SrcLT = MRI.getType(Src); 2153 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2154 2155 unsigned Flags = MI.getFlags(); 2156 2157 // The basic idea of converting a floating point number into a pair of 32-bit 2158 // integers is illustrated as follows: 2159 // 2160 // tf := trunc(val); 2161 // hif := floor(tf * 2^-32); 2162 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2163 // hi := fptoi(hif); 2164 // lo := fptoi(lof); 2165 // 2166 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2167 MachineInstrBuilder Sign; 2168 if (Signed && SrcLT == S32) { 2169 // However, a 32-bit floating point number has only 23 bits mantissa and 2170 // it's not enough to hold all the significant bits of `lof` if val is 2171 // negative. To avoid the loss of precision, We need to take the absolute 2172 // value after truncating and flip the result back based on the original 2173 // signedness. 
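    // A rough sketch of the signed s32 source case (illustrative only):
    //   sign = ashr(src, 31)                   ; all ones if src is negative
    //   tf   = fabs(trunc(src))
    //   hi   = fptoui(floor(tf * 2^-32))
    //   lo   = fptoui(fma(floor(tf * 2^-32), -2^32, tf))
    //   res  = (merge(lo, hi) ^ merge(sign, sign)) - merge(sign, sign)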
2174 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2175 Trunc = B.buildFAbs(S32, Trunc, Flags); 2176 } 2177 MachineInstrBuilder K0, K1; 2178 if (SrcLT == S64) { 2179 K0 = B.buildFConstant(S64, 2180 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2181 K1 = B.buildFConstant(S64, 2182 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2183 } else { 2184 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2185 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2186 } 2187 2188 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2189 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2190 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2191 2192 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2193 : B.buildFPTOUI(S32, FloorMul); 2194 auto Lo = B.buildFPTOUI(S32, Fma); 2195 2196 if (Signed && SrcLT == S32) { 2197 // Flip the result based on the signedness, which is either all 0s or 1s. 2198 Sign = B.buildMerge(S64, {Sign, Sign}); 2199 // r := xor({lo, hi}, sign) - sign; 2200 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2201 } else 2202 B.buildMerge(Dst, {Lo, Hi}); 2203 MI.eraseFromParent(); 2204 2205 return true; 2206 } 2207 2208 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2209 MachineInstr &MI) const { 2210 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2211 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2212 2213 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2214 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2215 2216 // With ieee_mode disabled, the instructions have the correct behavior 2217 // already for G_FMINNUM/G_FMAXNUM 2218 if (!MFI->getMode().IEEE) 2219 return !IsIEEEOp; 2220 2221 if (IsIEEEOp) 2222 return true; 2223 2224 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2225 } 2226 2227 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2228 MachineInstr &MI, MachineRegisterInfo &MRI, 2229 MachineIRBuilder &B) const { 2230 // TODO: Should move some of this into LegalizerHelper. 2231 2232 // TODO: Promote dynamic indexing of s16 to s32 2233 2234 // FIXME: Artifact combiner probably should have replaced the truncated 2235 // constant before this, so we shouldn't need 2236 // getIConstantVRegValWithLookThrough. 2237 Optional<ValueAndVReg> MaybeIdxVal = 2238 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2239 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2240 return true; 2241 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2242 2243 Register Dst = MI.getOperand(0).getReg(); 2244 Register Vec = MI.getOperand(1).getReg(); 2245 2246 LLT VecTy = MRI.getType(Vec); 2247 LLT EltTy = VecTy.getElementType(); 2248 assert(EltTy == MRI.getType(Dst)); 2249 2250 if (IdxVal < VecTy.getNumElements()) { 2251 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2252 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2253 } else { 2254 B.buildUndef(Dst); 2255 } 2256 2257 MI.eraseFromParent(); 2258 return true; 2259 } 2260 2261 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2262 MachineInstr &MI, MachineRegisterInfo &MRI, 2263 MachineIRBuilder &B) const { 2264 // TODO: Should move some of this into LegalizerHelper. 2265 2266 // TODO: Promote dynamic indexing of s16 to s32 2267 2268 // FIXME: Artifact combiner probably should have replaced the truncated 2269 // constant before this, so we shouldn't need 2270 // getIConstantVRegValWithLookThrough. 
2271 Optional<ValueAndVReg> MaybeIdxVal = 2272 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2273 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2274 return true; 2275 2276 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2277 Register Dst = MI.getOperand(0).getReg(); 2278 Register Vec = MI.getOperand(1).getReg(); 2279 Register Ins = MI.getOperand(2).getReg(); 2280 2281 LLT VecTy = MRI.getType(Vec); 2282 LLT EltTy = VecTy.getElementType(); 2283 assert(EltTy == MRI.getType(Ins)); 2284 (void)Ins; 2285 2286 unsigned NumElts = VecTy.getNumElements(); 2287 if (IdxVal < NumElts) { 2288 SmallVector<Register, 8> SrcRegs; 2289 for (unsigned i = 0; i < NumElts; ++i) 2290 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2291 B.buildUnmerge(SrcRegs, Vec); 2292 2293 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2294 B.buildMerge(Dst, SrcRegs); 2295 } else { 2296 B.buildUndef(Dst); 2297 } 2298 2299 MI.eraseFromParent(); 2300 return true; 2301 } 2302 2303 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2304 MachineInstr &MI, MachineRegisterInfo &MRI, 2305 MachineIRBuilder &B) const { 2306 const LLT V2S16 = LLT::fixed_vector(2, 16); 2307 2308 Register Dst = MI.getOperand(0).getReg(); 2309 Register Src0 = MI.getOperand(1).getReg(); 2310 LLT DstTy = MRI.getType(Dst); 2311 LLT SrcTy = MRI.getType(Src0); 2312 2313 if (SrcTy == V2S16 && DstTy == V2S16 && 2314 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2315 return true; 2316 2317 MachineIRBuilder HelperBuilder(MI); 2318 GISelObserverWrapper DummyObserver; 2319 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2320 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2321 } 2322 2323 bool AMDGPULegalizerInfo::legalizeSinCos( 2324 MachineInstr &MI, MachineRegisterInfo &MRI, 2325 MachineIRBuilder &B) const { 2326 2327 Register DstReg = MI.getOperand(0).getReg(); 2328 Register SrcReg = MI.getOperand(1).getReg(); 2329 LLT Ty = MRI.getType(DstReg); 2330 unsigned Flags = MI.getFlags(); 2331 2332 Register TrigVal; 2333 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2334 if (ST.hasTrigReducedRange()) { 2335 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2336 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2337 .addUse(MulVal.getReg(0)) 2338 .setMIFlags(Flags).getReg(0); 2339 } else 2340 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2341 2342 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
                                Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ?
DstReg : 2393 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2394 2395 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2396 .addDef(PCReg); 2397 2398 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2399 if (GAFlags == SIInstrInfo::MO_NONE) 2400 MIB.addImm(0); 2401 else 2402 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2403 2404 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2405 2406 if (PtrTy.getSizeInBits() == 32) 2407 B.buildExtract(DstReg, PCReg, 0); 2408 return true; 2409 } 2410 2411 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2412 MachineInstr &MI, MachineRegisterInfo &MRI, 2413 MachineIRBuilder &B) const { 2414 Register DstReg = MI.getOperand(0).getReg(); 2415 LLT Ty = MRI.getType(DstReg); 2416 unsigned AS = Ty.getAddressSpace(); 2417 2418 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2419 MachineFunction &MF = B.getMF(); 2420 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2421 2422 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2423 if (!MFI->isModuleEntryFunction() && 2424 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2425 const Function &Fn = MF.getFunction(); 2426 DiagnosticInfoUnsupported BadLDSDecl( 2427 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2428 DS_Warning); 2429 Fn.getContext().diagnose(BadLDSDecl); 2430 2431 // We currently don't have a way to correctly allocate LDS objects that 2432 // aren't directly associated with a kernel. We do force inlining of 2433 // functions that use local objects. However, if these dead functions are 2434 // not eliminated, we don't want a compile time error. Just emit a warning 2435 // and a trap, since there should be no callable path here. 2436 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2437 B.buildUndef(DstReg); 2438 MI.eraseFromParent(); 2439 return true; 2440 } 2441 2442 // TODO: We could emit code to handle the initialization somewhere. 2443 // We ignore the initializer for now and legalize it to allow selection. 2444 // The initializer will anyway get errored out during assembly emission. 2445 const SITargetLowering *TLI = ST.getTargetLowering(); 2446 if (!TLI->shouldUseLDSConstAddress(GV)) { 2447 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2448 return true; // Leave in place; 2449 } 2450 2451 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2452 Type *Ty = GV->getValueType(); 2453 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2454 // zero-sized type in other languages to declare the dynamic shared 2455 // memory which size is not known at the compile time. They will be 2456 // allocated by the runtime and placed directly after the static 2457 // allocated ones. They all share the same offset. 2458 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2459 // Adjust alignment for that dynamic shared memory array. 
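        // The resulting 'pointer' is then just the total size of the
        // statically allocated LDS (via the groupstaticsize intrinsic below),
        // since the runtime places the dynamic shared memory directly after
        // the static allocations.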
2460 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2461 LLT S32 = LLT::scalar(32); 2462 auto Sz = 2463 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2464 B.buildIntToPtr(DstReg, Sz); 2465 MI.eraseFromParent(); 2466 return true; 2467 } 2468 } 2469 2470 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2471 *cast<GlobalVariable>(GV))); 2472 MI.eraseFromParent(); 2473 return true; 2474 } 2475 2476 const SITargetLowering *TLI = ST.getTargetLowering(); 2477 2478 if (TLI->shouldEmitFixup(GV)) { 2479 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2480 MI.eraseFromParent(); 2481 return true; 2482 } 2483 2484 if (TLI->shouldEmitPCReloc(GV)) { 2485 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2486 MI.eraseFromParent(); 2487 return true; 2488 } 2489 2490 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2491 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2492 2493 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2494 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2495 MachinePointerInfo::getGOT(MF), 2496 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2497 MachineMemOperand::MOInvariant, 2498 LoadTy, Align(8)); 2499 2500 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2501 2502 if (Ty.getSizeInBits() == 32) { 2503 // Truncate if this is a 32-bit constant address. 2504 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2505 B.buildExtract(DstReg, Load, 0); 2506 } else 2507 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2508 2509 MI.eraseFromParent(); 2510 return true; 2511 } 2512 2513 static LLT widenToNextPowerOf2(LLT Ty) { 2514 if (Ty.isVector()) 2515 return Ty.changeElementCount( 2516 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2517 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2518 } 2519 2520 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2521 MachineInstr &MI) const { 2522 MachineIRBuilder &B = Helper.MIRBuilder; 2523 MachineRegisterInfo &MRI = *B.getMRI(); 2524 GISelChangeObserver &Observer = Helper.Observer; 2525 2526 Register PtrReg = MI.getOperand(1).getReg(); 2527 LLT PtrTy = MRI.getType(PtrReg); 2528 unsigned AddrSpace = PtrTy.getAddressSpace(); 2529 2530 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2531 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2532 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2533 Observer.changingInstr(MI); 2534 MI.getOperand(1).setReg(Cast.getReg(0)); 2535 Observer.changedInstr(MI); 2536 return true; 2537 } 2538 2539 if (MI.getOpcode() != AMDGPU::G_LOAD) 2540 return false; 2541 2542 Register ValReg = MI.getOperand(0).getReg(); 2543 LLT ValTy = MRI.getType(ValReg); 2544 2545 MachineMemOperand *MMO = *MI.memoperands_begin(); 2546 const unsigned ValSize = ValTy.getSizeInBits(); 2547 const LLT MemTy = MMO->getMemoryType(); 2548 const Align MemAlign = MMO->getAlign(); 2549 const unsigned MemSize = MemTy.getSizeInBits(); 2550 const unsigned AlignInBits = 8 * MemAlign.value(); 2551 2552 // Widen non-power-of-2 loads to the alignment if needed 2553 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2554 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2555 2556 // This was already the correct extending load result type, so just adjust 2557 // the memory type. 
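    // (I.e. the destination register is already WideMemSize bits, so only the
    // memory operand needs to grow; otherwise a wider value is loaded below
    // and the original type is recovered from it by truncating, extracting, or
    // dropping the trailing vector elements.)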
2558 if (WideMemSize == ValSize) { 2559 MachineFunction &MF = B.getMF(); 2560 2561 MachineMemOperand *WideMMO = 2562 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2563 Observer.changingInstr(MI); 2564 MI.setMemRefs(MF, {WideMMO}); 2565 Observer.changedInstr(MI); 2566 return true; 2567 } 2568 2569 // Don't bother handling edge case that should probably never be produced. 2570 if (ValSize > WideMemSize) 2571 return false; 2572 2573 LLT WideTy = widenToNextPowerOf2(ValTy); 2574 2575 Register WideLoad; 2576 if (!WideTy.isVector()) { 2577 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2578 B.buildTrunc(ValReg, WideLoad).getReg(0); 2579 } else { 2580 // Extract the subvector. 2581 2582 if (isRegisterType(ValTy)) { 2583 // If this a case where G_EXTRACT is legal, use it. 2584 // (e.g. <3 x s32> -> <4 x s32>) 2585 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2586 B.buildExtract(ValReg, WideLoad, 0); 2587 } else { 2588 // For cases where the widened type isn't a nice register value, unmerge 2589 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2590 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2591 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2592 } 2593 } 2594 2595 MI.eraseFromParent(); 2596 return true; 2597 } 2598 2599 return false; 2600 } 2601 2602 bool AMDGPULegalizerInfo::legalizeFMad( 2603 MachineInstr &MI, MachineRegisterInfo &MRI, 2604 MachineIRBuilder &B) const { 2605 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2606 assert(Ty.isScalar()); 2607 2608 MachineFunction &MF = B.getMF(); 2609 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2610 2611 // TODO: Always legal with future ftz flag. 2612 // FIXME: Do we need just output? 2613 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2614 return true; 2615 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2616 return true; 2617 2618 MachineIRBuilder HelperBuilder(MI); 2619 GISelObserverWrapper DummyObserver; 2620 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2621 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2622 } 2623 2624 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2625 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2626 Register DstReg = MI.getOperand(0).getReg(); 2627 Register PtrReg = MI.getOperand(1).getReg(); 2628 Register CmpVal = MI.getOperand(2).getReg(); 2629 Register NewVal = MI.getOperand(3).getReg(); 2630 2631 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2632 "this should not have been custom lowered"); 2633 2634 LLT ValTy = MRI.getType(CmpVal); 2635 LLT VecTy = LLT::fixed_vector(2, ValTy); 2636 2637 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2638 2639 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2640 .addDef(DstReg) 2641 .addUse(PtrReg) 2642 .addUse(PackedVal) 2643 .setMemRefs(MI.memoperands()); 2644 2645 MI.eraseFromParent(); 2646 return true; 2647 } 2648 2649 bool AMDGPULegalizerInfo::legalizeFlog( 2650 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2651 Register Dst = MI.getOperand(0).getReg(); 2652 Register Src = MI.getOperand(1).getReg(); 2653 LLT Ty = B.getMRI()->getType(Dst); 2654 unsigned Flags = MI.getFlags(); 2655 2656 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2657 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2658 2659 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2660 
MI.eraseFromParent(); 2661 return true; 2662 } 2663 2664 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2665 MachineIRBuilder &B) const { 2666 Register Dst = MI.getOperand(0).getReg(); 2667 Register Src = MI.getOperand(1).getReg(); 2668 unsigned Flags = MI.getFlags(); 2669 LLT Ty = B.getMRI()->getType(Dst); 2670 2671 auto K = B.buildFConstant(Ty, numbers::log2e); 2672 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2673 B.buildFExp2(Dst, Mul, Flags); 2674 MI.eraseFromParent(); 2675 return true; 2676 } 2677 2678 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2679 MachineIRBuilder &B) const { 2680 Register Dst = MI.getOperand(0).getReg(); 2681 Register Src0 = MI.getOperand(1).getReg(); 2682 Register Src1 = MI.getOperand(2).getReg(); 2683 unsigned Flags = MI.getFlags(); 2684 LLT Ty = B.getMRI()->getType(Dst); 2685 const LLT S16 = LLT::scalar(16); 2686 const LLT S32 = LLT::scalar(32); 2687 2688 if (Ty == S32) { 2689 auto Log = B.buildFLog2(S32, Src0, Flags); 2690 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2691 .addUse(Log.getReg(0)) 2692 .addUse(Src1) 2693 .setMIFlags(Flags); 2694 B.buildFExp2(Dst, Mul, Flags); 2695 } else if (Ty == S16) { 2696 // There's no f16 fmul_legacy, so we need to convert for it. 2697 auto Log = B.buildFLog2(S16, Src0, Flags); 2698 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2699 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2700 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2701 .addUse(Ext0.getReg(0)) 2702 .addUse(Ext1.getReg(0)) 2703 .setMIFlags(Flags); 2704 2705 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2706 } else 2707 return false; 2708 2709 MI.eraseFromParent(); 2710 return true; 2711 } 2712 2713 // Find a source register, ignoring any possible source modifiers. 2714 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2715 Register ModSrc = OrigSrc; 2716 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2717 ModSrc = SrcFNeg->getOperand(1).getReg(); 2718 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2719 ModSrc = SrcFAbs->getOperand(1).getReg(); 2720 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2721 ModSrc = SrcFAbs->getOperand(1).getReg(); 2722 return ModSrc; 2723 } 2724 2725 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2726 MachineRegisterInfo &MRI, 2727 MachineIRBuilder &B) const { 2728 2729 const LLT S1 = LLT::scalar(1); 2730 const LLT S64 = LLT::scalar(64); 2731 Register Dst = MI.getOperand(0).getReg(); 2732 Register OrigSrc = MI.getOperand(1).getReg(); 2733 unsigned Flags = MI.getFlags(); 2734 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2735 "this should not have been custom lowered"); 2736 2737 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2738 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2739 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2740 // V_FRACT bug is: 2741 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2742 // 2743 // Convert floor(x) to (x - fract(x)) 2744 2745 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2746 .addUse(OrigSrc) 2747 .setMIFlags(Flags); 2748 2749 // Give source modifier matching some assistance before obscuring a foldable 2750 // pattern. 2751 2752 // TODO: We can avoid the neg on the fract? The input sign to fract 2753 // shouldn't matter? 
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
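// The condition def must have exactly one (possibly negated) non-debug use,
// and that use must be a G_BRCOND in the same block, either followed by a
// G_BR or falling through to the next block. UncondBrTarget is set to the
// block reached when the conditional branch is not taken.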
2830 static MachineInstr * 2831 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2832 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2833 Register CondDef = MI.getOperand(0).getReg(); 2834 if (!MRI.hasOneNonDBGUse(CondDef)) 2835 return nullptr; 2836 2837 MachineBasicBlock *Parent = MI.getParent(); 2838 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2839 2840 if (isNot(MRI, *UseMI)) { 2841 Register NegatedCond = UseMI->getOperand(0).getReg(); 2842 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2843 return nullptr; 2844 2845 // We're deleting the def of this value, so we need to remove it. 2846 eraseInstr(*UseMI, MRI); 2847 2848 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2849 Negated = true; 2850 } 2851 2852 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2853 return nullptr; 2854 2855 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2856 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2857 if (Next == Parent->end()) { 2858 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2859 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2860 return nullptr; 2861 UncondBrTarget = &*NextMBB; 2862 } else { 2863 if (Next->getOpcode() != AMDGPU::G_BR) 2864 return nullptr; 2865 Br = &*Next; 2866 UncondBrTarget = Br->getOperand(0).getMBB(); 2867 } 2868 2869 return UseMI; 2870 } 2871 2872 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2873 const ArgDescriptor *Arg, 2874 const TargetRegisterClass *ArgRC, 2875 LLT ArgTy) const { 2876 MCRegister SrcReg = Arg->getRegister(); 2877 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2878 assert(DstReg.isVirtual() && "Virtual register expected"); 2879 2880 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2881 *ArgRC, B.getDebugLoc(), ArgTy); 2882 if (Arg->isMasked()) { 2883 // TODO: Should we try to emit this once in the entry block? 2884 const LLT S32 = LLT::scalar(32); 2885 const unsigned Mask = Arg->getMask(); 2886 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2887 2888 Register AndMaskSrc = LiveIn; 2889 2890 if (Shift != 0) { 2891 auto ShiftAmt = B.buildConstant(S32, Shift); 2892 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2893 } 2894 2895 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2896 } else { 2897 B.buildCopy(DstReg, LiveIn); 2898 } 2899 2900 return true; 2901 } 2902 2903 bool AMDGPULegalizerInfo::loadInputValue( 2904 Register DstReg, MachineIRBuilder &B, 2905 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2906 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2907 const ArgDescriptor *Arg; 2908 const TargetRegisterClass *ArgRC; 2909 LLT ArgTy; 2910 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2911 2912 if (!Arg) { 2913 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2914 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2915 // case the pointer argument may be missing and we use null. 2916 B.buildConstant(DstReg, 0); 2917 return true; 2918 } 2919 2920 // It's undefined behavior if a function marked with the amdgpu-no-* 2921 // attributes uses the corresponding intrinsic. 
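    // Legalizing the result to an undef value is therefore acceptable here.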
2922 B.buildUndef(DstReg); 2923 return true; 2924 } 2925 2926 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2927 return false; // TODO: Handle these 2928 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2929 } 2930 2931 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2932 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2933 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2934 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2935 return false; 2936 2937 MI.eraseFromParent(); 2938 return true; 2939 } 2940 2941 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2942 MachineRegisterInfo &MRI, 2943 MachineIRBuilder &B) const { 2944 Register Dst = MI.getOperand(0).getReg(); 2945 LLT DstTy = MRI.getType(Dst); 2946 LLT S16 = LLT::scalar(16); 2947 LLT S32 = LLT::scalar(32); 2948 LLT S64 = LLT::scalar(64); 2949 2950 if (DstTy == S16) 2951 return legalizeFDIV16(MI, MRI, B); 2952 if (DstTy == S32) 2953 return legalizeFDIV32(MI, MRI, B); 2954 if (DstTy == S64) 2955 return legalizeFDIV64(MI, MRI, B); 2956 2957 return false; 2958 } 2959 2960 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2961 Register DstDivReg, 2962 Register DstRemReg, 2963 Register X, 2964 Register Y) const { 2965 const LLT S1 = LLT::scalar(1); 2966 const LLT S32 = LLT::scalar(32); 2967 2968 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2969 // algorithm used here. 2970 2971 // Initial estimate of inv(y). 2972 auto FloatY = B.buildUITOFP(S32, Y); 2973 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2974 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2975 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2976 auto Z = B.buildFPTOUI(S32, ScaledY); 2977 2978 // One round of UNR. 2979 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2980 auto NegYZ = B.buildMul(S32, NegY, Z); 2981 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2982 2983 // Quotient/remainder estimate. 2984 auto Q = B.buildUMulH(S32, X, Z); 2985 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2986 2987 // First quotient/remainder refinement. 2988 auto One = B.buildConstant(S32, 1); 2989 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2990 if (DstDivReg) 2991 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2992 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2993 2994 // Second quotient/remainder refinement. 
2995 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2996 if (DstDivReg) 2997 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2998 2999 if (DstRemReg) 3000 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3001 } 3002 3003 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3004 // 3005 // Return lo, hi of result 3006 // 3007 // %cvt.lo = G_UITOFP Val.lo 3008 // %cvt.hi = G_UITOFP Val.hi 3009 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3010 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3011 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3012 // %mul2 = G_FMUL %mul1, 2**(-32) 3013 // %trunc = G_INTRINSIC_TRUNC %mul2 3014 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3015 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3016 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3017 Register Val) { 3018 const LLT S32 = LLT::scalar(32); 3019 auto Unmerge = B.buildUnmerge(S32, Val); 3020 3021 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3022 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3023 3024 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3025 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3026 3027 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3028 auto Mul1 = 3029 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3030 3031 // 2**(-32) 3032 auto Mul2 = 3033 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3034 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3035 3036 // -(2**32) 3037 auto Mad2 = B.buildFMAD(S32, Trunc, 3038 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3039 3040 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3041 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3042 3043 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3044 } 3045 3046 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3047 Register DstDivReg, 3048 Register DstRemReg, 3049 Register Numer, 3050 Register Denom) const { 3051 const LLT S32 = LLT::scalar(32); 3052 const LLT S64 = LLT::scalar(64); 3053 const LLT S1 = LLT::scalar(1); 3054 Register RcpLo, RcpHi; 3055 3056 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3057 3058 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3059 3060 auto Zero64 = B.buildConstant(S64, 0); 3061 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3062 3063 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3064 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3065 3066 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3067 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3068 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3069 3070 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3071 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3072 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3073 3074 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3075 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3076 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3077 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3078 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3079 3080 auto Zero32 = B.buildConstant(S32, 0); 3081 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3082 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3083 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3084 3085 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3086 Register NumerLo = UnmergeNumer.getReg(0); 3087 Register NumerHi = UnmergeNumer.getReg(1); 3088 3089 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3090 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3091 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3092 Register Mul3_Lo = UnmergeMul3.getReg(0); 3093 Register Mul3_Hi = UnmergeMul3.getReg(1); 3094 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3095 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3096 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3097 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3098 3099 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3100 Register DenomLo = UnmergeDenom.getReg(0); 3101 Register DenomHi = UnmergeDenom.getReg(1); 3102 3103 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3104 auto C1 = B.buildSExt(S32, CmpHi); 3105 3106 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3107 auto C2 = B.buildSExt(S32, CmpLo); 3108 3109 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3110 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3111 3112 // TODO: Here and below portions of the code can be enclosed into if/endif. 3113 // Currently control flow is unconditional and we have 4 selects after 3114 // potential endif to substitute PHIs. 3115 3116 // if C3 != 0 ... 3117 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3118 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3119 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3120 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3121 3122 auto One64 = B.buildConstant(S64, 1); 3123 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3124 3125 auto C4 = 3126 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3127 auto C5 = 3128 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3129 auto C6 = B.buildSelect( 3130 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3131 3132 // if (C6 != 0) 3133 auto Add4 = B.buildAdd(S64, Add3, One64); 3134 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3135 3136 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3137 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3138 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3139 3140 // endif C6 3141 // endif C3 3142 3143 if (DstDivReg) { 3144 auto Sel1 = B.buildSelect( 3145 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3146 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3147 Sel1, MulHi3); 3148 } 3149 3150 if (DstRemReg) { 3151 auto Sel2 = B.buildSelect( 3152 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3153 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3154 Sel2, Sub1); 3155 } 3156 } 3157 3158 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3159 MachineRegisterInfo &MRI, 3160 MachineIRBuilder &B) const { 3161 Register DstDivReg, DstRemReg; 3162 switch (MI.getOpcode()) { 3163 default: 3164 llvm_unreachable("Unexpected opcode!"); 3165 case AMDGPU::G_UDIV: { 3166 DstDivReg = MI.getOperand(0).getReg(); 3167 break; 3168 } 3169 case AMDGPU::G_UREM: { 3170 DstRemReg = MI.getOperand(0).getReg(); 3171 break; 3172 } 3173 case AMDGPU::G_UDIVREM: { 3174 DstDivReg = MI.getOperand(0).getReg(); 3175 DstRemReg = MI.getOperand(1).getReg(); 3176 break; 3177 } 3178 } 3179 3180 const LLT S64 = LLT::scalar(64); 3181 const LLT S32 = LLT::scalar(32); 3182 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3183 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3184 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3185 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3186 3187 if (Ty == S32) 3188 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3189 else if (Ty == S64) 3190 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3191 else 3192 return false; 3193 3194 MI.eraseFromParent(); 3195 return true; 3196 } 3197 3198 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3199 MachineRegisterInfo &MRI, 3200 MachineIRBuilder &B) const { 3201 const LLT S64 = LLT::scalar(64); 3202 const LLT S32 = LLT::scalar(32); 3203 3204 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3205 if (Ty != S32 && Ty != S64) 3206 return false; 3207 3208 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3209 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3210 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3211 3212 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3213 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3214 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3215 3216 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3217 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3218 3219 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3220 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3221 3222 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3223 switch (MI.getOpcode()) { 3224 default: 3225 llvm_unreachable("Unexpected opcode!"); 3226 case AMDGPU::G_SDIV: { 3227 DstDivReg = MI.getOperand(0).getReg(); 3228 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3229 break; 3230 } 3231 case AMDGPU::G_SREM: { 3232 DstRemReg = MI.getOperand(0).getReg(); 3233 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3234 break; 3235 } 3236 case AMDGPU::G_SDIVREM: { 3237 DstDivReg = MI.getOperand(0).getReg(); 3238 DstRemReg = MI.getOperand(1).getReg(); 3239 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3240 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3241 break; 3242 } 3243 } 3244 3245 if (Ty == S32) 3246 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3247 else 3248 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3249 3250 if (DstDivReg) { 3251 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3252 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3253 B.buildSub(DstDivReg, SignXor, Sign); 3254 } 3255 3256 if (DstRemReg) { 3257 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3258 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3259 B.buildSub(DstRemReg, SignXor, Sign); 3260 } 3261 3262 MI.eraseFromParent(); 3263 return true; 3264 } 3265 3266 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3267 MachineRegisterInfo &MRI, 3268 MachineIRBuilder &B) const { 3269 Register Res = MI.getOperand(0).getReg(); 3270 Register LHS = MI.getOperand(1).getReg(); 3271 Register RHS = MI.getOperand(2).getReg(); 3272 uint16_t Flags = MI.getFlags(); 3273 LLT ResTy = MRI.getType(Res); 3274 3275 const MachineFunction &MF = B.getMF(); 3276 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3277 MI.getFlag(MachineInstr::FmAfn); 3278 3279 if (!AllowInaccurateRcp) 3280 return false; 3281 3282 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3283 // 1 / x -> RCP(x) 3284 if (CLHS->isExactlyValue(1.0)) { 3285 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3286 .addUse(RHS) 3287 .setMIFlags(Flags); 3288 3289 MI.eraseFromParent(); 3290 return true; 3291 } 3292 3293 // -1 / x -> RCP( FNEG(x) ) 3294 if (CLHS->isExactlyValue(-1.0)) { 3295 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3296 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3297 .addUse(FNeg.getReg(0)) 3298 .setMIFlags(Flags); 3299 3300 MI.eraseFromParent(); 3301 return true; 3302 } 3303 } 3304 3305 // x / y -> x * (1.0 / y) 3306 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3307 .addUse(RHS) 3308 .setMIFlags(Flags); 3309 B.buildFMul(Res, LHS, RCP, Flags); 3310 3311 MI.eraseFromParent(); 3312 return true; 3313 } 3314 3315 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3316 MachineRegisterInfo &MRI, 3317 MachineIRBuilder &B) const { 3318 Register Res = MI.getOperand(0).getReg(); 3319 Register X = MI.getOperand(1).getReg(); 3320 Register Y = MI.getOperand(2).getReg(); 3321 uint16_t Flags = MI.getFlags(); 3322 LLT ResTy = MRI.getType(Res); 3323 3324 const MachineFunction &MF = B.getMF(); 3325 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3326 MI.getFlag(MachineInstr::FmAfn); 3327 3328 if (!AllowInaccurateRcp) 3329 return false; 3330 3331 auto NegY = B.buildFNeg(ResTy, Y); 3332 auto One = B.buildFConstant(ResTy, 1.0); 3333 3334 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3335 .addUse(Y) 3336 .setMIFlags(Flags); 3337 3338 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3339 R = B.buildFMA(ResTy, Tmp0, R, R); 3340 3341 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3342 R = B.buildFMA(ResTy, Tmp1, R, R); 3343 3344 auto Ret = B.buildFMul(ResTy, X, R); 3345 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3346 3347 B.buildFMA(Res, Tmp2, R, Ret); 3348 MI.eraseFromParent(); 3349 return true; 3350 } 3351 3352 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3353 MachineRegisterInfo &MRI, 3354 MachineIRBuilder &B) const { 3355 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3356 return true; 3357 3358 Register Res = MI.getOperand(0).getReg(); 3359 Register LHS = MI.getOperand(1).getReg(); 3360 Register RHS = MI.getOperand(2).getReg(); 3361 3362 uint16_t Flags = MI.getFlags(); 3363 3364 LLT S16 = LLT::scalar(16); 3365 LLT S32 = LLT::scalar(32); 3366 3367 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3368 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3369 3370 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3371 .addUse(RHSExt.getReg(0)) 3372 .setMIFlags(Flags); 3373 3374 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3375 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3376 3377 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3378 .addUse(RDst.getReg(0)) 3379 .addUse(RHS) 3380 .addUse(LHS) 3381 .setMIFlags(Flags); 3382 3383 MI.eraseFromParent(); 3384 return true; 3385 } 3386 3387 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3388 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3389 static void toggleSPDenormMode(bool Enable, 3390 MachineIRBuilder &B, 3391 const GCNSubtarget &ST, 3392 AMDGPU::SIModeRegisterDefaults Mode) { 3393 // Set SP denorm mode to this value. 3394 unsigned SPDenormMode = 3395 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3396 3397 if (ST.hasDenormModeInst()) { 3398 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3399 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3400 3401 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3402 B.buildInstr(AMDGPU::S_DENORM_MODE) 3403 .addImm(NewDenormModeValue); 3404 3405 } else { 3406 // Select FP32 bit field in mode register. 
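    // The S_SETREG operand below encodes hwreg(HW_REG_MODE, 4, 2): bit offset
    // 4 and width 2 (WIDTH_M1 = 1 means width - 1), i.e. the two FP32
    // denormal control bits of the MODE register.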
3407 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3408 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3409 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3410 3411 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3412 .addImm(SPDenormMode) 3413 .addImm(SPDenormModeBitField); 3414 } 3415 } 3416 3417 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3418 MachineRegisterInfo &MRI, 3419 MachineIRBuilder &B) const { 3420 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3421 return true; 3422 3423 Register Res = MI.getOperand(0).getReg(); 3424 Register LHS = MI.getOperand(1).getReg(); 3425 Register RHS = MI.getOperand(2).getReg(); 3426 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3427 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3428 3429 uint16_t Flags = MI.getFlags(); 3430 3431 LLT S32 = LLT::scalar(32); 3432 LLT S1 = LLT::scalar(1); 3433 3434 auto One = B.buildFConstant(S32, 1.0f); 3435 3436 auto DenominatorScaled = 3437 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3438 .addUse(LHS) 3439 .addUse(RHS) 3440 .addImm(0) 3441 .setMIFlags(Flags); 3442 auto NumeratorScaled = 3443 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3444 .addUse(LHS) 3445 .addUse(RHS) 3446 .addImm(1) 3447 .setMIFlags(Flags); 3448 3449 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3450 .addUse(DenominatorScaled.getReg(0)) 3451 .setMIFlags(Flags); 3452 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3453 3454 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3455 // aren't modeled as reading it. 3456 if (!Mode.allFP32Denormals()) 3457 toggleSPDenormMode(true, B, ST, Mode); 3458 3459 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3460 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3461 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3462 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3463 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3464 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3465 3466 if (!Mode.allFP32Denormals()) 3467 toggleSPDenormMode(false, B, ST, Mode); 3468 3469 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3470 .addUse(Fma4.getReg(0)) 3471 .addUse(Fma1.getReg(0)) 3472 .addUse(Fma3.getReg(0)) 3473 .addUse(NumeratorScaled.getReg(1)) 3474 .setMIFlags(Flags); 3475 3476 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3477 .addUse(Fmas.getReg(0)) 3478 .addUse(RHS) 3479 .addUse(LHS) 3480 .setMIFlags(Flags); 3481 3482 MI.eraseFromParent(); 3483 return true; 3484 } 3485 3486 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3487 MachineRegisterInfo &MRI, 3488 MachineIRBuilder &B) const { 3489 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3490 return true; 3491 3492 Register Res = MI.getOperand(0).getReg(); 3493 Register LHS = MI.getOperand(1).getReg(); 3494 Register RHS = MI.getOperand(2).getReg(); 3495 3496 uint16_t Flags = MI.getFlags(); 3497 3498 LLT S64 = LLT::scalar(64); 3499 LLT S1 = LLT::scalar(1); 3500 3501 auto One = B.buildFConstant(S64, 1.0); 3502 3503 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3504 .addUse(LHS) 3505 .addUse(RHS) 3506 .addImm(0) 3507 .setMIFlags(Flags); 3508 3509 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3510 3511 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3512 .addUse(DivScale0.getReg(0)) 3513 .setMIFlags(Flags); 3514 3515 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3516 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3517 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3518 3519 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3520 .addUse(LHS) 3521 .addUse(RHS) 3522 .addImm(1) 3523 .setMIFlags(Flags); 3524 3525 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3526 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3527 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3528 3529 Register Scale; 3530 if (!ST.hasUsableDivScaleConditionOutput()) { 3531 // Workaround a hardware bug on SI where the condition output from div_scale 3532 // is not usable. 3533 3534 LLT S32 = LLT::scalar(32); 3535 3536 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3537 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3538 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3539 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3540 3541 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3542 Scale1Unmerge.getReg(1)); 3543 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3544 Scale0Unmerge.getReg(1)); 3545 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3546 } else { 3547 Scale = DivScale1.getReg(1); 3548 } 3549 3550 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3551 .addUse(Fma4.getReg(0)) 3552 .addUse(Fma3.getReg(0)) 3553 .addUse(Mul.getReg(0)) 3554 .addUse(Scale) 3555 .setMIFlags(Flags); 3556 3557 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3558 .addUse(Fmas.getReg(0)) 3559 .addUse(RHS) 3560 .addUse(LHS) 3561 .setMIFlags(Flags); 3562 3563 MI.eraseFromParent(); 3564 return true; 3565 } 3566 3567 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3568 MachineRegisterInfo &MRI, 3569 MachineIRBuilder &B) const { 3570 Register Res = MI.getOperand(0).getReg(); 3571 Register LHS = MI.getOperand(2).getReg(); 3572 Register RHS = MI.getOperand(3).getReg(); 3573 uint16_t Flags = MI.getFlags(); 3574 3575 LLT S32 = LLT::scalar(32); 3576 LLT S1 = LLT::scalar(1); 3577 3578 auto Abs = B.buildFAbs(S32, RHS, Flags); 3579 const APFloat C0Val(1.0f); 3580 3581 auto C0 = B.buildConstant(S32, 0x6f800000); 3582 auto C1 = B.buildConstant(S32, 0x2f800000); 3583 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3584 3585 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3586 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3587 3588 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3589 3590 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3591 .addUse(Mul0.getReg(0)) 3592 .setMIFlags(Flags); 3593 3594 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3595 3596 B.buildFMul(Res, Sel, Mul1, Flags); 3597 3598 MI.eraseFromParent(); 3599 return true; 3600 } 3601 3602 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3603 // FIXME: Why do we handle this one but not other removed instructions? 3604 // 3605 // Reciprocal square root. The clamp prevents infinite results, clamping 3606 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3607 // +-max_float. 
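//
// A rough sketch of the expansion implemented below (the *_ieee forms of
// min/max are used instead when the function runs in IEEE mode):
//   %rsq   = llvm.amdgcn.rsq(%src)
//   %low   = fminnum(%rsq, +max_float)
//   %clamp = fmaxnum(%low, -max_float)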
3608 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3609 MachineRegisterInfo &MRI, 3610 MachineIRBuilder &B) const { 3611 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3612 return true; 3613 3614 Register Dst = MI.getOperand(0).getReg(); 3615 Register Src = MI.getOperand(2).getReg(); 3616 auto Flags = MI.getFlags(); 3617 3618 LLT Ty = MRI.getType(Dst); 3619 3620 const fltSemantics *FltSemantics; 3621 if (Ty == LLT::scalar(32)) 3622 FltSemantics = &APFloat::IEEEsingle(); 3623 else if (Ty == LLT::scalar(64)) 3624 FltSemantics = &APFloat::IEEEdouble(); 3625 else 3626 return false; 3627 3628 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3629 .addUse(Src) 3630 .setMIFlags(Flags); 3631 3632 // We don't need to concern ourselves with the snan handling difference, since 3633 // the rsq quieted (or not) so use the one which will directly select. 3634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3635 const bool UseIEEE = MFI->getMode().IEEE; 3636 3637 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3638 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3639 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3640 3641 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3642 3643 if (UseIEEE) 3644 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3645 else 3646 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3647 MI.eraseFromParent(); 3648 return true; 3649 } 3650 3651 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3652 switch (IID) { 3653 case Intrinsic::amdgcn_ds_fadd: 3654 return AMDGPU::G_ATOMICRMW_FADD; 3655 case Intrinsic::amdgcn_ds_fmin: 3656 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3657 case Intrinsic::amdgcn_ds_fmax: 3658 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3659 default: 3660 llvm_unreachable("not a DS FP intrinsic"); 3661 } 3662 } 3663 3664 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3665 MachineInstr &MI, 3666 Intrinsic::ID IID) const { 3667 GISelChangeObserver &Observer = Helper.Observer; 3668 Observer.changingInstr(MI); 3669 3670 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3671 3672 // The remaining operands were used to set fields in the MemOperand on 3673 // construction. 3674 for (int I = 6; I > 3; --I) 3675 MI.RemoveOperand(I); 3676 3677 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3678 Observer.changedInstr(MI); 3679 return true; 3680 } 3681 3682 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3683 MachineRegisterInfo &MRI, 3684 MachineIRBuilder &B) const { 3685 uint64_t Offset = 3686 ST.getTargetLowering()->getImplicitParameterOffset( 3687 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3688 LLT DstTy = MRI.getType(DstReg); 3689 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3690 3691 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3692 if (!loadInputValue(KernargPtrReg, B, 3693 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3694 return false; 3695 3696 // FIXME: This should be nuw 3697 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3698 return true; 3699 } 3700 3701 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3702 MachineRegisterInfo &MRI, 3703 MachineIRBuilder &B) const { 3704 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3705 if (!MFI->isEntryFunction()) { 3706 return legalizePreloadedArgIntrin(MI, MRI, B, 3707 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3708 } 3709 3710 Register DstReg = MI.getOperand(0).getReg(); 3711 if (!getImplicitArgPtr(DstReg, MRI, B)) 3712 return false; 3713 3714 MI.eraseFromParent(); 3715 return true; 3716 } 3717 3718 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3719 MachineRegisterInfo &MRI, 3720 MachineIRBuilder &B, 3721 unsigned AddrSpace) const { 3722 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3723 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3724 Register Hi32 = Unmerge.getReg(1); 3725 3726 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3727 MI.eraseFromParent(); 3728 return true; 3729 } 3730 3731 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3732 // offset (the offset that is included in bounds checking and swizzling, to be 3733 // split between the instruction's voffset and immoffset fields) and soffset 3734 // (the offset that is excluded from bounds checking and swizzling, to go in 3735 // the instruction's soffset field). This function takes the first kind of 3736 // offset and figures out how to split it between voffset and immoffset. 3737 std::pair<Register, unsigned> 3738 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3739 Register OrigOffset) const { 3740 const unsigned MaxImm = 4095; 3741 Register BaseReg; 3742 unsigned ImmOffset; 3743 const LLT S32 = LLT::scalar(32); 3744 MachineRegisterInfo &MRI = *B.getMRI(); 3745 3746 std::tie(BaseReg, ImmOffset) = 3747 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3748 3749 // If BaseReg is a pointer, convert it to int. 3750 if (MRI.getType(BaseReg).isPointer()) 3751 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3752 3753 // If the immediate value is too big for the immoffset field, put the value 3754 // and -4096 into the immoffset field so that the value that is copied/added 3755 // for the voffset field is a multiple of 4096, and it stands more chance 3756 // of being CSEd with the copy/add for another similar load/store. 3757 // However, do not do that rounding down to a multiple of 4096 if that is a 3758 // negative number, as it appears to be illegal to have a negative offset 3759 // in the vgpr, even if adding the immediate offset makes it positive. 
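  // For example, an ImmOffset of 5000 is split as Overflow = 5000 & ~4095 =
  // 4096 (added to the voffset register below) and ImmOffset = 904 (kept in
  // the immediate field). A negative ImmOffset is instead moved entirely into
  // the register add and the immediate field is left at 0.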
3760 unsigned Overflow = ImmOffset & ~MaxImm; 3761 ImmOffset -= Overflow; 3762 if ((int32_t)Overflow < 0) { 3763 Overflow += ImmOffset; 3764 ImmOffset = 0; 3765 } 3766 3767 if (Overflow != 0) { 3768 if (!BaseReg) { 3769 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3770 } else { 3771 auto OverflowVal = B.buildConstant(S32, Overflow); 3772 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3773 } 3774 } 3775 3776 if (!BaseReg) 3777 BaseReg = B.buildConstant(S32, 0).getReg(0); 3778 3779 return std::make_pair(BaseReg, ImmOffset); 3780 } 3781 3782 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3783 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3784 Register VOffset, Register SOffset, 3785 unsigned ImmOffset, Register VIndex, 3786 MachineRegisterInfo &MRI) const { 3787 Optional<ValueAndVReg> MaybeVOffsetVal = 3788 getIConstantVRegValWithLookThrough(VOffset, MRI); 3789 Optional<ValueAndVReg> MaybeSOffsetVal = 3790 getIConstantVRegValWithLookThrough(SOffset, MRI); 3791 Optional<ValueAndVReg> MaybeVIndexVal = 3792 getIConstantVRegValWithLookThrough(VIndex, MRI); 3793 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3794 // update the MMO with that offset. The stride is unknown so we can only do 3795 // this if VIndex is constant 0. 3796 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3797 MaybeVIndexVal->Value == 0) { 3798 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3799 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3800 MMO->setOffset(TotalOffset); 3801 } else { 3802 // We don't have a constant combined offset to use in the MMO. Give up. 3803 MMO->setValue((Value *)nullptr); 3804 } 3805 } 3806 3807 /// Handle register layout difference for f16 images for some subtargets. 
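/// Roughly, per the cases below: on subtargets with unpacked D16 VMEM each
/// s16 element is any-extended into its own 32-bit register; on subtargets
/// with the image-store D16 bug the packed data is padded with undef up to a
/// supported dword-sized vector; otherwise only <3 x s16> needs fixing, by
/// padding it to <4 x s16>.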
3808 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3809 MachineRegisterInfo &MRI, 3810 Register Reg, 3811 bool ImageStore) const { 3812 const LLT S16 = LLT::scalar(16); 3813 const LLT S32 = LLT::scalar(32); 3814 LLT StoreVT = MRI.getType(Reg); 3815 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3816 3817 if (ST.hasUnpackedD16VMem()) { 3818 auto Unmerge = B.buildUnmerge(S16, Reg); 3819 3820 SmallVector<Register, 4> WideRegs; 3821 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3822 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3823 3824 int NumElts = StoreVT.getNumElements(); 3825 3826 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3827 .getReg(0); 3828 } 3829 3830 if (ImageStore && ST.hasImageStoreD16Bug()) { 3831 if (StoreVT.getNumElements() == 2) { 3832 SmallVector<Register, 4> PackedRegs; 3833 Reg = B.buildBitcast(S32, Reg).getReg(0); 3834 PackedRegs.push_back(Reg); 3835 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3836 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3837 .getReg(0); 3838 } 3839 3840 if (StoreVT.getNumElements() == 3) { 3841 SmallVector<Register, 4> PackedRegs; 3842 auto Unmerge = B.buildUnmerge(S16, Reg); 3843 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3844 PackedRegs.push_back(Unmerge.getReg(I)); 3845 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3846 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3847 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3848 } 3849 3850 if (StoreVT.getNumElements() == 4) { 3851 SmallVector<Register, 4> PackedRegs; 3852 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3853 auto Unmerge = B.buildUnmerge(S32, Reg); 3854 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3855 PackedRegs.push_back(Unmerge.getReg(I)); 3856 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3857 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3858 .getReg(0); 3859 } 3860 3861 llvm_unreachable("invalid data type"); 3862 } 3863 3864 if (StoreVT == LLT::fixed_vector(3, S16)) { 3865 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3866 .getReg(0); 3867 } 3868 return Reg; 3869 } 3870 3871 Register AMDGPULegalizerInfo::fixStoreSourceType( 3872 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3873 MachineRegisterInfo *MRI = B.getMRI(); 3874 LLT Ty = MRI->getType(VData); 3875 3876 const LLT S16 = LLT::scalar(16); 3877 3878 // Fixup illegal register types for i8 stores. 
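  // Both s8 and s16 store data are any-extended to s32 here; the actual store
  // width still comes from the memory operand, so only the register type
  // feeding the buffer store instruction changes.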
3879 if (Ty == LLT::scalar(8) || Ty == S16) { 3880 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3881 return AnyExt; 3882 } 3883 3884 if (Ty.isVector()) { 3885 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3886 if (IsFormat) 3887 return handleD16VData(B, *MRI, VData); 3888 } 3889 } 3890 3891 return VData; 3892 } 3893 3894 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3895 MachineRegisterInfo &MRI, 3896 MachineIRBuilder &B, 3897 bool IsTyped, 3898 bool IsFormat) const { 3899 Register VData = MI.getOperand(1).getReg(); 3900 LLT Ty = MRI.getType(VData); 3901 LLT EltTy = Ty.getScalarType(); 3902 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3903 const LLT S32 = LLT::scalar(32); 3904 3905 VData = fixStoreSourceType(B, VData, IsFormat); 3906 Register RSrc = MI.getOperand(2).getReg(); 3907 3908 MachineMemOperand *MMO = *MI.memoperands_begin(); 3909 const int MemSize = MMO->getSize(); 3910 3911 unsigned ImmOffset; 3912 3913 // The typed intrinsics add an immediate after the registers. 3914 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3915 3916 // The struct intrinsic variants add one additional operand over raw. 3917 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3918 Register VIndex; 3919 int OpOffset = 0; 3920 if (HasVIndex) { 3921 VIndex = MI.getOperand(3).getReg(); 3922 OpOffset = 1; 3923 } else { 3924 VIndex = B.buildConstant(S32, 0).getReg(0); 3925 } 3926 3927 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3928 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3929 3930 unsigned Format = 0; 3931 if (IsTyped) { 3932 Format = MI.getOperand(5 + OpOffset).getImm(); 3933 ++OpOffset; 3934 } 3935 3936 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3937 3938 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3939 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3940 3941 unsigned Opc; 3942 if (IsTyped) { 3943 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3944 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3945 } else if (IsFormat) { 3946 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3947 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3948 } else { 3949 switch (MemSize) { 3950 case 1: 3951 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3952 break; 3953 case 2: 3954 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3955 break; 3956 default: 3957 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3958 break; 3959 } 3960 } 3961 3962 auto MIB = B.buildInstr(Opc) 3963 .addUse(VData) // vdata 3964 .addUse(RSrc) // rsrc 3965 .addUse(VIndex) // vindex 3966 .addUse(VOffset) // voffset 3967 .addUse(SOffset) // soffset 3968 .addImm(ImmOffset); // offset(imm) 3969 3970 if (IsTyped) 3971 MIB.addImm(Format); 3972 3973 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3974 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3975 .addMemOperand(MMO); 3976 3977 MI.eraseFromParent(); 3978 return true; 3979 } 3980 3981 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3982 MachineRegisterInfo &MRI, 3983 MachineIRBuilder &B, 3984 bool IsFormat, 3985 bool IsTyped) const { 3986 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3987 MachineMemOperand *MMO = *MI.memoperands_begin(); 3988 const LLT MemTy = MMO->getMemoryType(); 3989 const LLT S32 = LLT::scalar(32); 3990 3991 Register Dst = MI.getOperand(0).getReg(); 3992 Register RSrc = MI.getOperand(2).getReg(); 3993 3994 // The typed intrinsics add an immediate after the registers. 
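  // Operand layout handled below for the raw form: dst(s), intrinsic ID, rsrc,
  // voffset, soffset, [format,] aux. The struct form carries an extra vindex
  // operand between rsrc and voffset, which is what the NumVIndexOps count
  // detects.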
3995 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3996 3997 // The struct intrinsic variants add one additional operand over raw. 3998 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3999 Register VIndex; 4000 int OpOffset = 0; 4001 if (HasVIndex) { 4002 VIndex = MI.getOperand(3).getReg(); 4003 OpOffset = 1; 4004 } else { 4005 VIndex = B.buildConstant(S32, 0).getReg(0); 4006 } 4007 4008 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4009 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4010 4011 unsigned Format = 0; 4012 if (IsTyped) { 4013 Format = MI.getOperand(5 + OpOffset).getImm(); 4014 ++OpOffset; 4015 } 4016 4017 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4018 unsigned ImmOffset; 4019 4020 LLT Ty = MRI.getType(Dst); 4021 LLT EltTy = Ty.getScalarType(); 4022 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4023 const bool Unpacked = ST.hasUnpackedD16VMem(); 4024 4025 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4026 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4027 4028 unsigned Opc; 4029 4030 if (IsTyped) { 4031 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4032 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4033 } else if (IsFormat) { 4034 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4035 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4036 } else { 4037 switch (MemTy.getSizeInBits()) { 4038 case 8: 4039 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4040 break; 4041 case 16: 4042 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4043 break; 4044 default: 4045 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4046 break; 4047 } 4048 } 4049 4050 Register LoadDstReg; 4051 4052 bool IsExtLoad = 4053 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4054 LLT UnpackedTy = Ty.changeElementSize(32); 4055 4056 if (IsExtLoad) 4057 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4058 else if (Unpacked && IsD16 && Ty.isVector()) 4059 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4060 else 4061 LoadDstReg = Dst; 4062 4063 auto MIB = B.buildInstr(Opc) 4064 .addDef(LoadDstReg) // vdata 4065 .addUse(RSrc) // rsrc 4066 .addUse(VIndex) // vindex 4067 .addUse(VOffset) // voffset 4068 .addUse(SOffset) // soffset 4069 .addImm(ImmOffset); // offset(imm) 4070 4071 if (IsTyped) 4072 MIB.addImm(Format); 4073 4074 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4075 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4076 .addMemOperand(MMO); 4077 4078 if (LoadDstReg != Dst) { 4079 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4080 4081 // Widen result for extending loads was widened. 4082 if (IsExtLoad) 4083 B.buildTrunc(Dst, LoadDstReg); 4084 else { 4085 // Repack to original 16-bit vector result 4086 // FIXME: G_TRUNC should work, but legalization currently fails 4087 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4088 SmallVector<Register, 4> Repack; 4089 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4090 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4091 B.buildMerge(Dst, Repack); 4092 } 4093 } 4094 4095 MI.eraseFromParent(); 4096 return true; 4097 } 4098 4099 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4100 MachineIRBuilder &B, 4101 bool IsInc) const { 4102 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4103 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4104 B.buildInstr(Opc) 4105 .addDef(MI.getOperand(0).getReg()) 4106 .addUse(MI.getOperand(2).getReg()) 4107 .addUse(MI.getOperand(3).getReg()) 4108 .cloneMemRefs(MI); 4109 MI.eraseFromParent(); 4110 return true; 4111 } 4112 4113 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4114 switch (IntrID) { 4115 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4116 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4117 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4118 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4119 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4120 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4121 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4122 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4123 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4124 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4125 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4126 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4127 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4128 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4130 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4131 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4132 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4133 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4134 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4135 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4136 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4137 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4138 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4139 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4140 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4141 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4142 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4143 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4145 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4146 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4147 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4148 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4149 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4150 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4151 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4152 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4153 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4154 case Intrinsic::amdgcn_buffer_atomic_fadd: 4155 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4156 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4158 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4159 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4160 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4161 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4162 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4163 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4164 default: 4165 llvm_unreachable("unhandled atomic opcode"); 4166 } 4167 } 4168 4169 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4170 MachineIRBuilder &B, 4171 Intrinsic::ID IID) const { 4172 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4173 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4174 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4175 4176 Register Dst; 4177 4178 int OpOffset = 0; 4179 if (HasReturn) { 4180 // A few FP atomics do not support return values. 
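    // Without a return value there is no def, so every source operand index
    // used below shifts down by one; the OpOffset of -1 in the else branch
    // accounts for that.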
4181 Dst = MI.getOperand(0).getReg(); 4182 } else { 4183 OpOffset = -1; 4184 } 4185 4186 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4187 Register CmpVal; 4188 4189 if (IsCmpSwap) { 4190 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4191 ++OpOffset; 4192 } 4193 4194 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4195 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4196 4197 // The struct intrinsic variants add one additional operand over raw. 4198 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4199 Register VIndex; 4200 if (HasVIndex) { 4201 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4202 ++OpOffset; 4203 } else { 4204 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4205 } 4206 4207 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4208 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4209 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4210 4211 MachineMemOperand *MMO = *MI.memoperands_begin(); 4212 4213 unsigned ImmOffset; 4214 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4215 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4216 4217 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4218 4219 if (HasReturn) 4220 MIB.addDef(Dst); 4221 4222 MIB.addUse(VData); // vdata 4223 4224 if (IsCmpSwap) 4225 MIB.addReg(CmpVal); 4226 4227 MIB.addUse(RSrc) // rsrc 4228 .addUse(VIndex) // vindex 4229 .addUse(VOffset) // voffset 4230 .addUse(SOffset) // soffset 4231 .addImm(ImmOffset) // offset(imm) 4232 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4233 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4234 .addMemOperand(MMO); 4235 4236 MI.eraseFromParent(); 4237 return true; 4238 } 4239 4240 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4241 /// vector with s16 typed elements. 4242 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4243 SmallVectorImpl<Register> &PackedAddrs, 4244 unsigned ArgOffset, 4245 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4246 bool IsA16, bool IsG16) { 4247 const LLT S16 = LLT::scalar(16); 4248 const LLT V2S16 = LLT::fixed_vector(2, 16); 4249 auto EndIdx = Intr->VAddrEnd; 4250 4251 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4252 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4253 if (!SrcOp.isReg()) 4254 continue; // _L to _LZ may have eliminated this. 4255 4256 Register AddrReg = SrcOp.getReg(); 4257 4258 if ((I < Intr->GradientStart) || 4259 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4260 (I >= Intr->CoordStart && !IsA16)) { 4261 // Handle any gradient or coordinate operands that should not be packed 4262 if ((I < Intr->GradientStart) && IsA16 && 4263 (B.getMRI()->getType(AddrReg) == S16)) { 4264 // Special handling of bias when A16 is on. Bias is of type half but 4265 // occupies full 32-bit. 4266 PackedAddrs.push_back( 4267 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4268 .getReg(0)); 4269 } else { 4270 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4271 PackedAddrs.push_back(AddrReg); 4272 } 4273 } else { 4274 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4275 // derivatives dx/dh and dx/dv are packed with undef. 
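      // For example, the 3D gradient set {dx/dh, dy/dh, dz/dh, dx/dv, dy/dv,
      // dz/dv} has three components per direction (odd), so dz/dh and dz/dv
      // are each paired with undef rather than with the first element of the
      // following group.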
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
4359 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4360 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4361 4362 MachineRegisterInfo *MRI = B.getMRI(); 4363 const LLT S32 = LLT::scalar(32); 4364 const LLT S16 = LLT::scalar(16); 4365 const LLT V2S16 = LLT::fixed_vector(2, 16); 4366 4367 unsigned DMask = 0; 4368 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4369 LLT Ty = MRI->getType(VData); 4370 4371 // Check for 16 bit addresses and pack if true. 4372 LLT GradTy = 4373 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4374 LLT AddrTy = 4375 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4376 const bool IsG16 = GradTy == S16; 4377 const bool IsA16 = AddrTy == S16; 4378 const bool IsD16 = Ty.getScalarType() == S16; 4379 4380 int DMaskLanes = 0; 4381 if (!BaseOpcode->Atomic) { 4382 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4383 if (BaseOpcode->Gather4) { 4384 DMaskLanes = 4; 4385 } else if (DMask != 0) { 4386 DMaskLanes = countPopulation(DMask); 4387 } else if (!IsTFE && !BaseOpcode->Store) { 4388 // If dmask is 0, this is a no-op load. This can be eliminated. 4389 B.buildUndef(MI.getOperand(0)); 4390 MI.eraseFromParent(); 4391 return true; 4392 } 4393 } 4394 4395 Observer.changingInstr(MI); 4396 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4397 4398 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4399 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4400 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4401 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4402 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4403 4404 // Track that we legalized this 4405 MI.setDesc(B.getTII().get(NewOpcode)); 4406 4407 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4408 // dmask to be at least 1 otherwise the instruction will fail 4409 if (IsTFE && DMask == 0) { 4410 DMask = 0x1; 4411 DMaskLanes = 1; 4412 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4413 } 4414 4415 if (BaseOpcode->Atomic) { 4416 Register VData0 = MI.getOperand(2).getReg(); 4417 LLT Ty = MRI->getType(VData0); 4418 4419 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4420 if (Ty.isVector()) 4421 return false; 4422 4423 if (BaseOpcode->AtomicX2) { 4424 Register VData1 = MI.getOperand(3).getReg(); 4425 // The two values are packed in one register. 4426 LLT PackedTy = LLT::fixed_vector(2, Ty); 4427 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4428 MI.getOperand(2).setReg(Concat.getReg(0)); 4429 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4430 } 4431 } 4432 4433 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4434 4435 // Optimize _L to _LZ when _L is zero 4436 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4437 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4438 const ConstantFP *ConstantLod; 4439 4440 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4441 m_GFCst(ConstantLod))) { 4442 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4443 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4444 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4445 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, 4446 Intr->Dim); 4447 4448 // The starting indexes should remain in the same place. 
4449 --CorrectedNumVAddrs; 4450 4451 MI.getOperand(NumDefs).setIntrinsicID( 4452 static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4453 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4454 Intr = NewImageDimIntr; 4455 } 4456 } 4457 } 4458 4459 // Optimize _mip away, when 'lod' is zero 4460 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4461 int64_t ConstantLod; 4462 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4463 m_ICst(ConstantLod))) { 4464 if (ConstantLod == 0) { 4465 // TODO: Change intrinsic opcode and remove operand instead or replacing 4466 // it with 0, as the _L to _LZ handling is done above. 4467 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4468 --CorrectedNumVAddrs; 4469 } 4470 } 4471 } 4472 4473 // Rewrite the addressing register layout before doing anything else. 4474 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4475 // 16 bit gradients are supported, but are tied to the A16 control 4476 // so both gradients and addresses must be 16 bit 4477 return false; 4478 } 4479 4480 if (IsA16 && !ST.hasA16()) { 4481 // A16 not supported 4482 return false; 4483 } 4484 4485 if (IsA16 || IsG16) { 4486 if (Intr->NumVAddrs > 1) { 4487 SmallVector<Register, 4> PackedRegs; 4488 4489 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4490 IsG16); 4491 4492 // See also below in the non-a16 branch 4493 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4494 PackedRegs.size() <= ST.getNSAMaxSize(); 4495 4496 if (!UseNSA && PackedRegs.size() > 1) { 4497 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4498 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4499 PackedRegs[0] = Concat.getReg(0); 4500 PackedRegs.resize(1); 4501 } 4502 4503 const unsigned NumPacked = PackedRegs.size(); 4504 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4505 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4506 if (!SrcOp.isReg()) { 4507 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4508 continue; 4509 } 4510 4511 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4512 4513 if (I - Intr->VAddrStart < NumPacked) 4514 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4515 else 4516 SrcOp.setReg(AMDGPU::NoRegister); 4517 } 4518 } 4519 } else { 4520 // If the register allocator cannot place the address registers contiguously 4521 // without introducing moves, then using the non-sequential address encoding 4522 // is always preferable, since it saves VALU instructions and is usually a 4523 // wash in terms of code size or even better. 4524 // 4525 // However, we currently have no way of hinting to the register allocator 4526 // that MIMG addresses should be placed contiguously when it is possible to 4527 // do so, so force non-NSA for the common 2-address case as a heuristic. 4528 // 4529 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4530 // allocation when possible. 4531 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4532 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4533 4534 if (!UseNSA && Intr->NumVAddrs > 1) 4535 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4536 Intr->NumVAddrs); 4537 } 4538 4539 int Flags = 0; 4540 if (IsA16) 4541 Flags |= 1; 4542 if (IsG16) 4543 Flags |= 2; 4544 MI.addOperand(MachineOperand::CreateImm(Flags)); 4545 4546 if (BaseOpcode->Store) { // No TFE for stores? 
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4644 4645 if (ResultNumRegs == 1) { 4646 assert(!IsTFE); 4647 ResultRegs[0] = NewResultReg; 4648 } else { 4649 // We have to repack into a new vector of some kind. 4650 for (int I = 0; I != NumDataRegs; ++I) 4651 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4652 B.buildUnmerge(ResultRegs, NewResultReg); 4653 4654 // Drop the final TFE element to get the data part. The TFE result is 4655 // directly written to the right place already. 4656 if (IsTFE) 4657 ResultRegs.resize(NumDataRegs); 4658 } 4659 4660 // For an s16 scalar result, we form an s32 result with a truncate regardless 4661 // of packed vs. unpacked. 4662 if (IsD16 && !Ty.isVector()) { 4663 B.buildTrunc(DstReg, ResultRegs[0]); 4664 return true; 4665 } 4666 4667 // Avoid a build/concat_vector of 1 entry. 4668 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4669 B.buildBitcast(DstReg, ResultRegs[0]); 4670 return true; 4671 } 4672 4673 assert(Ty.isVector()); 4674 4675 if (IsD16) { 4676 // For packed D16 results with TFE enabled, all the data components are 4677 // S32. Cast back to the expected type. 4678 // 4679 // TODO: We don't really need to use load s32 elements. We would only need one 4680 // cast for the TFE result if a multiple of v2s16 was used. 4681 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4682 for (Register &Reg : ResultRegs) 4683 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4684 } else if (ST.hasUnpackedD16VMem()) { 4685 for (Register &Reg : ResultRegs) 4686 Reg = B.buildTrunc(S16, Reg).getReg(0); 4687 } 4688 } 4689 4690 auto padWithUndef = [&](LLT Ty, int NumElts) { 4691 if (NumElts == 0) 4692 return; 4693 Register Undef = B.buildUndef(Ty).getReg(0); 4694 for (int I = 0; I != NumElts; ++I) 4695 ResultRegs.push_back(Undef); 4696 }; 4697 4698 // Pad out any elements eliminated due to the dmask. 4699 LLT ResTy = MRI->getType(ResultRegs[0]); 4700 if (!ResTy.isVector()) { 4701 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4702 B.buildBuildVector(DstReg, ResultRegs); 4703 return true; 4704 } 4705 4706 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4707 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4708 4709 // Deal with the one annoying legal case. 
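  // A <3 x s16> result comes back as v2s16/v4s16 pieces; depending on how many
  // data registers remain after stripping the TFE dword, the destination is
  // recovered below either by dropping the extra trailing element or by
  // padding with undef.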
4710 const LLT V3S16 = LLT::fixed_vector(3, 16); 4711 if (Ty == V3S16) { 4712 if (IsTFE) { 4713 if (ResultRegs.size() == 1) { 4714 NewResultReg = ResultRegs[0]; 4715 } else if (ResultRegs.size() == 2) { 4716 LLT V4S16 = LLT::fixed_vector(4, 16); 4717 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4718 } else { 4719 return false; 4720 } 4721 } 4722 4723 if (MRI->getType(DstReg).getNumElements() < 4724 MRI->getType(NewResultReg).getNumElements()) { 4725 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4726 } else { 4727 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4728 } 4729 return true; 4730 } 4731 4732 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4733 B.buildConcatVectors(DstReg, ResultRegs); 4734 return true; 4735 } 4736 4737 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4738 LegalizerHelper &Helper, MachineInstr &MI) const { 4739 MachineIRBuilder &B = Helper.MIRBuilder; 4740 GISelChangeObserver &Observer = Helper.Observer; 4741 4742 Register Dst = MI.getOperand(0).getReg(); 4743 LLT Ty = B.getMRI()->getType(Dst); 4744 unsigned Size = Ty.getSizeInBits(); 4745 MachineFunction &MF = B.getMF(); 4746 4747 Observer.changingInstr(MI); 4748 4749 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4750 Ty = getBitcastRegisterType(Ty); 4751 Helper.bitcastDst(MI, Ty, 0); 4752 Dst = MI.getOperand(0).getReg(); 4753 B.setInsertPt(B.getMBB(), MI); 4754 } 4755 4756 // FIXME: We don't really need this intermediate instruction. The intrinsic 4757 // should be fixed to have a memory operand. Since it's readnone, we're not 4758 // allowed to add one. 4759 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4760 MI.RemoveOperand(1); // Remove intrinsic ID 4761 4762 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4763 // TODO: Should this use datalayout alignment? 4764 const unsigned MemSize = (Size + 7) / 8; 4765 const Align MemAlign(4); 4766 MachineMemOperand *MMO = MF.getMachineMemOperand( 4767 MachinePointerInfo(), 4768 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4769 MachineMemOperand::MOInvariant, 4770 MemSize, MemAlign); 4771 MI.addMemOperand(MF, MMO); 4772 4773 // There are no 96-bit result scalar loads, but widening to 128-bit should 4774 // always be legal. We may need to restore this to a 96-bit result if it turns 4775 // out this needs to be converted to a vector load during RegBankSelect. 4776 if (!isPowerOf2_32(Size)) { 4777 if (Ty.isVector()) 4778 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4779 else 4780 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4781 } 4782 4783 Observer.changedInstr(MI); 4784 return true; 4785 } 4786 4787 // TODO: Move to selection 4788 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4789 MachineRegisterInfo &MRI, 4790 MachineIRBuilder &B) const { 4791 if (!ST.isTrapHandlerEnabled() || 4792 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4793 return legalizeTrapEndpgm(MI, MRI, B); 4794 4795 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4796 switch (*HsaAbiVer) { 4797 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4798 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4799 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4800 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4801 return ST.supportsGetDoorbellID() ? 
4802 legalizeTrapHsa(MI, MRI, B) : 4803 legalizeTrapHsaQueuePtr(MI, MRI, B); 4804 } 4805 } 4806 4807 llvm_unreachable("Unknown trap handler"); 4808 } 4809 4810 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4811 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4812 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4813 MI.eraseFromParent(); 4814 return true; 4815 } 4816 4817 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4818 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4819 // Pass queue pointer to trap handler as input, and insert trap instruction 4820 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4821 Register LiveIn = 4822 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4823 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4824 return false; 4825 4826 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4827 B.buildCopy(SGPR01, LiveIn); 4828 B.buildInstr(AMDGPU::S_TRAP) 4829 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4830 .addReg(SGPR01, RegState::Implicit); 4831 4832 MI.eraseFromParent(); 4833 return true; 4834 } 4835 4836 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4837 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4838 B.buildInstr(AMDGPU::S_TRAP) 4839 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4840 MI.eraseFromParent(); 4841 return true; 4842 } 4843 4844 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4845 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4846 // Is non-HSA path or trap-handler disabled? Then, report a warning 4847 // accordingly 4848 if (!ST.isTrapHandlerEnabled() || 4849 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4850 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4851 "debugtrap handler not supported", 4852 MI.getDebugLoc(), DS_Warning); 4853 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4854 Ctx.diagnose(NoTrap); 4855 } else { 4856 // Insert debug-trap instruction 4857 B.buildInstr(AMDGPU::S_TRAP) 4858 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4859 } 4860 4861 MI.eraseFromParent(); 4862 return true; 4863 } 4864 4865 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4866 MachineIRBuilder &B) const { 4867 MachineRegisterInfo &MRI = *B.getMRI(); 4868 const LLT S16 = LLT::scalar(16); 4869 const LLT S32 = LLT::scalar(32); 4870 4871 Register DstReg = MI.getOperand(0).getReg(); 4872 Register NodePtr = MI.getOperand(2).getReg(); 4873 Register RayExtent = MI.getOperand(3).getReg(); 4874 Register RayOrigin = MI.getOperand(4).getReg(); 4875 Register RayDir = MI.getOperand(5).getReg(); 4876 Register RayInvDir = MI.getOperand(6).getReg(); 4877 Register TDescr = MI.getOperand(7).getReg(); 4878 4879 if (!ST.hasGFX10_AEncoding()) { 4880 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4881 "intrinsic not supported on subtarget", 4882 MI.getDebugLoc()); 4883 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4884 return false; 4885 } 4886 4887 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4888 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4889 const unsigned NumVDataDwords = 4; 4890 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); 4891 const bool UseNSA = 4892 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4893 const unsigned BaseOpcodes[2][2] = { 4894 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4895 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4896 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4897 int Opcode; 4898 if (UseNSA) { 4899 Opcode = 4900 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4901 NumVDataDwords, NumVAddrDwords); 4902 } else { 4903 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4904 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4905 PowerOf2Ceil(NumVAddrDwords)); 4906 } 4907 assert(Opcode != -1); 4908 4909 SmallVector<Register, 12> Ops; 4910 if (Is64) { 4911 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4912 Ops.push_back(Unmerge.getReg(0)); 4913 Ops.push_back(Unmerge.getReg(1)); 4914 } else { 4915 Ops.push_back(NodePtr); 4916 } 4917 Ops.push_back(RayExtent); 4918 4919 auto packLanes = [&Ops, &S32, &B](Register Src) { 4920 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4921 Ops.push_back(Unmerge.getReg(0)); 4922 Ops.push_back(Unmerge.getReg(1)); 4923 Ops.push_back(Unmerge.getReg(2)); 4924 }; 4925 4926 packLanes(RayOrigin); 4927 if (IsA16) { 4928 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4929 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4930 Register R1 = MRI.createGenericVirtualRegister(S32); 4931 Register R2 = MRI.createGenericVirtualRegister(S32); 4932 Register R3 = MRI.createGenericVirtualRegister(S32); 4933 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4934 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4935 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4936 Ops.push_back(R1); 4937 Ops.push_back(R2); 4938 Ops.push_back(R3); 4939 } else { 4940 packLanes(RayDir); 4941 packLanes(RayInvDir); 4942 } 4943 4944 if (!UseNSA) { 4945 // Build a single vector containing all the operands so far prepared. 4946 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 4947 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 4948 Ops.clear(); 4949 Ops.push_back(MergedOps); 4950 } 4951 4952 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4953 .addDef(DstReg) 4954 .addImm(Opcode); 4955 4956 for (Register R : Ops) { 4957 MIB.addUse(R); 4958 } 4959 4960 MIB.addUse(TDescr) 4961 .addImm(IsA16 ? 1 : 0) 4962 .cloneMemRefs(MI); 4963 4964 MI.eraseFromParent(); 4965 return true; 4966 } 4967 4968 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4969 MachineInstr &MI) const { 4970 MachineIRBuilder &B = Helper.MIRBuilder; 4971 MachineRegisterInfo &MRI = *B.getMRI(); 4972 4973 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
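  // For amdgcn.if/else, the intrinsic together with the G_BRCOND that consumes
  // its boolean result is rewritten into a single SI_IF/SI_ELSE pseudo whose
  // branch target is the unconditional successor; the following G_BR is then
  // retargeted at the conditional successor (the two targets are swapped when
  // the condition was negated).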
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // SI_IF/SI_ELSE operate on wave-wide masks, so constrain both the result
      // and the condition to the wave mask register class.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
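  // The following argument intrinsics each resolve to a function input that
  // the calling convention provides (work-item / work-group IDs, dispatch and
  // queue pointers, etc.); the case just selects the matching
  // AMDGPUFunctionArgInfo value.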
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
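  // All raw/struct buffer atomic variants share a single lowering path; the
  // intrinsic ID is passed through so legalizeBufferAtomic can select the
  // operation.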
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}