1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 30 #define DEBUG_TYPE "amdgpu-legalinfo" 31 32 using namespace llvm; 33 using namespace LegalizeActions; 34 using namespace LegalizeMutations; 35 using namespace LegalityPredicates; 36 using namespace MIPatternMatch; 37 38 // Hack until load/store selection patterns support any tuple of legal types. 39 static cl::opt<bool> EnableNewLegality( 40 "amdgpu-global-isel-new-legality", 41 cl::desc("Use GlobalISel desired legality, rather than try to use" 42 "rules compatible with selection patterns"), 43 cl::init(false), 44 cl::ReallyHidden); 45 46 static constexpr unsigned MaxRegisterSize = 1024; 47 48 // Round the number of elements to the next power of two elements 49 static LLT getPow2VectorType(LLT Ty) { 50 unsigned NElts = Ty.getNumElements(); 51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 53 } 54 55 // Round the number of bits to the next power of two bits 56 static LLT getPow2ScalarType(LLT Ty) { 57 unsigned Bits = Ty.getSizeInBits(); 58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 59 return LLT::scalar(Pow2Bits); 60 } 61 62 /// \returs true if this is an odd sized vector which should widen by adding an 63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 64 /// excludes s1 vectors, which should always be scalarized. 
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 unsigned AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .clampScalar(0, S16, S32) 536 .clampMaxNumElements(0, S16, 2) 537 .scalarize(0) 538 .widenScalarToNextPow2(0, 32); 539 540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 541 .legalFor({S32, S16, V2S16}) // Clamp modifier 542 .minScalarOrElt(0, S16) 543 .clampMaxNumElements(0, S16, 2) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 32) 546 .lower(); 547 } else if (ST.has16BitInsts()) { 548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 549 .legalFor({S32, S16}) 550 .clampScalar(0, S16, S32) 551 .scalarize(0) 552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 553 554 // Technically the saturating operations require clamp bit support, but this 555 // was introduced at the same time as 16-bit operations. 
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 557 .legalFor({S32, S16}) // Clamp modifier 558 .minScalar(0, S16) 559 .scalarize(0) 560 .widenScalarToNextPow2(0, 16) 561 .lower(); 562 563 // We're just lowering this, but it helps get a better result to try to 564 // coerce to the desired type first. 565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 566 .minScalar(0, S16) 567 .scalarize(0) 568 .lower(); 569 } else { 570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 571 .legalFor({S32}) 572 .clampScalar(0, S32, S32) 573 .scalarize(0); 574 575 if (ST.hasIntClamp()) { 576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 577 .legalFor({S32}) // Clamp modifier. 578 .scalarize(0) 579 .minScalarOrElt(0, S32) 580 .lower(); 581 } else { 582 // Clamp bit support was added in VI, along with 16-bit operations. 583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 // FIXME: DAG expansion gets better results. The widening uses the smaller 590 // range values and goes for the min/max lowering directly. 591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 592 .minScalar(0, S32) 593 .scalarize(0) 594 .lower(); 595 } 596 597 getActionDefinitionsBuilder( 598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 599 .customFor({S32, S64}) 600 .clampScalar(0, S32, S64) 601 .widenScalarToNextPow2(0, 32) 602 .scalarize(0); 603 604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 605 .legalFor({S32}) 606 .maxScalarOrElt(0, S32); 607 608 if (ST.hasVOP3PInsts()) { 609 Mulh 610 .clampMaxNumElements(0, S8, 2) 611 .lowerFor({V2S8}); 612 } 613 614 Mulh 615 .scalarize(0) 616 .lower(); 617 618 // Report legal for any types we can handle anywhere. For the cases only legal 619 // on the SALU, RegBankSelect will be able to re-legalize. 620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 622 .clampScalar(0, S32, S64) 623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 628 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 630 .legalFor({{S32, S1}, {S32, S32}}) 631 .minScalar(0, S32) 632 // TODO: .scalarize(0) 633 .lower(); 634 635 getActionDefinitionsBuilder(G_BITCAST) 636 // Don't worry about the size constraint. 637 .legalIf(all(isRegisterType(0), isRegisterType(1))) 638 .lower(); 639 640 641 getActionDefinitionsBuilder(G_CONSTANT) 642 .legalFor({S1, S32, S64, S16, GlobalPtr, 643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 644 .legalIf(isPointer(0)) 645 .clampScalar(0, S32, S64) 646 .widenScalarToNextPow2(0); 647 648 getActionDefinitionsBuilder(G_FCONSTANT) 649 .legalFor({S32, S64, S16}) 650 .clampScalar(0, S16, S64); 651 652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 653 .legalIf(isRegisterType(0)) 654 // s1 and s16 are special cases because they have legal operations on 655 // them, but don't really occupy registers in the normal way. 656 .legalFor({S1, S16}) 657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 658 .clampScalarOrElt(0, S32, MaxScalar) 659 .widenScalarToNextPow2(0, 32) 660 .clampMaxNumElements(0, S32, 16); 661 662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 663 664 // If the amount is divergent, we have to do a wave reduction to get the 665 // maximum value, so this is expanded during RegBankSelect. 
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 667 .legalFor({{PrivatePtr, S32}}); 668 669 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 670 .customIf(typeIsNot(0, PrivatePtr)); 671 672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 673 674 auto &FPOpActions = getActionDefinitionsBuilder( 675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 676 .legalFor({S32, S64}); 677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 678 .customFor({S32, S64}); 679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 680 .customFor({S32, S64}); 681 682 if (ST.has16BitInsts()) { 683 if (ST.hasVOP3PInsts()) 684 FPOpActions.legalFor({S16, V2S16}); 685 else 686 FPOpActions.legalFor({S16}); 687 688 TrigActions.customFor({S16}); 689 FDIVActions.customFor({S16}); 690 } 691 692 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 694 695 if (ST.hasVOP3PInsts()) { 696 MinNumMaxNum.customFor(FPTypesPK16) 697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 698 .clampMaxNumElements(0, S16, 2) 699 .clampScalar(0, S16, S64) 700 .scalarize(0); 701 } else if (ST.has16BitInsts()) { 702 MinNumMaxNum.customFor(FPTypes16) 703 .clampScalar(0, S16, S64) 704 .scalarize(0); 705 } else { 706 MinNumMaxNum.customFor(FPTypesBase) 707 .clampScalar(0, S32, S64) 708 .scalarize(0); 709 } 710 711 if (ST.hasVOP3PInsts()) 712 FPOpActions.clampMaxNumElements(0, S16, 2); 713 714 FPOpActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 TrigActions 719 .scalarize(0) 720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 721 722 FDIVActions 723 .scalarize(0) 724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 725 726 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 727 .legalFor(FPTypesPK16) 728 .clampMaxNumElements(0, S16, 2) 729 .scalarize(0) 730 .clampScalar(0, S16, S64); 731 732 if (ST.has16BitInsts()) { 733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 734 .legalFor({S32, S64, S16}) 735 .scalarize(0) 736 .clampScalar(0, S16, S64); 737 } else { 738 getActionDefinitionsBuilder(G_FSQRT) 739 .legalFor({S32, S64}) 740 .scalarize(0) 741 .clampScalar(0, S32, S64); 742 743 if (ST.hasFractBug()) { 744 getActionDefinitionsBuilder(G_FFLOOR) 745 .customFor({S64}) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } else { 750 getActionDefinitionsBuilder(G_FFLOOR) 751 .legalFor({S32, S64}) 752 .scalarize(0) 753 .clampScalar(0, S32, S64); 754 } 755 } 756 757 getActionDefinitionsBuilder(G_FPTRUNC) 758 .legalFor({{S32, S64}, {S16, S32}}) 759 .scalarize(0) 760 .lower(); 761 762 getActionDefinitionsBuilder(G_FPEXT) 763 .legalFor({{S64, S32}, {S32, S16}}) 764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 765 .scalarize(0); 766 767 getActionDefinitionsBuilder(G_FSUB) 768 // Use actual fsub instruction 769 .legalFor({S32}) 770 // Must use fadd + fneg 771 .lowerFor({S64, S16, V2S16}) 772 .scalarize(0) 773 .clampScalar(0, S32, S64); 774 775 // Whether this is legal depends on the floating point mode for the function. 
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 777 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 778 FMad.customFor({S32, S16}); 779 else if (ST.hasMadMacF32Insts()) 780 FMad.customFor({S32}); 781 else if (ST.hasMadF16()) 782 FMad.customFor({S16}); 783 FMad.scalarize(0) 784 .lower(); 785 786 auto &FRem = getActionDefinitionsBuilder(G_FREM); 787 if (ST.has16BitInsts()) { 788 FRem.customFor({S16, S32, S64}); 789 } else { 790 FRem.minScalar(0, S32) 791 .customFor({S32, S64}); 792 } 793 FRem.scalarize(0); 794 795 // TODO: Do we need to clamp maximum bitwidth? 796 getActionDefinitionsBuilder(G_TRUNC) 797 .legalIf(isScalar(0)) 798 .legalFor({{V2S16, V2S32}}) 799 .clampMaxNumElements(0, S16, 2) 800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 801 // situations (like an invalid implicit use), we don't want to infinite loop 802 // in the legalizer. 803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 804 .alwaysLegal(); 805 806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 808 {S32, S1}, {S64, S1}, {S16, S1}}) 809 .scalarize(0) 810 .clampScalar(0, S32, S64) 811 .widenScalarToNextPow2(1, 32); 812 813 // TODO: Split s1->s64 during regbankselect for VALU. 814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 816 .lowerFor({{S32, S64}}) 817 .lowerIf(typeIs(1, S1)) 818 .customFor({{S64, S64}}); 819 if (ST.has16BitInsts()) 820 IToFP.legalFor({{S16, S16}}); 821 IToFP.clampScalar(1, S32, S64) 822 .minScalar(0, S32) 823 .scalarize(0) 824 .widenScalarToNextPow2(1); 825 826 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 827 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 828 .customFor({{S64, S32}, {S64, S64}}) 829 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 830 if (ST.has16BitInsts()) 831 FPToI.legalFor({{S16, S16}}); 832 else 833 FPToI.minScalar(1, S32); 834 835 FPToI.minScalar(0, S32) 836 .widenScalarToNextPow2(0, 32) 837 .scalarize(0) 838 .lower(); 839 840 // Lower roundeven into G_FRINT 841 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 842 .scalarize(0) 843 .lower(); 844 845 if (ST.has16BitInsts()) { 846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 847 .legalFor({S16, S32, S64}) 848 .clampScalar(0, S16, S64) 849 .scalarize(0); 850 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 852 .legalFor({S32, S64}) 853 .clampScalar(0, S32, S64) 854 .scalarize(0); 855 } else { 856 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 857 .legalFor({S32}) 858 .customFor({S64}) 859 .clampScalar(0, S32, S64) 860 .scalarize(0); 861 } 862 863 getActionDefinitionsBuilder(G_PTR_ADD) 864 .legalIf(all(isPointer(0), sameSize(0, 1))) 865 .scalarize(0) 866 .scalarSameSizeAs(1, 0); 867 868 getActionDefinitionsBuilder(G_PTRMASK) 869 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 870 .scalarSameSizeAs(1, 0) 871 .scalarize(0); 872 873 auto &CmpBuilder = 874 getActionDefinitionsBuilder(G_ICMP) 875 // The compare output type differs based on the register bank of the output, 876 // so make both s1 and s32 legal. 877 // 878 // Scalar compares producing output in scc will be promoted to s32, as that 879 // is the allocatable register type that will be needed for the copy from 880 // scc. 
This will be promoted during RegBankSelect, and we assume something 881 // before that won't try to use s32 result types. 882 // 883 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 884 // bank. 885 .legalForCartesianProduct( 886 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 887 .legalForCartesianProduct( 888 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 889 if (ST.has16BitInsts()) { 890 CmpBuilder.legalFor({{S1, S16}}); 891 } 892 893 CmpBuilder 894 .widenScalarToNextPow2(1) 895 .clampScalar(1, S32, S64) 896 .scalarize(0) 897 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 898 899 getActionDefinitionsBuilder(G_FCMP) 900 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 901 .widenScalarToNextPow2(1) 902 .clampScalar(1, S32, S64) 903 .scalarize(0); 904 905 // FIXME: fpow has a selection pattern that should move to custom lowering. 906 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 907 if (ST.has16BitInsts()) 908 Exp2Ops.legalFor({S32, S16}); 909 else 910 Exp2Ops.legalFor({S32}); 911 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 912 Exp2Ops.scalarize(0); 913 914 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 915 if (ST.has16BitInsts()) 916 ExpOps.customFor({{S32}, {S16}}); 917 else 918 ExpOps.customFor({S32}); 919 ExpOps.clampScalar(0, MinScalarFPTy, S32) 920 .scalarize(0); 921 922 getActionDefinitionsBuilder(G_FPOWI) 923 .clampScalar(0, MinScalarFPTy, S32) 924 .lower(); 925 926 // The 64-bit versions produce 32-bit results, but only on the SALU. 927 getActionDefinitionsBuilder(G_CTPOP) 928 .legalFor({{S32, S32}, {S32, S64}}) 929 .clampScalar(0, S32, S32) 930 .clampScalar(1, S32, S64) 931 .scalarize(0) 932 .widenScalarToNextPow2(0, 32) 933 .widenScalarToNextPow2(1, 32); 934 935 // The hardware instructions return a different result on 0 than the generic 936 // instructions expect. The hardware produces -1, but these produce the 937 // bitwidth. 938 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 939 .scalarize(0) 940 .clampScalar(0, S32, S32) 941 .clampScalar(1, S32, S64) 942 .widenScalarToNextPow2(0, 32) 943 .widenScalarToNextPow2(1, 32) 944 .lower(); 945 946 // The 64-bit versions produce 32-bit results, but only on the SALU. 947 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 948 .legalFor({{S32, S32}, {S32, S64}}) 949 .clampScalar(0, S32, S32) 950 .clampScalar(1, S32, S64) 951 .scalarize(0) 952 .widenScalarToNextPow2(0, 32) 953 .widenScalarToNextPow2(1, 32); 954 955 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 956 // RegBankSelect. 957 getActionDefinitionsBuilder(G_BITREVERSE) 958 .legalFor({S32, S64}) 959 .clampScalar(0, S32, S64) 960 .scalarize(0) 961 .widenScalarToNextPow2(0); 962 963 if (ST.has16BitInsts()) { 964 getActionDefinitionsBuilder(G_BSWAP) 965 .legalFor({S16, S32, V2S16}) 966 .clampMaxNumElements(0, S16, 2) 967 // FIXME: Fixing non-power-of-2 before clamp is workaround for 968 // narrowScalar limitation. 
969 .widenScalarToNextPow2(0) 970 .clampScalar(0, S16, S32) 971 .scalarize(0); 972 973 if (ST.hasVOP3PInsts()) { 974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 975 .legalFor({S32, S16, V2S16}) 976 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 977 .clampMaxNumElements(0, S16, 2) 978 .minScalar(0, S16) 979 .widenScalarToNextPow2(0) 980 .scalarize(0) 981 .lower(); 982 } else { 983 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 984 .legalFor({S32, S16}) 985 .widenScalarToNextPow2(0) 986 .minScalar(0, S16) 987 .scalarize(0) 988 .lower(); 989 } 990 } else { 991 // TODO: Should have same legality without v_perm_b32 992 getActionDefinitionsBuilder(G_BSWAP) 993 .legalFor({S32}) 994 .lowerIf(scalarNarrowerThan(0, 32)) 995 // FIXME: Fixing non-power-of-2 before clamp is workaround for 996 // narrowScalar limitation. 997 .widenScalarToNextPow2(0) 998 .maxScalar(0, S32) 999 .scalarize(0) 1000 .lower(); 1001 1002 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1003 .legalFor({S32}) 1004 .minScalar(0, S32) 1005 .widenScalarToNextPow2(0) 1006 .scalarize(0) 1007 .lower(); 1008 } 1009 1010 getActionDefinitionsBuilder(G_INTTOPTR) 1011 // List the common cases 1012 .legalForCartesianProduct(AddrSpaces64, {S64}) 1013 .legalForCartesianProduct(AddrSpaces32, {S32}) 1014 .scalarize(0) 1015 // Accept any address space as long as the size matches 1016 .legalIf(sameSize(0, 1)) 1017 .widenScalarIf(smallerThan(1, 0), 1018 [](const LegalityQuery &Query) { 1019 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1020 }) 1021 .narrowScalarIf(largerThan(1, 0), 1022 [](const LegalityQuery &Query) { 1023 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1024 }); 1025 1026 getActionDefinitionsBuilder(G_PTRTOINT) 1027 // List the common cases 1028 .legalForCartesianProduct(AddrSpaces64, {S64}) 1029 .legalForCartesianProduct(AddrSpaces32, {S32}) 1030 .scalarize(0) 1031 // Accept any address space as long as the size matches 1032 .legalIf(sameSize(0, 1)) 1033 .widenScalarIf(smallerThan(0, 1), 1034 [](const LegalityQuery &Query) { 1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1036 }) 1037 .narrowScalarIf( 1038 largerThan(0, 1), 1039 [](const LegalityQuery &Query) { 1040 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1041 }); 1042 1043 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1044 .scalarize(0) 1045 .custom(); 1046 1047 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1048 bool IsLoad) -> bool { 1049 const LLT DstTy = Query.Types[0]; 1050 1051 // Split vector extloads. 1052 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1053 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1054 1055 if (MemSize < DstTy.getSizeInBits()) 1056 MemSize = std::max(MemSize, AlignBits); 1057 1058 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1059 return true; 1060 1061 const LLT PtrTy = Query.Types[1]; 1062 unsigned AS = PtrTy.getAddressSpace(); 1063 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1064 return true; 1065 1066 // Catch weird sized loads that don't evenly divide into the access sizes 1067 // TODO: May be able to widen depending on alignment etc. 1068 unsigned NumRegs = (MemSize + 31) / 32; 1069 if (NumRegs == 3) { 1070 if (!ST.hasDwordx3LoadStores()) 1071 return true; 1072 } else { 1073 // If the alignment allows, these should have been widened. 
1074 if (!isPowerOf2_32(NumRegs)) 1075 return true; 1076 } 1077 1078 if (AlignBits < MemSize) { 1079 const SITargetLowering *TLI = ST.getTargetLowering(); 1080 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1081 Align(AlignBits / 8)); 1082 } 1083 1084 return false; 1085 }; 1086 1087 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1088 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1089 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1090 1091 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1092 // LDS 1093 // TODO: Unsupported flat for SI. 1094 1095 for (unsigned Op : {G_LOAD, G_STORE}) { 1096 const bool IsStore = Op == G_STORE; 1097 1098 auto &Actions = getActionDefinitionsBuilder(Op); 1099 // Explicitly list some common cases. 1100 // TODO: Does this help compile time at all? 1101 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1102 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1103 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1104 {S64, GlobalPtr, S64, GlobalAlign32}, 1105 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1106 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1107 {S32, GlobalPtr, S8, GlobalAlign8}, 1108 {S32, GlobalPtr, S16, GlobalAlign16}, 1109 1110 {S32, LocalPtr, S32, 32}, 1111 {S64, LocalPtr, S64, 32}, 1112 {V2S32, LocalPtr, V2S32, 32}, 1113 {S32, LocalPtr, S8, 8}, 1114 {S32, LocalPtr, S16, 16}, 1115 {V2S16, LocalPtr, S32, 32}, 1116 1117 {S32, PrivatePtr, S32, 32}, 1118 {S32, PrivatePtr, S8, 8}, 1119 {S32, PrivatePtr, S16, 16}, 1120 {V2S16, PrivatePtr, S32, 32}, 1121 1122 {S32, ConstantPtr, S32, GlobalAlign32}, 1123 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1124 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1125 {S64, ConstantPtr, S64, GlobalAlign32}, 1126 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1127 Actions.legalIf( 1128 [=](const LegalityQuery &Query) -> bool { 1129 return isLoadStoreLegal(ST, Query); 1130 }); 1131 1132 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1133 // 64-bits. 1134 // 1135 // TODO: Should generalize bitcast action into coerce, which will also cover 1136 // inserting addrspacecasts. 1137 Actions.customIf(typeIs(1, Constant32Ptr)); 1138 1139 // Turn any illegal element vectors into something easier to deal 1140 // with. These will ultimately produce 32-bit scalar shifts to extract the 1141 // parts anyway. 1142 // 1143 // For odd 16-bit element vectors, prefer to split those into pieces with 1144 // 16-bit vector parts. 1145 Actions.bitcastIf( 1146 [=](const LegalityQuery &Query) -> bool { 1147 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1148 Query.MMODescrs[0].MemoryTy); 1149 }, bitcastToRegisterType(0)); 1150 1151 if (!IsStore) { 1152 // Widen suitably aligned loads by loading extra bytes. The standard 1153 // legalization actions can't properly express widening memory operands. 
1154 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1155 return shouldWidenLoad(ST, Query, G_LOAD); 1156 }); 1157 } 1158 1159 // FIXME: load/store narrowing should be moved to lower action 1160 Actions 1161 .narrowScalarIf( 1162 [=](const LegalityQuery &Query) -> bool { 1163 return !Query.Types[0].isVector() && 1164 needToSplitMemOp(Query, Op == G_LOAD); 1165 }, 1166 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1167 const LLT DstTy = Query.Types[0]; 1168 const LLT PtrTy = Query.Types[1]; 1169 1170 const unsigned DstSize = DstTy.getSizeInBits(); 1171 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1172 1173 // Split extloads. 1174 if (DstSize > MemSize) 1175 return std::make_pair(0, LLT::scalar(MemSize)); 1176 1177 if (!isPowerOf2_32(DstSize)) { 1178 // We're probably decomposing an odd sized store. Try to split 1179 // to the widest type. TODO: Account for alignment. As-is it 1180 // should be OK, since the new parts will be further legalized. 1181 unsigned FloorSize = PowerOf2Floor(DstSize); 1182 return std::make_pair(0, LLT::scalar(FloorSize)); 1183 } 1184 1185 if (DstSize > 32 && (DstSize % 32 != 0)) { 1186 // FIXME: Need a way to specify non-extload of larger size if 1187 // suitably aligned. 1188 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1189 } 1190 1191 unsigned MaxSize = maxSizeForAddrSpace(ST, 1192 PtrTy.getAddressSpace(), 1193 Op == G_LOAD); 1194 if (MemSize > MaxSize) 1195 return std::make_pair(0, LLT::scalar(MaxSize)); 1196 1197 unsigned Align = Query.MMODescrs[0].AlignInBits; 1198 return std::make_pair(0, LLT::scalar(Align)); 1199 }) 1200 .fewerElementsIf( 1201 [=](const LegalityQuery &Query) -> bool { 1202 return Query.Types[0].isVector() && 1203 needToSplitMemOp(Query, Op == G_LOAD); 1204 }, 1205 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1206 const LLT DstTy = Query.Types[0]; 1207 const LLT PtrTy = Query.Types[1]; 1208 1209 LLT EltTy = DstTy.getElementType(); 1210 unsigned MaxSize = maxSizeForAddrSpace(ST, 1211 PtrTy.getAddressSpace(), 1212 Op == G_LOAD); 1213 1214 // FIXME: Handle widened to power of 2 results better. This ends 1215 // up scalarizing. 1216 // FIXME: 3 element stores scalarized on SI 1217 1218 // Split if it's too large for the address space. 1219 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1220 if (MemSize > MaxSize) { 1221 unsigned NumElts = DstTy.getNumElements(); 1222 unsigned EltSize = EltTy.getSizeInBits(); 1223 1224 if (MaxSize % EltSize == 0) { 1225 return std::make_pair( 1226 0, LLT::scalarOrVector( 1227 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1228 } 1229 1230 unsigned NumPieces = MemSize / MaxSize; 1231 1232 // FIXME: Refine when odd breakdowns handled 1233 // The scalars will need to be re-legalized. 1234 if (NumPieces == 1 || NumPieces >= NumElts || 1235 NumElts % NumPieces != 0) 1236 return std::make_pair(0, EltTy); 1237 1238 return std::make_pair( 1239 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1240 } 1241 1242 // FIXME: We could probably handle weird extending loads better. 1243 if (DstTy.getSizeInBits() > MemSize) 1244 return std::make_pair(0, EltTy); 1245 1246 unsigned EltSize = EltTy.getSizeInBits(); 1247 unsigned DstSize = DstTy.getSizeInBits(); 1248 if (!isPowerOf2_32(DstSize)) { 1249 // We're probably decomposing an odd sized store. Try to split 1250 // to the widest type. TODO: Account for alignment. As-is it 1251 // should be OK, since the new parts will be further legalized. 
1252 unsigned FloorSize = PowerOf2Floor(DstSize); 1253 return std::make_pair( 1254 0, LLT::scalarOrVector( 1255 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1256 } 1257 1258 // Need to split because of alignment. 1259 unsigned Align = Query.MMODescrs[0].AlignInBits; 1260 if (EltSize > Align && 1261 (EltSize / Align < DstTy.getNumElements())) { 1262 return std::make_pair( 1263 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1264 } 1265 1266 // May need relegalization for the scalars. 1267 return std::make_pair(0, EltTy); 1268 }) 1269 .lowerIfMemSizeNotPow2() 1270 .minScalar(0, S32) 1271 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1272 .widenScalarToNextPow2(0) 1273 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1274 .lower(); 1275 } 1276 1277 // FIXME: Unaligned accesses not lowered. 1278 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1279 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1280 {S32, GlobalPtr, S16, 2 * 8}, 1281 {S32, LocalPtr, S8, 8}, 1282 {S32, LocalPtr, S16, 16}, 1283 {S32, PrivatePtr, S8, 8}, 1284 {S32, PrivatePtr, S16, 16}, 1285 {S32, ConstantPtr, S8, 8}, 1286 {S32, ConstantPtr, S16, 2 * 8}}) 1287 .legalIf( 1288 [=](const LegalityQuery &Query) -> bool { 1289 return isLoadStoreLegal(ST, Query); 1290 }); 1291 1292 if (ST.hasFlatAddressSpace()) { 1293 ExtLoads.legalForTypesWithMemDesc( 1294 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1295 } 1296 1297 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1298 // 64-bits. 1299 // 1300 // TODO: Should generalize bitcast action into coerce, which will also cover 1301 // inserting addrspacecasts. 1302 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1303 1304 ExtLoads.clampScalar(0, S32, S32) 1305 .widenScalarToNextPow2(0) 1306 .lower(); 1307 1308 auto &Atomics = getActionDefinitionsBuilder( 1309 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1310 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1311 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1312 G_ATOMICRMW_UMIN}) 1313 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1314 {S64, GlobalPtr}, {S64, LocalPtr}, 1315 {S32, RegionPtr}, {S64, RegionPtr}}); 1316 if (ST.hasFlatAddressSpace()) { 1317 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1318 } 1319 1320 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1321 if (ST.hasLDSFPAtomics()) { 1322 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1323 if (ST.hasGFX90AInsts()) 1324 Atomic.legalFor({{S64, LocalPtr}}); 1325 } 1326 if (ST.hasAtomicFaddInsts()) 1327 Atomic.legalFor({{S32, GlobalPtr}}); 1328 1329 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1330 // demarshalling 1331 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1332 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1333 {S32, FlatPtr}, {S64, FlatPtr}}) 1334 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1335 {S32, RegionPtr}, {S64, RegionPtr}}); 1336 // TODO: Pointer types, any 32-bit or 64-bit vector 1337 1338 // Condition should be s32 for scalar, s1 for vector. 
1339 getActionDefinitionsBuilder(G_SELECT) 1340 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1341 LocalPtr, FlatPtr, PrivatePtr, 1342 LLT::fixed_vector(2, LocalPtr), 1343 LLT::fixed_vector(2, PrivatePtr)}, 1344 {S1, S32}) 1345 .clampScalar(0, S16, S64) 1346 .scalarize(1) 1347 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1348 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1349 .clampMaxNumElements(0, S32, 2) 1350 .clampMaxNumElements(0, LocalPtr, 2) 1351 .clampMaxNumElements(0, PrivatePtr, 2) 1352 .scalarize(0) 1353 .widenScalarToNextPow2(0) 1354 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1355 1356 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1357 // be more flexible with the shift amount type. 1358 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1359 .legalFor({{S32, S32}, {S64, S32}}); 1360 if (ST.has16BitInsts()) { 1361 if (ST.hasVOP3PInsts()) { 1362 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1363 .clampMaxNumElements(0, S16, 2); 1364 } else 1365 Shifts.legalFor({{S16, S16}}); 1366 1367 // TODO: Support 16-bit shift amounts for all types 1368 Shifts.widenScalarIf( 1369 [=](const LegalityQuery &Query) { 1370 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1371 // 32-bit amount. 1372 const LLT ValTy = Query.Types[0]; 1373 const LLT AmountTy = Query.Types[1]; 1374 return ValTy.getSizeInBits() <= 16 && 1375 AmountTy.getSizeInBits() < 16; 1376 }, changeTo(1, S16)); 1377 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1378 Shifts.clampScalar(1, S32, S32); 1379 Shifts.clampScalar(0, S16, S64); 1380 Shifts.widenScalarToNextPow2(0, 16); 1381 1382 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1383 .minScalar(0, S16) 1384 .scalarize(0) 1385 .lower(); 1386 } else { 1387 // Make sure we legalize the shift amount type first, as the general 1388 // expansion for the shifted type will produce much worse code if it hasn't 1389 // been truncated already. 1390 Shifts.clampScalar(1, S32, S32); 1391 Shifts.clampScalar(0, S32, S64); 1392 Shifts.widenScalarToNextPow2(0, 32); 1393 1394 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1395 .minScalar(0, S32) 1396 .scalarize(0) 1397 .lower(); 1398 } 1399 Shifts.scalarize(0); 1400 1401 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1402 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1403 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1404 unsigned IdxTypeIdx = 2; 1405 1406 getActionDefinitionsBuilder(Op) 1407 .customIf([=](const LegalityQuery &Query) { 1408 const LLT EltTy = Query.Types[EltTypeIdx]; 1409 const LLT VecTy = Query.Types[VecTypeIdx]; 1410 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1411 const unsigned EltSize = EltTy.getSizeInBits(); 1412 return (EltSize == 32 || EltSize == 64) && 1413 VecTy.getSizeInBits() % 32 == 0 && 1414 VecTy.getSizeInBits() <= MaxRegisterSize && 1415 IdxTy.getSizeInBits() == 32; 1416 }) 1417 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1418 bitcastToVectorElement32(VecTypeIdx)) 1419 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1420 .bitcastIf( 1421 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1422 [=](const LegalityQuery &Query) { 1423 // For > 64-bit element types, try to turn this into a 64-bit 1424 // element vector since we may be able to do better indexing 1425 // if this is scalar. If not, fall back to 32. 
1426 const LLT EltTy = Query.Types[EltTypeIdx]; 1427 const LLT VecTy = Query.Types[VecTypeIdx]; 1428 const unsigned DstEltSize = EltTy.getSizeInBits(); 1429 const unsigned VecSize = VecTy.getSizeInBits(); 1430 1431 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1432 return std::make_pair( 1433 VecTypeIdx, 1434 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1435 }) 1436 .clampScalar(EltTypeIdx, S32, S64) 1437 .clampScalar(VecTypeIdx, S32, S64) 1438 .clampScalar(IdxTypeIdx, S32, S32) 1439 .clampMaxNumElements(VecTypeIdx, S32, 32) 1440 // TODO: Clamp elements for 64-bit vectors? 1441 // It should only be necessary with variable indexes. 1442 // As a last resort, lower to the stack 1443 .lower(); 1444 } 1445 1446 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1447 .unsupportedIf([=](const LegalityQuery &Query) { 1448 const LLT &EltTy = Query.Types[1].getElementType(); 1449 return Query.Types[0] != EltTy; 1450 }); 1451 1452 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1453 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1454 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1455 1456 // FIXME: Doesn't handle extract of illegal sizes. 1457 getActionDefinitionsBuilder(Op) 1458 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1459 // FIXME: Multiples of 16 should not be legal. 1460 .legalIf([=](const LegalityQuery &Query) { 1461 const LLT BigTy = Query.Types[BigTyIdx]; 1462 const LLT LitTy = Query.Types[LitTyIdx]; 1463 return (BigTy.getSizeInBits() % 32 == 0) && 1464 (LitTy.getSizeInBits() % 16 == 0); 1465 }) 1466 .widenScalarIf( 1467 [=](const LegalityQuery &Query) { 1468 const LLT BigTy = Query.Types[BigTyIdx]; 1469 return (BigTy.getScalarSizeInBits() < 16); 1470 }, 1471 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1472 .widenScalarIf( 1473 [=](const LegalityQuery &Query) { 1474 const LLT LitTy = Query.Types[LitTyIdx]; 1475 return (LitTy.getScalarSizeInBits() < 16); 1476 }, 1477 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1478 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1479 .widenScalarToNextPow2(BigTyIdx, 32); 1480 1481 } 1482 1483 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1484 .legalForCartesianProduct(AllS32Vectors, {S32}) 1485 .legalForCartesianProduct(AllS64Vectors, {S64}) 1486 .clampNumElements(0, V16S32, V32S32) 1487 .clampNumElements(0, V2S64, V16S64) 1488 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1489 1490 if (ST.hasScalarPackInsts()) { 1491 BuildVector 1492 // FIXME: Should probably widen s1 vectors straight to s32 1493 .minScalarOrElt(0, S16) 1494 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1495 .minScalar(1, S32); 1496 1497 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1498 .legalFor({V2S16, S32}) 1499 .lower(); 1500 BuildVector.minScalarOrElt(0, S32); 1501 } else { 1502 BuildVector.customFor({V2S16, S16}); 1503 BuildVector.minScalarOrElt(0, S32); 1504 1505 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1506 .customFor({V2S16, S32}) 1507 .lower(); 1508 } 1509 1510 BuildVector.legalIf(isRegisterType(0)); 1511 1512 // FIXME: Clamp maximum size 1513 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1514 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1515 .clampMaxNumElements(0, S32, 32) 1516 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1517 .clampMaxNumElements(0, S16, 64); 1518 1519 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1520 // pre-legalize. 
1521 if (ST.hasVOP3PInsts()) { 1522 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1523 .customFor({V2S16, V2S16}) 1524 .lower(); 1525 } else 1526 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1527 1528 // Merge/Unmerge 1529 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1530 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1531 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1532 1533 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1534 const LLT Ty = Query.Types[TypeIdx]; 1535 if (Ty.isVector()) { 1536 const LLT &EltTy = Ty.getElementType(); 1537 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1538 return true; 1539 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1540 return true; 1541 } 1542 return false; 1543 }; 1544 1545 auto &Builder = getActionDefinitionsBuilder(Op) 1546 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1547 .lowerFor({{S16, V2S16}}) 1548 .lowerIf([=](const LegalityQuery &Query) { 1549 const LLT BigTy = Query.Types[BigTyIdx]; 1550 return BigTy.getSizeInBits() == 32; 1551 }) 1552 // Try to widen to s16 first for small types. 1553 // TODO: Only do this on targets with legal s16 shifts 1554 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1555 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1556 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1557 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1558 elementTypeIs(1, S16)), 1559 changeTo(1, V2S16)) 1560 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1561 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1562 // valid. 1563 .clampScalar(LitTyIdx, S32, S512) 1564 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1565 // Break up vectors with weird elements into scalars 1566 .fewerElementsIf( 1567 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1568 scalarize(0)) 1569 .fewerElementsIf( 1570 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1571 scalarize(1)) 1572 .clampScalar(BigTyIdx, S32, MaxScalar); 1573 1574 if (Op == G_MERGE_VALUES) { 1575 Builder.widenScalarIf( 1576 // TODO: Use 16-bit shifts if legal for 8-bit values? 1577 [=](const LegalityQuery &Query) { 1578 const LLT Ty = Query.Types[LitTyIdx]; 1579 return Ty.getSizeInBits() < 32; 1580 }, 1581 changeTo(LitTyIdx, S32)); 1582 } 1583 1584 Builder.widenScalarIf( 1585 [=](const LegalityQuery &Query) { 1586 const LLT Ty = Query.Types[BigTyIdx]; 1587 return !isPowerOf2_32(Ty.getSizeInBits()) && 1588 Ty.getSizeInBits() % 16 != 0; 1589 }, 1590 [=](const LegalityQuery &Query) { 1591 // Pick the next power of 2, or a multiple of 64 over 128. 1592 // Whichever is smaller. 1593 const LLT &Ty = Query.Types[BigTyIdx]; 1594 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1595 if (NewSizeInBits >= 256) { 1596 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1597 if (RoundedTo < NewSizeInBits) 1598 NewSizeInBits = RoundedTo; 1599 } 1600 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1601 }) 1602 // Any vectors left are the wrong size. Scalarize them. 1603 .scalarize(0) 1604 .scalarize(1); 1605 } 1606 1607 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1608 // RegBankSelect. 
1609 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1610 .legalFor({{S32}, {S64}}); 1611 1612 if (ST.hasVOP3PInsts()) { 1613 SextInReg.lowerFor({{V2S16}}) 1614 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1615 // get more vector shift opportunities, since we'll get those when 1616 // expanded. 1617 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1618 } else if (ST.has16BitInsts()) { 1619 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1620 } else { 1621 // Prefer to promote to s32 before lowering if we don't have 16-bit 1622 // shifts. This avoid a lot of intermediate truncate and extend operations. 1623 SextInReg.lowerFor({{S32}, {S64}}); 1624 } 1625 1626 SextInReg 1627 .scalarize(0) 1628 .clampScalar(0, S32, S64) 1629 .lower(); 1630 1631 // TODO: Only Try to form v2s16 with legal packed instructions. 1632 getActionDefinitionsBuilder(G_FSHR) 1633 .legalFor({{S32, S32}}) 1634 .lowerFor({{V2S16, V2S16}}) 1635 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1636 .scalarize(0) 1637 .lower(); 1638 1639 if (ST.hasVOP3PInsts()) { 1640 getActionDefinitionsBuilder(G_FSHL) 1641 .lowerFor({{V2S16, V2S16}}) 1642 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1643 .scalarize(0) 1644 .lower(); 1645 } else { 1646 getActionDefinitionsBuilder(G_FSHL) 1647 .scalarize(0) 1648 .lower(); 1649 } 1650 1651 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1652 .legalFor({S64}); 1653 1654 getActionDefinitionsBuilder(G_FENCE) 1655 .alwaysLegal(); 1656 1657 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1658 .scalarize(0) 1659 .minScalar(0, S32) 1660 .lower(); 1661 1662 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1663 .legalFor({{S32, S32}, {S64, S32}}) 1664 .clampScalar(1, S32, S32) 1665 .clampScalar(0, S32, S64) 1666 .widenScalarToNextPow2(0) 1667 .scalarize(0); 1668 1669 getActionDefinitionsBuilder({ 1670 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1671 G_FCOPYSIGN, 1672 1673 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1674 G_ATOMICRMW_NAND, 1675 G_ATOMICRMW_FSUB, 1676 G_READ_REGISTER, 1677 G_WRITE_REGISTER, 1678 1679 G_SADDO, G_SSUBO, 1680 1681 // TODO: Implement 1682 G_FMINIMUM, G_FMAXIMUM}).lower(); 1683 1684 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1685 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1686 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1687 .unsupported(); 1688 1689 getLegacyLegalizerInfo().computeTables(); 1690 verify(*ST.getInstrInfo()); 1691 } 1692 1693 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1694 MachineInstr &MI) const { 1695 MachineIRBuilder &B = Helper.MIRBuilder; 1696 MachineRegisterInfo &MRI = *B.getMRI(); 1697 1698 switch (MI.getOpcode()) { 1699 case TargetOpcode::G_ADDRSPACE_CAST: 1700 return legalizeAddrSpaceCast(MI, MRI, B); 1701 case TargetOpcode::G_FRINT: 1702 return legalizeFrint(MI, MRI, B); 1703 case TargetOpcode::G_FCEIL: 1704 return legalizeFceil(MI, MRI, B); 1705 case TargetOpcode::G_FREM: 1706 return legalizeFrem(MI, MRI, B); 1707 case TargetOpcode::G_INTRINSIC_TRUNC: 1708 return legalizeIntrinsicTrunc(MI, MRI, B); 1709 case TargetOpcode::G_SITOFP: 1710 return legalizeITOFP(MI, MRI, B, true); 1711 case TargetOpcode::G_UITOFP: 1712 return legalizeITOFP(MI, MRI, B, false); 1713 case TargetOpcode::G_FPTOSI: 1714 return legalizeFPTOI(MI, MRI, B, true); 1715 case TargetOpcode::G_FPTOUI: 1716 return legalizeFPTOI(MI, MRI, B, false); 1717 case TargetOpcode::G_FMINNUM: 1718 case TargetOpcode::G_FMAXNUM: 1719 case TargetOpcode::G_FMINNUM_IEEE: 1720 case 
TargetOpcode::G_FMAXNUM_IEEE: 1721 return legalizeMinNumMaxNum(Helper, MI); 1722 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1723 return legalizeExtractVectorElt(MI, MRI, B); 1724 case TargetOpcode::G_INSERT_VECTOR_ELT: 1725 return legalizeInsertVectorElt(MI, MRI, B); 1726 case TargetOpcode::G_SHUFFLE_VECTOR: 1727 return legalizeShuffleVector(MI, MRI, B); 1728 case TargetOpcode::G_FSIN: 1729 case TargetOpcode::G_FCOS: 1730 return legalizeSinCos(MI, MRI, B); 1731 case TargetOpcode::G_GLOBAL_VALUE: 1732 return legalizeGlobalValue(MI, MRI, B); 1733 case TargetOpcode::G_LOAD: 1734 case TargetOpcode::G_SEXTLOAD: 1735 case TargetOpcode::G_ZEXTLOAD: 1736 return legalizeLoad(Helper, MI); 1737 case TargetOpcode::G_FMAD: 1738 return legalizeFMad(MI, MRI, B); 1739 case TargetOpcode::G_FDIV: 1740 return legalizeFDIV(MI, MRI, B); 1741 case TargetOpcode::G_UDIV: 1742 case TargetOpcode::G_UREM: 1743 case TargetOpcode::G_UDIVREM: 1744 return legalizeUnsignedDIV_REM(MI, MRI, B); 1745 case TargetOpcode::G_SDIV: 1746 case TargetOpcode::G_SREM: 1747 case TargetOpcode::G_SDIVREM: 1748 return legalizeSignedDIV_REM(MI, MRI, B); 1749 case TargetOpcode::G_ATOMIC_CMPXCHG: 1750 return legalizeAtomicCmpXChg(MI, MRI, B); 1751 case TargetOpcode::G_FLOG: 1752 return legalizeFlog(MI, B, numbers::ln2f); 1753 case TargetOpcode::G_FLOG10: 1754 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1755 case TargetOpcode::G_FEXP: 1756 return legalizeFExp(MI, B); 1757 case TargetOpcode::G_FPOW: 1758 return legalizeFPow(MI, B); 1759 case TargetOpcode::G_FFLOOR: 1760 return legalizeFFloor(MI, MRI, B); 1761 case TargetOpcode::G_BUILD_VECTOR: 1762 return legalizeBuildVector(MI, MRI, B); 1763 default: 1764 return false; 1765 } 1766 1767 llvm_unreachable("expected switch to return"); 1768 } 1769 1770 Register AMDGPULegalizerInfo::getSegmentAperture( 1771 unsigned AS, 1772 MachineRegisterInfo &MRI, 1773 MachineIRBuilder &B) const { 1774 MachineFunction &MF = B.getMF(); 1775 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1776 const LLT S32 = LLT::scalar(32); 1777 1778 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1779 1780 if (ST.hasApertureRegs()) { 1781 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1782 // getreg. 1783 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1784 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1785 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1786 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1787 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1788 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1789 unsigned Encoding = 1790 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1791 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1792 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1793 1794 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1795 1796 B.buildInstr(AMDGPU::S_GETREG_B32) 1797 .addDef(GetReg) 1798 .addImm(Encoding); 1799 MRI.setType(GetReg, S32); 1800 1801 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1802 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1803 } 1804 1805 Register QueuePtr = MRI.createGenericVirtualRegister( 1806 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1807 1808 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1809 return Register(); 1810 1811 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1812 // private_segment_aperture_base_hi. 1813 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 
0x40 : 0x44; 1814 1815 // TODO: can we be smarter about machine pointer info? 1816 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1817 MachineMemOperand *MMO = MF.getMachineMemOperand( 1818 PtrInfo, 1819 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1820 MachineMemOperand::MOInvariant, 1821 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1822 1823 Register LoadAddr; 1824 1825 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1826 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1827 } 1828 1829 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1830 MachineInstr &MI, MachineRegisterInfo &MRI, 1831 MachineIRBuilder &B) const { 1832 MachineFunction &MF = B.getMF(); 1833 1834 const LLT S32 = LLT::scalar(32); 1835 Register Dst = MI.getOperand(0).getReg(); 1836 Register Src = MI.getOperand(1).getReg(); 1837 1838 LLT DstTy = MRI.getType(Dst); 1839 LLT SrcTy = MRI.getType(Src); 1840 unsigned DestAS = DstTy.getAddressSpace(); 1841 unsigned SrcAS = SrcTy.getAddressSpace(); 1842 1843 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1844 // vector element. 1845 assert(!DstTy.isVector()); 1846 1847 const AMDGPUTargetMachine &TM 1848 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1849 1850 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1851 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1852 return true; 1853 } 1854 1855 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1856 // Truncate. 1857 B.buildExtract(Dst, Src, 0); 1858 MI.eraseFromParent(); 1859 return true; 1860 } 1861 1862 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1863 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1864 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1865 1866 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1867 // another. Merge operands are required to be the same type, but creating an 1868 // extra ptrtoint would be kind of pointless. 1869 auto HighAddr = B.buildConstant( 1870 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1871 B.buildMerge(Dst, {Src, HighAddr}); 1872 MI.eraseFromParent(); 1873 return true; 1874 } 1875 1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1877 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1878 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1879 unsigned NullVal = TM.getNullPointerValue(DestAS); 1880 1881 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1882 auto FlatNull = B.buildConstant(SrcTy, 0); 1883 1884 // Extract low 32-bits of the pointer. 1885 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1886 1887 auto CmpRes = 1888 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1889 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1890 1891 MI.eraseFromParent(); 1892 return true; 1893 } 1894 1895 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1896 return false; 1897 1898 if (!ST.hasFlatAddressSpace()) 1899 return false; 1900 1901 auto SegmentNull = 1902 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1903 auto FlatNull = 1904 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1905 1906 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1907 if (!ApertureReg.isValid()) 1908 return false; 1909 1910 auto CmpRes = 1911 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1912 1913 // Coerce the type of the low half of the result so we can use merge_values. 
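  // The full sequence built for a local/private -> flat cast is, roughly:
  //   %lo:_(s32)  = G_PTRTOINT %src
  //   %hi:_(s32)  = <aperture base, from getSegmentAperture>
  //   %flat:_(p0) = G_MERGE_VALUES %lo, %hi
  //   %dst:_(p0)  = G_SELECT (%src != segment-null), %flat, flat-null
  // (register names are illustrative only).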
1914 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1915 1916 // TODO: Should we allow mismatched types but matching sizes in merges to 1917 // avoid the ptrtoint? 1918 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1919 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1920 1921 MI.eraseFromParent(); 1922 return true; 1923 } 1924 1925 bool AMDGPULegalizerInfo::legalizeFrint( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 MachineIRBuilder &B) const { 1928 Register Src = MI.getOperand(1).getReg(); 1929 LLT Ty = MRI.getType(Src); 1930 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1931 1932 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1933 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1934 1935 auto C1 = B.buildFConstant(Ty, C1Val); 1936 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1937 1938 // TODO: Should this propagate fast-math-flags? 1939 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1940 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1941 1942 auto C2 = B.buildFConstant(Ty, C2Val); 1943 auto Fabs = B.buildFAbs(Ty, Src); 1944 1945 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1946 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1947 MI.eraseFromParent(); 1948 return true; 1949 } 1950 1951 bool AMDGPULegalizerInfo::legalizeFceil( 1952 MachineInstr &MI, MachineRegisterInfo &MRI, 1953 MachineIRBuilder &B) const { 1954 1955 const LLT S1 = LLT::scalar(1); 1956 const LLT S64 = LLT::scalar(64); 1957 1958 Register Src = MI.getOperand(1).getReg(); 1959 assert(MRI.getType(Src) == S64); 1960 1961 // result = trunc(src) 1962 // if (src > 0.0 && src != result) 1963 // result += 1.0 1964 1965 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1966 1967 const auto Zero = B.buildFConstant(S64, 0.0); 1968 const auto One = B.buildFConstant(S64, 1.0); 1969 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1970 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1971 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1972 auto Add = B.buildSelect(S64, And, One, Zero); 1973 1974 // TODO: Should this propagate fast-math-flags? 
1975 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1976 return true; 1977 } 1978 1979 bool AMDGPULegalizerInfo::legalizeFrem( 1980 MachineInstr &MI, MachineRegisterInfo &MRI, 1981 MachineIRBuilder &B) const { 1982 Register DstReg = MI.getOperand(0).getReg(); 1983 Register Src0Reg = MI.getOperand(1).getReg(); 1984 Register Src1Reg = MI.getOperand(2).getReg(); 1985 auto Flags = MI.getFlags(); 1986 LLT Ty = MRI.getType(DstReg); 1987 1988 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1989 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1990 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1991 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1992 MI.eraseFromParent(); 1993 return true; 1994 } 1995 1996 static MachineInstrBuilder extractF64Exponent(Register Hi, 1997 MachineIRBuilder &B) { 1998 const unsigned FractBits = 52; 1999 const unsigned ExpBits = 11; 2000 LLT S32 = LLT::scalar(32); 2001 2002 auto Const0 = B.buildConstant(S32, FractBits - 32); 2003 auto Const1 = B.buildConstant(S32, ExpBits); 2004 2005 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2006 .addUse(Hi) 2007 .addUse(Const0.getReg(0)) 2008 .addUse(Const1.getReg(0)); 2009 2010 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, 2015 MachineIRBuilder &B) const { 2016 const LLT S1 = LLT::scalar(1); 2017 const LLT S32 = LLT::scalar(32); 2018 const LLT S64 = LLT::scalar(64); 2019 2020 Register Src = MI.getOperand(1).getReg(); 2021 assert(MRI.getType(Src) == S64); 2022 2023 // TODO: Should this use extract since the low half is unused? 2024 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2025 Register Hi = Unmerge.getReg(1); 2026 2027 // Extract the upper half, since this is where we will find the sign and 2028 // exponent. 2029 auto Exp = extractF64Exponent(Hi, B); 2030 2031 const unsigned FractBits = 52; 2032 2033 // Extract the sign bit. 2034 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2035 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2036 2037 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2038 2039 const auto Zero32 = B.buildConstant(S32, 0); 2040 2041 // Extend back to 64-bits. 2042 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2043 2044 auto Shr = B.buildAShr(S64, FractMask, Exp); 2045 auto Not = B.buildNot(S64, Shr); 2046 auto Tmp0 = B.buildAnd(S64, Src, Not); 2047 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2048 2049 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2050 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2051 2052 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2053 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2054 MI.eraseFromParent(); 2055 return true; 2056 } 2057 2058 bool AMDGPULegalizerInfo::legalizeITOFP( 2059 MachineInstr &MI, MachineRegisterInfo &MRI, 2060 MachineIRBuilder &B, bool Signed) const { 2061 2062 Register Dst = MI.getOperand(0).getReg(); 2063 Register Src = MI.getOperand(1).getReg(); 2064 2065 const LLT S64 = LLT::scalar(64); 2066 const LLT S32 = LLT::scalar(32); 2067 2068 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2069 2070 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2071 2072 auto CvtHi = Signed ? 
2073 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2074 B.buildUITOFP(S64, Unmerge.getReg(1)); 2075 2076 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2077 2078 auto ThirtyTwo = B.buildConstant(S32, 32); 2079 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2080 .addUse(CvtHi.getReg(0)) 2081 .addUse(ThirtyTwo.getReg(0)); 2082 2083 // TODO: Should this propagate fast-math-flags? 2084 B.buildFAdd(Dst, LdExp, CvtLo); 2085 MI.eraseFromParent(); 2086 return true; 2087 } 2088 2089 // TODO: Copied from DAG implementation. Verify logic and document how this 2090 // actually works. 2091 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2092 MachineRegisterInfo &MRI, 2093 MachineIRBuilder &B, 2094 bool Signed) const { 2095 2096 Register Dst = MI.getOperand(0).getReg(); 2097 Register Src = MI.getOperand(1).getReg(); 2098 2099 const LLT S64 = LLT::scalar(64); 2100 const LLT S32 = LLT::scalar(32); 2101 2102 const LLT SrcLT = MRI.getType(Src); 2103 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2104 2105 unsigned Flags = MI.getFlags(); 2106 2107 // The basic idea of converting a floating point number into a pair of 32-bit 2108 // integers is illustrated as follows: 2109 // 2110 // tf := trunc(val); 2111 // hif := floor(tf * 2^-32); 2112 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2113 // hi := fptoi(hif); 2114 // lo := fptoi(lof); 2115 // 2116 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2117 MachineInstrBuilder Sign; 2118 if (Signed && SrcLT == S32) { 2119 // However, a 32-bit floating point number has only 23 bits mantissa and 2120 // it's not enough to hold all the significant bits of `lof` if val is 2121 // negative. To avoid the loss of precision, We need to take the absolute 2122 // value after truncating and flip the result back based on the original 2123 // signedness. 2124 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2125 Trunc = B.buildFAbs(S32, Trunc, Flags); 2126 } 2127 MachineInstrBuilder K0, K1; 2128 if (SrcLT == S64) { 2129 K0 = B.buildFConstant(S64, 2130 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2131 K1 = B.buildFConstant(S64, 2132 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2133 } else { 2134 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2135 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2136 } 2137 2138 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2139 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2140 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2141 2142 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2143 : B.buildFPTOUI(S32, FloorMul); 2144 auto Lo = B.buildFPTOUI(S32, Fma); 2145 2146 if (Signed && SrcLT == S32) { 2147 // Flip the result based on the signedness, which is either all 0s or 1s. 
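    // That is, with Sign being either 0 or -1 (all ones), (v ^ Sign) - Sign
    // is v when Sign == 0 and -v when Sign == -1; e.g. v = 5, Sign = -1 gives
    // (5 ^ -1) - (-1) = -6 + 1 = -5.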
2148 Sign = B.buildMerge(S64, {Sign, Sign}); 2149 // r := xor({lo, hi}, sign) - sign; 2150 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2151 } else 2152 B.buildMerge(Dst, {Lo, Hi}); 2153 MI.eraseFromParent(); 2154 2155 return true; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2159 MachineInstr &MI) const { 2160 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2161 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2162 2163 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2164 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2165 2166 // With ieee_mode disabled, the instructions have the correct behavior 2167 // already for G_FMINNUM/G_FMAXNUM 2168 if (!MFI->getMode().IEEE) 2169 return !IsIEEEOp; 2170 2171 if (IsIEEEOp) 2172 return true; 2173 2174 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2175 } 2176 2177 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2178 MachineInstr &MI, MachineRegisterInfo &MRI, 2179 MachineIRBuilder &B) const { 2180 // TODO: Should move some of this into LegalizerHelper. 2181 2182 // TODO: Promote dynamic indexing of s16 to s32 2183 2184 // FIXME: Artifact combiner probably should have replaced the truncated 2185 // constant before this, so we shouldn't need 2186 // getConstantVRegValWithLookThrough. 2187 Optional<ValueAndVReg> MaybeIdxVal = 2188 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2189 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2190 return true; 2191 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2192 2193 Register Dst = MI.getOperand(0).getReg(); 2194 Register Vec = MI.getOperand(1).getReg(); 2195 2196 LLT VecTy = MRI.getType(Vec); 2197 LLT EltTy = VecTy.getElementType(); 2198 assert(EltTy == MRI.getType(Dst)); 2199 2200 if (IdxVal < VecTy.getNumElements()) 2201 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2202 else 2203 B.buildUndef(Dst); 2204 2205 MI.eraseFromParent(); 2206 return true; 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2210 MachineInstr &MI, MachineRegisterInfo &MRI, 2211 MachineIRBuilder &B) const { 2212 // TODO: Should move some of this into LegalizerHelper. 2213 2214 // TODO: Promote dynamic indexing of s16 to s32 2215 2216 // FIXME: Artifact combiner probably should have replaced the truncated 2217 // constant before this, so we shouldn't need 2218 // getConstantVRegValWithLookThrough. 2219 Optional<ValueAndVReg> MaybeIdxVal = 2220 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2221 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2222 return true; 2223 2224 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2225 Register Dst = MI.getOperand(0).getReg(); 2226 Register Vec = MI.getOperand(1).getReg(); 2227 Register Ins = MI.getOperand(2).getReg(); 2228 2229 LLT VecTy = MRI.getType(Vec); 2230 LLT EltTy = VecTy.getElementType(); 2231 assert(EltTy == MRI.getType(Ins)); 2232 2233 if (IdxVal < VecTy.getNumElements()) 2234 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2235 else 2236 B.buildUndef(Dst); 2237 2238 MI.eraseFromParent(); 2239 return true; 2240 } 2241 2242 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2243 MachineInstr &MI, MachineRegisterInfo &MRI, 2244 MachineIRBuilder &B) const { 2245 const LLT V2S16 = LLT::fixed_vector(2, 16); 2246 2247 Register Dst = MI.getOperand(0).getReg(); 2248 Register Src0 = MI.getOperand(1).getReg(); 2249 LLT DstTy = MRI.getType(Dst); 2250 LLT SrcTy = MRI.getType(Src0); 2251 2252 if (SrcTy == V2S16 && DstTy == V2S16 && 2253 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2254 return true; 2255 2256 MachineIRBuilder HelperBuilder(MI); 2257 GISelObserverWrapper DummyObserver; 2258 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2259 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2260 } 2261 2262 bool AMDGPULegalizerInfo::legalizeSinCos( 2263 MachineInstr &MI, MachineRegisterInfo &MRI, 2264 MachineIRBuilder &B) const { 2265 2266 Register DstReg = MI.getOperand(0).getReg(); 2267 Register SrcReg = MI.getOperand(1).getReg(); 2268 LLT Ty = MRI.getType(DstReg); 2269 unsigned Flags = MI.getFlags(); 2270 2271 Register TrigVal; 2272 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2273 if (ST.hasTrigReducedRange()) { 2274 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2275 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2276 .addUse(MulVal.getReg(0)) 2277 .setMIFlags(Flags).getReg(0); 2278 } else 2279 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2280 2281 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2282 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2283 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2284 .addUse(TrigVal) 2285 .setMIFlags(Flags); 2286 MI.eraseFromParent(); 2287 return true; 2288 } 2289 2290 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2291 MachineIRBuilder &B, 2292 const GlobalValue *GV, 2293 int64_t Offset, 2294 unsigned GAFlags) const { 2295 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2296 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2297 // to the following code sequence: 2298 // 2299 // For constant address space: 2300 // s_getpc_b64 s[0:1] 2301 // s_add_u32 s0, s0, $symbol 2302 // s_addc_u32 s1, s1, 0 2303 // 2304 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2305 // a fixup or relocation is emitted to replace $symbol with a literal 2306 // constant, which is a pc-relative offset from the encoding of the $symbol 2307 // operand to the global variable. 
2308 // 2309 // For global address space: 2310 // s_getpc_b64 s[0:1] 2311 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2312 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2313 // 2314 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2315 // fixups or relocations are emitted to replace $symbol@*@lo and 2316 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2317 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2318 // operand to the global variable. 2319 // 2320 // What we want here is an offset from the value returned by s_getpc 2321 // (which is the address of the s_add_u32 instruction) to the global 2322 // variable, but since the encoding of $symbol starts 4 bytes after the start 2323 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2324 // small. This requires us to add 4 to the global variable offset in order to 2325 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2326 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2327 // instruction. 2328 2329 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2330 2331 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2332 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2333 2334 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2335 .addDef(PCReg); 2336 2337 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2338 if (GAFlags == SIInstrInfo::MO_NONE) 2339 MIB.addImm(0); 2340 else 2341 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2342 2343 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2344 2345 if (PtrTy.getSizeInBits() == 32) 2346 B.buildExtract(DstReg, PCReg, 0); 2347 return true; 2348 } 2349 2350 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2351 MachineInstr &MI, MachineRegisterInfo &MRI, 2352 MachineIRBuilder &B) const { 2353 Register DstReg = MI.getOperand(0).getReg(); 2354 LLT Ty = MRI.getType(DstReg); 2355 unsigned AS = Ty.getAddressSpace(); 2356 2357 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2358 MachineFunction &MF = B.getMF(); 2359 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2360 2361 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2362 if (!MFI->isModuleEntryFunction() && 2363 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2364 const Function &Fn = MF.getFunction(); 2365 DiagnosticInfoUnsupported BadLDSDecl( 2366 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2367 DS_Warning); 2368 Fn.getContext().diagnose(BadLDSDecl); 2369 2370 // We currently don't have a way to correctly allocate LDS objects that 2371 // aren't directly associated with a kernel. We do force inlining of 2372 // functions that use local objects. However, if these dead functions are 2373 // not eliminated, we don't want a compile time error. Just emit a warning 2374 // and a trap, since there should be no callable path here. 2375 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2376 B.buildUndef(DstReg); 2377 MI.eraseFromParent(); 2378 return true; 2379 } 2380 2381 // TODO: We could emit code to handle the initialization somewhere. 
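    // In the common case an LDS global without a defined initializer is
    // lowered below to a plain p3 constant: the byte offset allocateLDSGlobal
    // assigns it within the kernel's LDS block (dynamic LDS and the
    // non-const-address case are handled separately first).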
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory whose size is not known at compile time. It will be
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2450 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2451 B.buildExtract(DstReg, Load, 0); 2452 } else 2453 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2454 2455 MI.eraseFromParent(); 2456 return true; 2457 } 2458 2459 static LLT widenToNextPowerOf2(LLT Ty) { 2460 if (Ty.isVector()) 2461 return Ty.changeElementCount( 2462 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2463 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2464 } 2465 2466 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2467 MachineInstr &MI) const { 2468 MachineIRBuilder &B = Helper.MIRBuilder; 2469 MachineRegisterInfo &MRI = *B.getMRI(); 2470 GISelChangeObserver &Observer = Helper.Observer; 2471 2472 Register PtrReg = MI.getOperand(1).getReg(); 2473 LLT PtrTy = MRI.getType(PtrReg); 2474 unsigned AddrSpace = PtrTy.getAddressSpace(); 2475 2476 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2477 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2478 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2479 Observer.changingInstr(MI); 2480 MI.getOperand(1).setReg(Cast.getReg(0)); 2481 Observer.changedInstr(MI); 2482 return true; 2483 } 2484 2485 if (MI.getOpcode() != AMDGPU::G_LOAD) 2486 return false; 2487 2488 Register ValReg = MI.getOperand(0).getReg(); 2489 LLT ValTy = MRI.getType(ValReg); 2490 2491 MachineMemOperand *MMO = *MI.memoperands_begin(); 2492 const unsigned ValSize = ValTy.getSizeInBits(); 2493 const LLT MemTy = MMO->getMemoryType(); 2494 const Align MemAlign = MMO->getAlign(); 2495 const unsigned MemSize = MemTy.getSizeInBits(); 2496 const unsigned AlignInBits = 8 * MemAlign.value(); 2497 2498 // Widen non-power-of-2 loads to the alignment if needed 2499 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2500 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2501 2502 // This was already the correct extending load result type, so just adjust 2503 // the memory type. 2504 if (WideMemSize == ValSize) { 2505 MachineFunction &MF = B.getMF(); 2506 2507 MachineMemOperand *WideMMO = 2508 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2509 Observer.changingInstr(MI); 2510 MI.setMemRefs(MF, {WideMMO}); 2511 Observer.changedInstr(MI); 2512 return true; 2513 } 2514 2515 // Don't bother handling edge case that should probably never be produced. 2516 if (ValSize > WideMemSize) 2517 return false; 2518 2519 LLT WideTy = widenToNextPowerOf2(ValTy); 2520 2521 Register WideLoad; 2522 if (!WideTy.isVector()) { 2523 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2524 B.buildTrunc(ValReg, WideLoad).getReg(0); 2525 } else { 2526 // Extract the subvector. 2527 2528 if (isRegisterType(ValTy)) { 2529 // If this a case where G_EXTRACT is legal, use it. 2530 // (e.g. <3 x s32> -> <4 x s32>) 2531 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2532 B.buildExtract(ValReg, WideLoad, 0); 2533 } else { 2534 // For cases where the widened type isn't a nice register value, unmerge 2535 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2536 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2537 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2538 B.setInsertPt(B.getMBB(), MI.getIterator()); 2539 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2540 } 2541 } 2542 2543 MI.eraseFromParent(); 2544 return true; 2545 } 2546 2547 return false; 2548 } 2549 2550 bool AMDGPULegalizerInfo::legalizeFMad( 2551 MachineInstr &MI, MachineRegisterInfo &MRI, 2552 MachineIRBuilder &B) const { 2553 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2554 assert(Ty.isScalar()); 2555 2556 MachineFunction &MF = B.getMF(); 2557 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2558 2559 // TODO: Always legal with future ftz flag. 2560 // FIXME: Do we need just output? 2561 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2562 return true; 2563 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2564 return true; 2565 2566 MachineIRBuilder HelperBuilder(MI); 2567 GISelObserverWrapper DummyObserver; 2568 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2569 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2570 } 2571 2572 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2573 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2574 Register DstReg = MI.getOperand(0).getReg(); 2575 Register PtrReg = MI.getOperand(1).getReg(); 2576 Register CmpVal = MI.getOperand(2).getReg(); 2577 Register NewVal = MI.getOperand(3).getReg(); 2578 2579 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2580 "this should not have been custom lowered"); 2581 2582 LLT ValTy = MRI.getType(CmpVal); 2583 LLT VecTy = LLT::fixed_vector(2, ValTy); 2584 2585 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2586 2587 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2588 .addDef(DstReg) 2589 .addUse(PtrReg) 2590 .addUse(PackedVal) 2591 .setMemRefs(MI.memoperands()); 2592 2593 MI.eraseFromParent(); 2594 return true; 2595 } 2596 2597 bool AMDGPULegalizerInfo::legalizeFlog( 2598 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2599 Register Dst = MI.getOperand(0).getReg(); 2600 Register Src = MI.getOperand(1).getReg(); 2601 LLT Ty = B.getMRI()->getType(Dst); 2602 unsigned Flags = MI.getFlags(); 2603 2604 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2605 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2606 2607 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2608 MI.eraseFromParent(); 2609 return true; 2610 } 2611 2612 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2613 MachineIRBuilder &B) const { 2614 Register Dst = MI.getOperand(0).getReg(); 2615 Register Src = MI.getOperand(1).getReg(); 2616 unsigned Flags = MI.getFlags(); 2617 LLT Ty = B.getMRI()->getType(Dst); 2618 2619 auto K = B.buildFConstant(Ty, numbers::log2e); 2620 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2621 B.buildFExp2(Dst, Mul, Flags); 2622 MI.eraseFromParent(); 2623 return true; 2624 } 2625 2626 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2627 MachineIRBuilder &B) const { 2628 Register Dst = MI.getOperand(0).getReg(); 2629 Register Src0 = MI.getOperand(1).getReg(); 2630 Register Src1 = MI.getOperand(2).getReg(); 2631 unsigned Flags = MI.getFlags(); 2632 LLT Ty = B.getMRI()->getType(Dst); 2633 const LLT S16 = LLT::scalar(16); 2634 const LLT S32 = LLT::scalar(32); 2635 2636 if (Ty == S32) { 2637 auto Log = B.buildFLog2(S32, Src0, Flags); 2638 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2639 .addUse(Log.getReg(0)) 2640 .addUse(Src1) 2641 .setMIFlags(Flags); 2642 B.buildFExp2(Dst, Mul, Flags); 2643 } else if (Ty == S16) { 2644 // There's no f16 fmul_legacy, so we need to convert for it. 2645 auto Log = B.buildFLog2(S16, Src0, Flags); 2646 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2647 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2648 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2649 .addUse(Ext0.getReg(0)) 2650 .addUse(Ext1.getReg(0)) 2651 .setMIFlags(Flags); 2652 2653 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2654 } else 2655 return false; 2656 2657 MI.eraseFromParent(); 2658 return true; 2659 } 2660 2661 // Find a source register, ignoring any possible source modifiers. 2662 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2663 Register ModSrc = OrigSrc; 2664 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2665 ModSrc = SrcFNeg->getOperand(1).getReg(); 2666 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2667 ModSrc = SrcFAbs->getOperand(1).getReg(); 2668 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2669 ModSrc = SrcFAbs->getOperand(1).getReg(); 2670 return ModSrc; 2671 } 2672 2673 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2674 MachineRegisterInfo &MRI, 2675 MachineIRBuilder &B) const { 2676 2677 const LLT S1 = LLT::scalar(1); 2678 const LLT S64 = LLT::scalar(64); 2679 Register Dst = MI.getOperand(0).getReg(); 2680 Register OrigSrc = MI.getOperand(1).getReg(); 2681 unsigned Flags = MI.getFlags(); 2682 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2683 "this should not have been custom lowered"); 2684 2685 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2686 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2687 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2688 // V_FRACT bug is: 2689 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2690 // 2691 // Convert floor(x) to (x - fract(x)) 2692 2693 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2694 .addUse(OrigSrc) 2695 .setMIFlags(Flags); 2696 2697 // Give source modifier matching some assistance before obscuring a foldable 2698 // pattern. 2699 2700 // TODO: We can avoid the neg on the fract? The input sign to fract 2701 // shouldn't matter? 2702 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2703 2704 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2705 2706 Register Min = MRI.createGenericVirtualRegister(S64); 2707 2708 // We don't need to concern ourselves with the snan handling difference, so 2709 // use the one which will directly select. 
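  // I.e. in IEEE mode build the IEEE min variant so it selects directly;
  // either way the fract result is clamped to 0x1.fffffffffffffp-1 (the
  // largest double below 1.0), as the workaround above requires.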
2710 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2711 if (MFI->getMode().IEEE) 2712 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2713 else 2714 B.buildFMinNum(Min, Fract, Const, Flags); 2715 2716 Register CorrectedFract = Min; 2717 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2718 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2719 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2720 } 2721 2722 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2723 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2724 2725 MI.eraseFromParent(); 2726 return true; 2727 } 2728 2729 // Turn an illegal packed v2s16 build vector into bit operations. 2730 // TODO: This should probably be a bitcast action in LegalizerHelper. 2731 bool AMDGPULegalizerInfo::legalizeBuildVector( 2732 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2733 Register Dst = MI.getOperand(0).getReg(); 2734 const LLT S32 = LLT::scalar(32); 2735 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2736 2737 Register Src0 = MI.getOperand(1).getReg(); 2738 Register Src1 = MI.getOperand(2).getReg(); 2739 assert(MRI.getType(Src0) == LLT::scalar(16)); 2740 2741 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2742 B.buildBitcast(Dst, Merge); 2743 2744 MI.eraseFromParent(); 2745 return true; 2746 } 2747 2748 // Check that this is a G_XOR x, -1 2749 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2750 if (MI.getOpcode() != TargetOpcode::G_XOR) 2751 return false; 2752 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2753 return ConstVal && *ConstVal == -1; 2754 } 2755 2756 // Return the use branch instruction, otherwise null if the usage is invalid. 2757 static MachineInstr * 2758 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2759 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2760 Register CondDef = MI.getOperand(0).getReg(); 2761 if (!MRI.hasOneNonDBGUse(CondDef)) 2762 return nullptr; 2763 2764 MachineBasicBlock *Parent = MI.getParent(); 2765 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2766 2767 if (isNot(MRI, *UseMI)) { 2768 Register NegatedCond = UseMI->getOperand(0).getReg(); 2769 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2770 return nullptr; 2771 2772 // We're deleting the def of this value, so we need to remove it. 2773 UseMI->eraseFromParent(); 2774 2775 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2776 Negated = true; 2777 } 2778 2779 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2780 return nullptr; 2781 2782 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2783 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2784 if (Next == Parent->end()) { 2785 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2786 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2787 return nullptr; 2788 UncondBrTarget = &*NextMBB; 2789 } else { 2790 if (Next->getOpcode() != AMDGPU::G_BR) 2791 return nullptr; 2792 Br = &*Next; 2793 UncondBrTarget = Br->getOperand(0).getMBB(); 2794 } 2795 2796 return UseMI; 2797 } 2798 2799 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2800 const ArgDescriptor *Arg, 2801 const TargetRegisterClass *ArgRC, 2802 LLT ArgTy) const { 2803 MCRegister SrcReg = Arg->getRegister(); 2804 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2805 assert(DstReg.isVirtual() && "Virtual register expected"); 2806 2807 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2808 ArgTy); 2809 if (Arg->isMasked()) { 2810 // TODO: Should we try to emit this once in the entry block? 2811 const LLT S32 = LLT::scalar(32); 2812 const unsigned Mask = Arg->getMask(); 2813 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2814 2815 Register AndMaskSrc = LiveIn; 2816 2817 if (Shift != 0) { 2818 auto ShiftAmt = B.buildConstant(S32, Shift); 2819 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2820 } 2821 2822 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2823 } else { 2824 B.buildCopy(DstReg, LiveIn); 2825 } 2826 2827 return true; 2828 } 2829 2830 bool AMDGPULegalizerInfo::loadInputValue( 2831 Register DstReg, MachineIRBuilder &B, 2832 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2833 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2834 const ArgDescriptor *Arg; 2835 const TargetRegisterClass *ArgRC; 2836 LLT ArgTy; 2837 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2838 2839 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2840 return false; // TODO: Handle these 2841 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2842 } 2843 2844 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2845 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2846 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2847 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2848 return false; 2849 2850 MI.eraseFromParent(); 2851 return true; 2852 } 2853 2854 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2855 MachineRegisterInfo &MRI, 2856 MachineIRBuilder &B) const { 2857 Register Dst = MI.getOperand(0).getReg(); 2858 LLT DstTy = MRI.getType(Dst); 2859 LLT S16 = LLT::scalar(16); 2860 LLT S32 = LLT::scalar(32); 2861 LLT S64 = LLT::scalar(64); 2862 2863 if (DstTy == S16) 2864 return legalizeFDIV16(MI, MRI, B); 2865 if (DstTy == S32) 2866 return legalizeFDIV32(MI, MRI, B); 2867 if (DstTy == S64) 2868 return legalizeFDIV64(MI, MRI, B); 2869 2870 return false; 2871 } 2872 2873 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2874 Register DstDivReg, 2875 Register DstRemReg, 2876 Register X, 2877 Register Y) const { 2878 const LLT S1 = LLT::scalar(1); 2879 const LLT S32 = LLT::scalar(32); 2880 2881 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2882 // algorithm used here. 2883 2884 // Initial estimate of inv(y). 2885 auto FloatY = B.buildUITOFP(S32, Y); 2886 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2887 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2888 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2889 auto Z = B.buildFPTOUI(S32, ScaledY); 2890 2891 // One round of UNR. 
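  // (unsigned integer Newton-Raphson). With Z ~= 2^32 / Y as a 32-bit
  // fixed-point estimate, the step below is roughly:
  //   E  = -Y * Z            ; low 32 bits, i.e. the error 2^32 - Y*Z
  //   Z' = Z + mulhi(Z, E)   ; Z' ~= Z + Z*E / 2^32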
2892 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2893 auto NegYZ = B.buildMul(S32, NegY, Z); 2894 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2895 2896 // Quotient/remainder estimate. 2897 auto Q = B.buildUMulH(S32, X, Z); 2898 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2899 2900 // First quotient/remainder refinement. 2901 auto One = B.buildConstant(S32, 1); 2902 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2903 if (DstDivReg) 2904 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2905 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2906 2907 // Second quotient/remainder refinement. 2908 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2909 if (DstDivReg) 2910 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2911 2912 if (DstRemReg) 2913 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2914 } 2915 2916 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2917 // 2918 // Return lo, hi of result 2919 // 2920 // %cvt.lo = G_UITOFP Val.lo 2921 // %cvt.hi = G_UITOFP Val.hi 2922 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2923 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2924 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2925 // %mul2 = G_FMUL %mul1, 2**(-32) 2926 // %trunc = G_INTRINSIC_TRUNC %mul2 2927 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2928 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2929 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2930 Register Val) { 2931 const LLT S32 = LLT::scalar(32); 2932 auto Unmerge = B.buildUnmerge(S32, Val); 2933 2934 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2935 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2936 2937 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2938 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2939 2940 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2941 auto Mul1 = 2942 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2943 2944 // 2**(-32) 2945 auto Mul2 = 2946 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2947 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2948 2949 // -(2**32) 2950 auto Mad2 = B.buildFMAD(S32, Trunc, 2951 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2952 2953 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2954 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2955 2956 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2957 } 2958 2959 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2960 Register DstDivReg, 2961 Register DstRemReg, 2962 Register Numer, 2963 Register Denom) const { 2964 const LLT S32 = LLT::scalar(32); 2965 const LLT S64 = LLT::scalar(64); 2966 const LLT S1 = LLT::scalar(1); 2967 Register RcpLo, RcpHi; 2968 2969 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2970 2971 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2972 2973 auto Zero64 = B.buildConstant(S64, 0); 2974 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2975 2976 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2977 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2978 2979 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2980 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2981 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2982 2983 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2984 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2985 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2986 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2987 2988 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2989 
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2990 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2991 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2992 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2993 2994 auto Zero32 = B.buildConstant(S32, 0); 2995 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2996 auto Add2_HiC = 2997 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2998 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2999 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3000 3001 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3002 Register NumerLo = UnmergeNumer.getReg(0); 3003 Register NumerHi = UnmergeNumer.getReg(1); 3004 3005 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3006 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3007 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3008 Register Mul3_Lo = UnmergeMul3.getReg(0); 3009 Register Mul3_Hi = UnmergeMul3.getReg(1); 3010 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3011 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3012 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3013 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3014 3015 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3016 Register DenomLo = UnmergeDenom.getReg(0); 3017 Register DenomHi = UnmergeDenom.getReg(1); 3018 3019 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3020 auto C1 = B.buildSExt(S32, CmpHi); 3021 3022 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3023 auto C2 = B.buildSExt(S32, CmpLo); 3024 3025 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3026 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3027 3028 // TODO: Here and below portions of the code can be enclosed into if/endif. 3029 // Currently control flow is unconditional and we have 4 selects after 3030 // potential endif to substitute PHIs. 3031 3032 // if C3 != 0 ... 
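  // Both arms are computed speculatively: Sub2 is "remainder - denominator"
  // and Add3 is "quotient + 1"; the selects on C3/C6 at the end stand in for
  // the PHIs an explicit if/endif structure would otherwise need.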
3033 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3034 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3035 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3036 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3037 3038 auto One64 = B.buildConstant(S64, 1); 3039 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3040 3041 auto C4 = 3042 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3043 auto C5 = 3044 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3045 auto C6 = B.buildSelect( 3046 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3047 3048 // if (C6 != 0) 3049 auto Add4 = B.buildAdd(S64, Add3, One64); 3050 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3051 3052 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3053 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3054 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3055 3056 // endif C6 3057 // endif C3 3058 3059 if (DstDivReg) { 3060 auto Sel1 = B.buildSelect( 3061 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3062 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3063 Sel1, MulHi3); 3064 } 3065 3066 if (DstRemReg) { 3067 auto Sel2 = B.buildSelect( 3068 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3069 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3070 Sel2, Sub1); 3071 } 3072 } 3073 3074 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3075 MachineRegisterInfo &MRI, 3076 MachineIRBuilder &B) const { 3077 Register DstDivReg, DstRemReg; 3078 switch (MI.getOpcode()) { 3079 default: 3080 llvm_unreachable("Unexpected opcode!"); 3081 case AMDGPU::G_UDIV: { 3082 DstDivReg = MI.getOperand(0).getReg(); 3083 break; 3084 } 3085 case AMDGPU::G_UREM: { 3086 DstRemReg = MI.getOperand(0).getReg(); 3087 break; 3088 } 3089 case AMDGPU::G_UDIVREM: { 3090 DstDivReg = MI.getOperand(0).getReg(); 3091 DstRemReg = MI.getOperand(1).getReg(); 3092 break; 3093 } 3094 } 3095 3096 const LLT S64 = LLT::scalar(64); 3097 const LLT S32 = LLT::scalar(32); 3098 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3099 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3100 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3101 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3102 3103 if (Ty == S32) 3104 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3105 else if (Ty == S64) 3106 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3107 else 3108 return false; 3109 3110 MI.eraseFromParent(); 3111 return true; 3112 } 3113 3114 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3115 MachineRegisterInfo &MRI, 3116 MachineIRBuilder &B) const { 3117 const LLT S64 = LLT::scalar(64); 3118 const LLT S32 = LLT::scalar(32); 3119 3120 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3121 if (Ty != S32 && Ty != S64) 3122 return false; 3123 3124 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3125 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3126 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3127 3128 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3129 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3130 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3131 3132 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3133 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3134 3135 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3136 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3137 3138 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3139 switch (MI.getOpcode()) { 3140 default: 3141 llvm_unreachable("Unexpected opcode!"); 3142 case AMDGPU::G_SDIV: { 3143 DstDivReg = MI.getOperand(0).getReg(); 3144 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3145 break; 3146 } 3147 case AMDGPU::G_SREM: { 3148 DstRemReg = MI.getOperand(0).getReg(); 3149 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3150 break; 3151 } 3152 case AMDGPU::G_SDIVREM: { 3153 DstDivReg = MI.getOperand(0).getReg(); 3154 DstRemReg = MI.getOperand(1).getReg(); 3155 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3156 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3157 break; 3158 } 3159 } 3160 3161 if (Ty == S32) 3162 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3163 else 3164 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3165 3166 if (DstDivReg) { 3167 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3168 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3169 B.buildSub(DstDivReg, SignXor, Sign); 3170 } 3171 3172 if (DstRemReg) { 3173 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3174 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3175 B.buildSub(DstRemReg, SignXor, Sign); 3176 } 3177 3178 MI.eraseFromParent(); 3179 return true; 3180 } 3181 3182 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3183 MachineRegisterInfo &MRI, 3184 MachineIRBuilder &B) const { 3185 Register Res = MI.getOperand(0).getReg(); 3186 Register LHS = MI.getOperand(1).getReg(); 3187 Register RHS = MI.getOperand(2).getReg(); 3188 uint16_t Flags = MI.getFlags(); 3189 LLT ResTy = MRI.getType(Res); 3190 3191 const MachineFunction &MF = B.getMF(); 3192 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3193 MI.getFlag(MachineInstr::FmAfn); 3194 3195 if (!AllowInaccurateRcp) 3196 return false; 3197 3198 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3199 // 1 / x -> RCP(x) 3200 if (CLHS->isExactlyValue(1.0)) { 3201 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3202 .addUse(RHS) 3203 .setMIFlags(Flags); 3204 3205 MI.eraseFromParent(); 3206 return true; 3207 } 3208 3209 // -1 / x -> RCP( FNEG(x) ) 3210 if (CLHS->isExactlyValue(-1.0)) { 3211 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3212 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3213 .addUse(FNeg.getReg(0)) 3214 .setMIFlags(Flags); 3215 3216 MI.eraseFromParent(); 3217 return true; 3218 } 3219 } 3220 3221 // x / y -> x * (1.0 / y) 3222 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3223 .addUse(RHS) 3224 .setMIFlags(Flags); 3225 B.buildFMul(Res, LHS, RCP, Flags); 3226 3227 MI.eraseFromParent(); 3228 return true; 3229 } 3230 3231 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3232 MachineRegisterInfo &MRI, 3233 MachineIRBuilder &B) const { 3234 Register Res = MI.getOperand(0).getReg(); 3235 Register X = MI.getOperand(1).getReg(); 3236 Register Y = MI.getOperand(2).getReg(); 3237 uint16_t Flags = MI.getFlags(); 3238 LLT ResTy = MRI.getType(Res); 3239 3240 const MachineFunction &MF = B.getMF(); 3241 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3242 MI.getFlag(MachineInstr::FmAfn); 3243 3244 if (!AllowInaccurateRcp) 3245 return false; 3246 3247 auto NegY = B.buildFNeg(ResTy, Y); 3248 auto One = B.buildFConstant(ResTy, 1.0); 3249 3250 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3251 .addUse(Y) 3252 .setMIFlags(Flags); 3253 3254 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3255 R = B.buildFMA(ResTy, Tmp0, R, R); 3256 3257 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3258 R = B.buildFMA(ResTy, Tmp1, R, R); 3259 3260 auto Ret = B.buildFMul(ResTy, X, R); 3261 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3262 3263 B.buildFMA(Res, Tmp2, R, Ret); 3264 MI.eraseFromParent(); 3265 return true; 3266 } 3267 3268 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3269 MachineRegisterInfo &MRI, 3270 MachineIRBuilder &B) const { 3271 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3272 return true; 3273 3274 Register Res = MI.getOperand(0).getReg(); 3275 Register LHS = MI.getOperand(1).getReg(); 3276 Register RHS = MI.getOperand(2).getReg(); 3277 3278 uint16_t Flags = MI.getFlags(); 3279 3280 LLT S16 = LLT::scalar(16); 3281 LLT S32 = LLT::scalar(32); 3282 3283 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3284 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3285 3286 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3287 .addUse(RHSExt.getReg(0)) 3288 .setMIFlags(Flags); 3289 3290 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3291 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3292 3293 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3294 .addUse(RDst.getReg(0)) 3295 .addUse(RHS) 3296 .addUse(LHS) 3297 .setMIFlags(Flags); 3298 3299 MI.eraseFromParent(); 3300 return true; 3301 } 3302 3303 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3304 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3305 static void toggleSPDenormMode(bool Enable, 3306 MachineIRBuilder &B, 3307 const GCNSubtarget &ST, 3308 AMDGPU::SIModeRegisterDefaults Mode) { 3309 // Set SP denorm mode to this value. 3310 unsigned SPDenormMode = 3311 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3312 3313 if (ST.hasDenormModeInst()) { 3314 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3315 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3316 3317 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3318 B.buildInstr(AMDGPU::S_DENORM_MODE) 3319 .addImm(NewDenormModeValue); 3320 3321 } else { 3322 // Select FP32 bit field in mode register. 
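    // This should correspond to hwreg(HW_REG_MODE, 4, 2): a 2-bit field at
    // bit offset 4 of the MODE register, which holds the FP32 denorm
    // controls, leaving the FP64/FP16 denorm bits untouched.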
3323 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3324 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3325 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3326 3327 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3328 .addImm(SPDenormMode) 3329 .addImm(SPDenormModeBitField); 3330 } 3331 } 3332 3333 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3334 MachineRegisterInfo &MRI, 3335 MachineIRBuilder &B) const { 3336 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3337 return true; 3338 3339 Register Res = MI.getOperand(0).getReg(); 3340 Register LHS = MI.getOperand(1).getReg(); 3341 Register RHS = MI.getOperand(2).getReg(); 3342 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3343 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3344 3345 uint16_t Flags = MI.getFlags(); 3346 3347 LLT S32 = LLT::scalar(32); 3348 LLT S1 = LLT::scalar(1); 3349 3350 auto One = B.buildFConstant(S32, 1.0f); 3351 3352 auto DenominatorScaled = 3353 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3354 .addUse(LHS) 3355 .addUse(RHS) 3356 .addImm(0) 3357 .setMIFlags(Flags); 3358 auto NumeratorScaled = 3359 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3360 .addUse(LHS) 3361 .addUse(RHS) 3362 .addImm(1) 3363 .setMIFlags(Flags); 3364 3365 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3366 .addUse(DenominatorScaled.getReg(0)) 3367 .setMIFlags(Flags); 3368 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3369 3370 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3371 // aren't modeled as reading it. 3372 if (!Mode.allFP32Denormals()) 3373 toggleSPDenormMode(true, B, ST, Mode); 3374 3375 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3376 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3377 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3378 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3379 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3380 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3381 3382 if (!Mode.allFP32Denormals()) 3383 toggleSPDenormMode(false, B, ST, Mode); 3384 3385 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3386 .addUse(Fma4.getReg(0)) 3387 .addUse(Fma1.getReg(0)) 3388 .addUse(Fma3.getReg(0)) 3389 .addUse(NumeratorScaled.getReg(1)) 3390 .setMIFlags(Flags); 3391 3392 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3393 .addUse(Fmas.getReg(0)) 3394 .addUse(RHS) 3395 .addUse(LHS) 3396 .setMIFlags(Flags); 3397 3398 MI.eraseFromParent(); 3399 return true; 3400 } 3401 3402 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3403 MachineRegisterInfo &MRI, 3404 MachineIRBuilder &B) const { 3405 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3406 return true; 3407 3408 Register Res = MI.getOperand(0).getReg(); 3409 Register LHS = MI.getOperand(1).getReg(); 3410 Register RHS = MI.getOperand(2).getReg(); 3411 3412 uint16_t Flags = MI.getFlags(); 3413 3414 LLT S64 = LLT::scalar(64); 3415 LLT S1 = LLT::scalar(1); 3416 3417 auto One = B.buildFConstant(S64, 1.0); 3418 3419 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3420 .addUse(LHS) 3421 .addUse(RHS) 3422 .addImm(0) 3423 .setMIFlags(Flags); 3424 3425 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3426 3427 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3428 .addUse(DivScale0.getReg(0)) 3429 .setMIFlags(Flags); 3430 3431 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3432 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3433 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3434 3435 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3436 .addUse(LHS) 3437 .addUse(RHS) 3438 .addImm(1) 3439 .setMIFlags(Flags); 3440 3441 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3442 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3443 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3444 3445 Register Scale; 3446 if (!ST.hasUsableDivScaleConditionOutput()) { 3447 // Workaround a hardware bug on SI where the condition output from div_scale 3448 // is not usable. 3449 3450 LLT S32 = LLT::scalar(32); 3451 3452 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3453 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3454 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3455 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3456 3457 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3458 Scale1Unmerge.getReg(1)); 3459 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3460 Scale0Unmerge.getReg(1)); 3461 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3462 } else { 3463 Scale = DivScale1.getReg(1); 3464 } 3465 3466 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3467 .addUse(Fma4.getReg(0)) 3468 .addUse(Fma3.getReg(0)) 3469 .addUse(Mul.getReg(0)) 3470 .addUse(Scale) 3471 .setMIFlags(Flags); 3472 3473 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3474 .addUse(Fmas.getReg(0)) 3475 .addUse(RHS) 3476 .addUse(LHS) 3477 .setMIFlags(Flags); 3478 3479 MI.eraseFromParent(); 3480 return true; 3481 } 3482 3483 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3484 MachineRegisterInfo &MRI, 3485 MachineIRBuilder &B) const { 3486 Register Res = MI.getOperand(0).getReg(); 3487 Register LHS = MI.getOperand(2).getReg(); 3488 Register RHS = MI.getOperand(3).getReg(); 3489 uint16_t Flags = MI.getFlags(); 3490 3491 LLT S32 = LLT::scalar(32); 3492 LLT S1 = LLT::scalar(1); 3493 3494 auto Abs = B.buildFAbs(S32, RHS, Flags); 3495 const APFloat C0Val(1.0f); 3496 3497 auto C0 = B.buildConstant(S32, 0x6f800000); 3498 auto C1 = B.buildConstant(S32, 0x2f800000); 3499 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3500 3501 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3502 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3503 3504 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3505 3506 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3507 .addUse(Mul0.getReg(0)) 3508 .setMIFlags(Flags); 3509 3510 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3511 3512 B.buildFMul(Res, Sel, Mul1, Flags); 3513 3514 MI.eraseFromParent(); 3515 return true; 3516 } 3517 3518 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3519 // FIXME: Why do we handle this one but not other removed instructions? 3520 // 3521 // Reciprocal square root. The clamp prevents infinite results, clamping 3522 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3523 // +-max_float. 
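// The lowering below is effectively:
//   clamp(rsq(x)) = fmax(fmin(rsq(x), +max_float), -max_float)
// using the IEEE variants of min/max when the function runs in IEEE mode.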
3524 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3525 MachineRegisterInfo &MRI, 3526 MachineIRBuilder &B) const { 3527 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3528 return true; 3529 3530 Register Dst = MI.getOperand(0).getReg(); 3531 Register Src = MI.getOperand(2).getReg(); 3532 auto Flags = MI.getFlags(); 3533 3534 LLT Ty = MRI.getType(Dst); 3535 3536 const fltSemantics *FltSemantics; 3537 if (Ty == LLT::scalar(32)) 3538 FltSemantics = &APFloat::IEEEsingle(); 3539 else if (Ty == LLT::scalar(64)) 3540 FltSemantics = &APFloat::IEEEdouble(); 3541 else 3542 return false; 3543 3544 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3545 .addUse(Src) 3546 .setMIFlags(Flags); 3547 3548 // We don't need to concern ourselves with the snan handling difference, since 3549 // the rsq quieted (or not) so use the one which will directly select. 3550 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3551 const bool UseIEEE = MFI->getMode().IEEE; 3552 3553 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3554 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3555 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3556 3557 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3558 3559 if (UseIEEE) 3560 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3561 else 3562 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3563 MI.eraseFromParent(); 3564 return true; 3565 } 3566 3567 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3568 switch (IID) { 3569 case Intrinsic::amdgcn_ds_fadd: 3570 return AMDGPU::G_ATOMICRMW_FADD; 3571 case Intrinsic::amdgcn_ds_fmin: 3572 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3573 case Intrinsic::amdgcn_ds_fmax: 3574 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3575 default: 3576 llvm_unreachable("not a DS FP intrinsic"); 3577 } 3578 } 3579 3580 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3581 MachineInstr &MI, 3582 Intrinsic::ID IID) const { 3583 GISelChangeObserver &Observer = Helper.Observer; 3584 Observer.changingInstr(MI); 3585 3586 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3587 3588 // The remaining operands were used to set fields in the MemOperand on 3589 // construction. 3590 for (int I = 6; I > 3; --I) 3591 MI.RemoveOperand(I); 3592 3593 MI.RemoveOperand(1); // Remove the intrinsic ID. 
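// At this point, e.g. a call of the form
//   %dst = llvm.amdgcn.ds.fadd(%ptr, %val, ordering, scope, isVolatile)
// has become %dst = G_ATOMICRMW_FADD %ptr, %val; the dropped immediates only
// existed to populate the MemOperand.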
3594 Observer.changedInstr(MI); 3595 return true; 3596 } 3597 3598 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3599 MachineRegisterInfo &MRI, 3600 MachineIRBuilder &B) const { 3601 uint64_t Offset = 3602 ST.getTargetLowering()->getImplicitParameterOffset( 3603 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3604 LLT DstTy = MRI.getType(DstReg); 3605 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3606 3607 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3608 if (!loadInputValue(KernargPtrReg, B, 3609 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3610 return false; 3611 3612 // FIXME: This should be nuw 3613 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3614 return true; 3615 } 3616 3617 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3618 MachineRegisterInfo &MRI, 3619 MachineIRBuilder &B) const { 3620 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3621 if (!MFI->isEntryFunction()) { 3622 return legalizePreloadedArgIntrin(MI, MRI, B, 3623 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3624 } 3625 3626 Register DstReg = MI.getOperand(0).getReg(); 3627 if (!getImplicitArgPtr(DstReg, MRI, B)) 3628 return false; 3629 3630 MI.eraseFromParent(); 3631 return true; 3632 } 3633 3634 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3635 MachineRegisterInfo &MRI, 3636 MachineIRBuilder &B, 3637 unsigned AddrSpace) const { 3638 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3639 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3640 Register Hi32 = Unmerge.getReg(1); 3641 3642 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3643 MI.eraseFromParent(); 3644 return true; 3645 } 3646 3647 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3648 // offset (the offset that is included in bounds checking and swizzling, to be 3649 // split between the instruction's voffset and immoffset fields) and soffset 3650 // (the offset that is excluded from bounds checking and swizzling, to go in 3651 // the instruction's soffset field). This function takes the first kind of 3652 // offset and figures out how to split it between voffset and immoffset. 3653 std::tuple<Register, unsigned, unsigned> 3654 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3655 Register OrigOffset) const { 3656 const unsigned MaxImm = 4095; 3657 Register BaseReg; 3658 unsigned TotalConstOffset; 3659 const LLT S32 = LLT::scalar(32); 3660 MachineRegisterInfo &MRI = *B.getMRI(); 3661 3662 std::tie(BaseReg, TotalConstOffset) = 3663 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3664 3665 unsigned ImmOffset = TotalConstOffset; 3666 3667 // If BaseReg is a pointer, convert it to int. 3668 if (MRI.getType(BaseReg).isPointer()) 3669 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3670 3671 // If the immediate value is too big for the immoffset field, put the value 3672 // and -4096 into the immoffset field so that the value that is copied/added 3673 // for the voffset field is a multiple of 4096, and it stands more chance 3674 // of being CSEd with the copy/add for another similar load/store. 3675 // However, do not do that rounding down to a multiple of 4096 if that is a 3676 // negative number, as it appears to be illegal to have a negative offset 3677 // in the vgpr, even if adding the immediate offset makes it positive. 
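// Worked example (illustrative): a constant offset of 8200 splits into
// Overflow = 8192, which goes into the voffset add below, and ImmOffset = 8,
// so the voffset component stays a multiple of 4096 and is more likely CSEd.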
3678 unsigned Overflow = ImmOffset & ~MaxImm; 3679 ImmOffset -= Overflow; 3680 if ((int32_t)Overflow < 0) { 3681 Overflow += ImmOffset; 3682 ImmOffset = 0; 3683 } 3684 3685 if (Overflow != 0) { 3686 if (!BaseReg) { 3687 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3688 } else { 3689 auto OverflowVal = B.buildConstant(S32, Overflow); 3690 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3691 } 3692 } 3693 3694 if (!BaseReg) 3695 BaseReg = B.buildConstant(S32, 0).getReg(0); 3696 3697 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3698 } 3699 3700 /// Handle register layout difference for f16 images for some subtargets. 3701 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3702 MachineRegisterInfo &MRI, 3703 Register Reg, 3704 bool ImageStore) const { 3705 const LLT S16 = LLT::scalar(16); 3706 const LLT S32 = LLT::scalar(32); 3707 LLT StoreVT = MRI.getType(Reg); 3708 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3709 3710 if (ST.hasUnpackedD16VMem()) { 3711 auto Unmerge = B.buildUnmerge(S16, Reg); 3712 3713 SmallVector<Register, 4> WideRegs; 3714 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3715 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3716 3717 int NumElts = StoreVT.getNumElements(); 3718 3719 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3720 .getReg(0); 3721 } 3722 3723 if (ImageStore && ST.hasImageStoreD16Bug()) { 3724 if (StoreVT.getNumElements() == 2) { 3725 SmallVector<Register, 4> PackedRegs; 3726 Reg = B.buildBitcast(S32, Reg).getReg(0); 3727 PackedRegs.push_back(Reg); 3728 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3729 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3730 .getReg(0); 3731 } 3732 3733 if (StoreVT.getNumElements() == 3) { 3734 SmallVector<Register, 4> PackedRegs; 3735 auto Unmerge = B.buildUnmerge(S16, Reg); 3736 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3737 PackedRegs.push_back(Unmerge.getReg(I)); 3738 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3739 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3740 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3741 } 3742 3743 if (StoreVT.getNumElements() == 4) { 3744 SmallVector<Register, 4> PackedRegs; 3745 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3746 auto Unmerge = B.buildUnmerge(S32, Reg); 3747 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3748 PackedRegs.push_back(Unmerge.getReg(I)); 3749 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3750 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3751 .getReg(0); 3752 } 3753 3754 llvm_unreachable("invalid data type"); 3755 } 3756 3757 return Reg; 3758 } 3759 3760 Register AMDGPULegalizerInfo::fixStoreSourceType( 3761 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3762 MachineRegisterInfo *MRI = B.getMRI(); 3763 LLT Ty = MRI->getType(VData); 3764 3765 const LLT S16 = LLT::scalar(16); 3766 3767 // Fixup illegal register types for i8 stores. 
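// For example, an s8 or s16 store value is any-extended to s32 below so the
// buffer store pseudos only ever see s32 or packed 16-bit vector data.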
3768 if (Ty == LLT::scalar(8) || Ty == S16) { 3769 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3770 return AnyExt; 3771 } 3772 3773 if (Ty.isVector()) { 3774 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3775 if (IsFormat) 3776 return handleD16VData(B, *MRI, VData); 3777 } 3778 } 3779 3780 return VData; 3781 } 3782 3783 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3784 MachineRegisterInfo &MRI, 3785 MachineIRBuilder &B, 3786 bool IsTyped, 3787 bool IsFormat) const { 3788 Register VData = MI.getOperand(1).getReg(); 3789 LLT Ty = MRI.getType(VData); 3790 LLT EltTy = Ty.getScalarType(); 3791 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3792 const LLT S32 = LLT::scalar(32); 3793 3794 VData = fixStoreSourceType(B, VData, IsFormat); 3795 Register RSrc = MI.getOperand(2).getReg(); 3796 3797 MachineMemOperand *MMO = *MI.memoperands_begin(); 3798 const int MemSize = MMO->getSize(); 3799 3800 unsigned ImmOffset; 3801 unsigned TotalOffset; 3802 3803 // The typed intrinsics add an immediate after the registers. 3804 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3805 3806 // The struct intrinsic variants add one additional operand over raw. 3807 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3808 Register VIndex; 3809 int OpOffset = 0; 3810 if (HasVIndex) { 3811 VIndex = MI.getOperand(3).getReg(); 3812 OpOffset = 1; 3813 } 3814 3815 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3816 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3817 3818 unsigned Format = 0; 3819 if (IsTyped) { 3820 Format = MI.getOperand(5 + OpOffset).getImm(); 3821 ++OpOffset; 3822 } 3823 3824 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3825 3826 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3827 if (TotalOffset != 0) 3828 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3829 3830 unsigned Opc; 3831 if (IsTyped) { 3832 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3833 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3834 } else if (IsFormat) { 3835 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3836 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3837 } else { 3838 switch (MemSize) { 3839 case 1: 3840 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3841 break; 3842 case 2: 3843 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3844 break; 3845 default: 3846 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3847 break; 3848 } 3849 } 3850 3851 if (!VIndex) 3852 VIndex = B.buildConstant(S32, 0).getReg(0); 3853 3854 auto MIB = B.buildInstr(Opc) 3855 .addUse(VData) // vdata 3856 .addUse(RSrc) // rsrc 3857 .addUse(VIndex) // vindex 3858 .addUse(VOffset) // voffset 3859 .addUse(SOffset) // soffset 3860 .addImm(ImmOffset); // offset(imm) 3861 3862 if (IsTyped) 3863 MIB.addImm(Format); 3864 3865 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3866 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3867 .addMemOperand(MMO); 3868 3869 MI.eraseFromParent(); 3870 return true; 3871 } 3872 3873 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3874 MachineRegisterInfo &MRI, 3875 MachineIRBuilder &B, 3876 bool IsFormat, 3877 bool IsTyped) const { 3878 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
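// Intrinsic operand layout after the ID (sketch): rsrc, vindex (struct
// variants only), voffset, soffset, format (typed variants only), aux; the
// OpOffset bookkeeping below accounts for the optional operands.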
3879 MachineMemOperand *MMO = *MI.memoperands_begin(); 3880 const LLT MemTy = MMO->getMemoryType(); 3881 const LLT S32 = LLT::scalar(32); 3882 3883 Register Dst = MI.getOperand(0).getReg(); 3884 Register RSrc = MI.getOperand(2).getReg(); 3885 3886 // The typed intrinsics add an immediate after the registers. 3887 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3888 3889 // The struct intrinsic variants add one additional operand over raw. 3890 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3891 Register VIndex; 3892 int OpOffset = 0; 3893 if (HasVIndex) { 3894 VIndex = MI.getOperand(3).getReg(); 3895 OpOffset = 1; 3896 } 3897 3898 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3899 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3900 3901 unsigned Format = 0; 3902 if (IsTyped) { 3903 Format = MI.getOperand(5 + OpOffset).getImm(); 3904 ++OpOffset; 3905 } 3906 3907 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3908 unsigned ImmOffset; 3909 unsigned TotalOffset; 3910 3911 LLT Ty = MRI.getType(Dst); 3912 LLT EltTy = Ty.getScalarType(); 3913 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3914 const bool Unpacked = ST.hasUnpackedD16VMem(); 3915 3916 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3917 if (TotalOffset != 0) 3918 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemTy); 3919 3920 unsigned Opc; 3921 3922 if (IsTyped) { 3923 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3924 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3925 } else if (IsFormat) { 3926 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3927 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3928 } else { 3929 switch (MemTy.getSizeInBits()) { 3930 case 8: 3931 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3932 break; 3933 case 16: 3934 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3935 break; 3936 default: 3937 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3938 break; 3939 } 3940 } 3941 3942 Register LoadDstReg; 3943 3944 bool IsExtLoad = 3945 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 3946 LLT UnpackedTy = Ty.changeElementSize(32); 3947 3948 if (IsExtLoad) 3949 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3950 else if (Unpacked && IsD16 && Ty.isVector()) 3951 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3952 else 3953 LoadDstReg = Dst; 3954 3955 if (!VIndex) 3956 VIndex = B.buildConstant(S32, 0).getReg(0); 3957 3958 auto MIB = B.buildInstr(Opc) 3959 .addDef(LoadDstReg) // vdata 3960 .addUse(RSrc) // rsrc 3961 .addUse(VIndex) // vindex 3962 .addUse(VOffset) // voffset 3963 .addUse(SOffset) // soffset 3964 .addImm(ImmOffset); // offset(imm) 3965 3966 if (IsTyped) 3967 MIB.addImm(Format); 3968 3969 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3970 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3971 .addMemOperand(MMO); 3972 3973 if (LoadDstReg != Dst) { 3974 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3975 3976 // Widen result for extending loads was widened. 
3977 if (IsExtLoad) 3978 B.buildTrunc(Dst, LoadDstReg); 3979 else { 3980 // Repack to original 16-bit vector result 3981 // FIXME: G_TRUNC should work, but legalization currently fails 3982 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3983 SmallVector<Register, 4> Repack; 3984 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3985 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3986 B.buildMerge(Dst, Repack); 3987 } 3988 } 3989 3990 MI.eraseFromParent(); 3991 return true; 3992 } 3993 3994 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3995 MachineIRBuilder &B, 3996 bool IsInc) const { 3997 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3998 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3999 B.buildInstr(Opc) 4000 .addDef(MI.getOperand(0).getReg()) 4001 .addUse(MI.getOperand(2).getReg()) 4002 .addUse(MI.getOperand(3).getReg()) 4003 .cloneMemRefs(MI); 4004 MI.eraseFromParent(); 4005 return true; 4006 } 4007 4008 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4009 switch (IntrID) { 4010 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4011 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4012 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4013 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4014 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4015 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4016 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4017 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4018 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4019 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4020 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4021 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4022 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4023 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4024 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4025 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4026 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4027 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4028 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4029 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4030 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4031 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4032 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4033 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4034 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4035 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4036 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4037 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4038 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4039 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4040 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4041 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4042 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4043 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4044 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4045 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4046 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4047 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4048 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4049 case Intrinsic::amdgcn_buffer_atomic_fadd: 4050 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4051 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4052 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4053 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4054 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4055 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4056 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4057 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4058 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4059 default: 4060 llvm_unreachable("unhandled atomic opcode"); 4061 } 4062 } 4063 4064 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4065 MachineIRBuilder &B, 4066 Intrinsic::ID IID) const { 4067 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4068 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4069 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4070 4071 Register Dst; 4072 4073 int OpOffset = 0; 4074 if (HasReturn) { 4075 // A few FP atomics do not support return values. 4076 Dst = MI.getOperand(0).getReg(); 4077 } else { 4078 OpOffset = -1; 4079 } 4080 4081 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4082 Register CmpVal; 4083 4084 if (IsCmpSwap) { 4085 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4086 ++OpOffset; 4087 } 4088 4089 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4090 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4091 4092 // The struct intrinsic variants add one additional operand over raw. 4093 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4094 Register VIndex; 4095 if (HasVIndex) { 4096 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4097 ++OpOffset; 4098 } 4099 4100 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4101 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4102 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4103 4104 MachineMemOperand *MMO = *MI.memoperands_begin(); 4105 4106 unsigned ImmOffset; 4107 unsigned TotalOffset; 4108 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4109 if (TotalOffset != 0) 4110 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4111 4112 if (!VIndex) 4113 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4114 4115 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4116 4117 if (HasReturn) 4118 MIB.addDef(Dst); 4119 4120 MIB.addUse(VData); // vdata 4121 4122 if (IsCmpSwap) 4123 MIB.addReg(CmpVal); 4124 4125 MIB.addUse(RSrc) // rsrc 4126 .addUse(VIndex) // vindex 4127 .addUse(VOffset) // voffset 4128 .addUse(SOffset) // soffset 4129 .addImm(ImmOffset) // offset(imm) 4130 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4131 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4132 .addMemOperand(MMO); 4133 4134 MI.eraseFromParent(); 4135 return true; 4136 } 4137 4138 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4139 /// vector with s16 typed elements. 4140 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4141 SmallVectorImpl<Register> &PackedAddrs, 4142 unsigned ArgOffset, 4143 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4144 bool IsA16, bool IsG16) { 4145 const LLT S16 = LLT::scalar(16); 4146 const LLT V2S16 = LLT::fixed_vector(2, 16); 4147 auto EndIdx = Intr->VAddrEnd; 4148 4149 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4150 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4151 if (!SrcOp.isReg()) 4152 continue; // _L to _LZ may have eliminated this. 
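// For example, a 2D a16 sample packs its (u, v) coordinates into a single
// v2s16 dword below; an odd trailing component is instead paired with undef.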
4153 4154 Register AddrReg = SrcOp.getReg(); 4155 4156 if (I < Intr->GradientStart) { 4157 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4158 PackedAddrs.push_back(AddrReg); 4159 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4160 (I >= Intr->CoordStart && !IsA16)) { 4161 // Handle any gradient or coordinate operands that should not be packed 4162 PackedAddrs.push_back(AddrReg); 4163 } else { 4164 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4165 // derivatives dx/dh and dx/dv are packed with undef. 4166 if (((I + 1) >= EndIdx) || 4167 ((Intr->NumGradients / 2) % 2 == 1 && 4168 (I == static_cast<unsigned>(Intr->GradientStart + 4169 (Intr->NumGradients / 2) - 1) || 4170 I == static_cast<unsigned>(Intr->GradientStart + 4171 Intr->NumGradients - 1))) || 4172 // Check for _L to _LZ optimization 4173 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4174 PackedAddrs.push_back( 4175 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4176 .getReg(0)); 4177 } else { 4178 PackedAddrs.push_back( 4179 B.buildBuildVector( 4180 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4181 .getReg(0)); 4182 ++I; 4183 } 4184 } 4185 } 4186 } 4187 4188 /// Convert from separate vaddr components to a single vector address register, 4189 /// and replace the remaining operands with $noreg. 4190 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4191 int DimIdx, int NumVAddrs) { 4192 const LLT S32 = LLT::scalar(32); 4193 4194 SmallVector<Register, 8> AddrRegs; 4195 for (int I = 0; I != NumVAddrs; ++I) { 4196 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4197 if (SrcOp.isReg()) { 4198 AddrRegs.push_back(SrcOp.getReg()); 4199 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4200 } 4201 } 4202 4203 int NumAddrRegs = AddrRegs.size(); 4204 if (NumAddrRegs != 1) { 4205 // Round up to 8 elements for v5-v7 4206 // FIXME: Missing intermediate sized register classes and instructions. 4207 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4208 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4209 auto Undef = B.buildUndef(S32); 4210 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4211 NumAddrRegs = RoundedNumRegs; 4212 } 4213 4214 auto VAddr = 4215 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4216 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4217 } 4218 4219 for (int I = 1; I != NumVAddrs; ++I) { 4220 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4221 if (SrcOp.isReg()) 4222 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4223 } 4224 } 4225 4226 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4227 /// 4228 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 4229 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4230 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4231 /// registers. 4232 /// 4233 /// We don't want to directly select image instructions just yet, but also want 4234 /// to expose all register repacking to the legalizer/combiners. We also don't 4235 /// want a selected instruction entering RegBankSelect. In order to avoid 4236 /// defining a multitude of intermediate image instructions, directly hack on 4237 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 4238 /// now unnecessary arguments with $noreg.
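/// For example, on a subtarget with unpacked d16 memory a d16 load returning
/// <4 x s16> is rewritten to return <4 x s32>, and the dwords are truncated
/// and repacked into the original type after the instruction.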
4239 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4240 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4241 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4242 4243 const unsigned NumDefs = MI.getNumExplicitDefs(); 4244 const unsigned ArgOffset = NumDefs + 1; 4245 bool IsTFE = NumDefs == 2; 4246 // We are only processing the operands of d16 image operations on subtargets 4247 // that use the unpacked register layout, or need to repack the TFE result. 4248 4249 // TODO: Do we need to guard against already legalized intrinsics? 4250 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4251 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4252 4253 MachineRegisterInfo *MRI = B.getMRI(); 4254 const LLT S32 = LLT::scalar(32); 4255 const LLT S16 = LLT::scalar(16); 4256 const LLT V2S16 = LLT::fixed_vector(2, 16); 4257 4258 unsigned DMask = 0; 4259 4260 // Check for 16 bit addresses and pack if true. 4261 LLT GradTy = 4262 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4263 LLT AddrTy = 4264 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4265 const bool IsG16 = GradTy == S16; 4266 const bool IsA16 = AddrTy == S16; 4267 4268 int DMaskLanes = 0; 4269 if (!BaseOpcode->Atomic) { 4270 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4271 if (BaseOpcode->Gather4) { 4272 DMaskLanes = 4; 4273 } else if (DMask != 0) { 4274 DMaskLanes = countPopulation(DMask); 4275 } else if (!IsTFE && !BaseOpcode->Store) { 4276 // If dmask is 0, this is a no-op load. This can be eliminated. 4277 B.buildUndef(MI.getOperand(0)); 4278 MI.eraseFromParent(); 4279 return true; 4280 } 4281 } 4282 4283 Observer.changingInstr(MI); 4284 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4285 4286 unsigned NewOpcode = NumDefs == 0 ? 4287 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4288 4289 // Track that we legalized this 4290 MI.setDesc(B.getTII().get(NewOpcode)); 4291 4292 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4293 // dmask to be at least 1 otherwise the instruction will fail 4294 if (IsTFE && DMask == 0) { 4295 DMask = 0x1; 4296 DMaskLanes = 1; 4297 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4298 } 4299 4300 if (BaseOpcode->Atomic) { 4301 Register VData0 = MI.getOperand(2).getReg(); 4302 LLT Ty = MRI->getType(VData0); 4303 4304 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4305 if (Ty.isVector()) 4306 return false; 4307 4308 if (BaseOpcode->AtomicX2) { 4309 Register VData1 = MI.getOperand(3).getReg(); 4310 // The two values are packed in one register. 4311 LLT PackedTy = LLT::fixed_vector(2, Ty); 4312 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4313 MI.getOperand(2).setReg(Concat.getReg(0)); 4314 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4315 } 4316 } 4317 4318 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4319 4320 // Optimize _L to _LZ when _L is zero 4321 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4322 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4323 const ConstantFP *ConstantLod; 4324 4325 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4326 m_GFCst(ConstantLod))) { 4327 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4328 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
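// e.g. image.sample.l with a known lod that is zero or negative becomes
// image.sample.lz and the lod operand is dropped.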
4329 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4330 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4331 Intr->Dim); 4332 4333 // The starting indexes should remain in the same place. 4334 --CorrectedNumVAddrs; 4335 4336 MI.getOperand(MI.getNumExplicitDefs()) 4337 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4338 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4339 Intr = NewImageDimIntr; 4340 } 4341 } 4342 } 4343 4344 // Optimize _mip away, when 'lod' is zero 4345 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4346 int64_t ConstantLod; 4347 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4348 m_ICst(ConstantLod))) { 4349 if (ConstantLod == 0) { 4350 // TODO: Change intrinsic opcode and remove operand instead or replacing 4351 // it with 0, as the _L to _LZ handling is done above. 4352 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4353 --CorrectedNumVAddrs; 4354 } 4355 } 4356 } 4357 4358 // Rewrite the addressing register layout before doing anything else. 4359 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4360 // 16 bit gradients are supported, but are tied to the A16 control 4361 // so both gradients and addresses must be 16 bit 4362 return false; 4363 } 4364 4365 if (IsA16 && !ST.hasA16()) { 4366 // A16 not supported 4367 return false; 4368 } 4369 4370 if (IsA16 || IsG16) { 4371 if (Intr->NumVAddrs > 1) { 4372 SmallVector<Register, 4> PackedRegs; 4373 4374 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4375 IsG16); 4376 4377 // See also below in the non-a16 branch 4378 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4379 4380 if (!UseNSA && PackedRegs.size() > 1) { 4381 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4382 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4383 PackedRegs[0] = Concat.getReg(0); 4384 PackedRegs.resize(1); 4385 } 4386 4387 const unsigned NumPacked = PackedRegs.size(); 4388 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4389 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4390 if (!SrcOp.isReg()) { 4391 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4392 continue; 4393 } 4394 4395 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4396 4397 if (I - Intr->VAddrStart < NumPacked) 4398 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4399 else 4400 SrcOp.setReg(AMDGPU::NoRegister); 4401 } 4402 } 4403 } else { 4404 // If the register allocator cannot place the address registers contiguously 4405 // without introducing moves, then using the non-sequential address encoding 4406 // is always preferable, since it saves VALU instructions and is usually a 4407 // wash in terms of code size or even better. 4408 // 4409 // However, we currently have no way of hinting to the register allocator 4410 // that MIMG addresses should be placed contiguously when it is possible to 4411 // do so, so force non-NSA for the common 2-address case as a heuristic. 4412 // 4413 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4414 // allocation when possible. 
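// e.g. a 2-dword address (the common 2D case) keeps the packed vector form,
// while 3 or more address dwords use NSA when the subtarget supports it.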
4415 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4416 4417 if (!UseNSA && Intr->NumVAddrs > 1) 4418 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4419 Intr->NumVAddrs); 4420 } 4421 4422 int Flags = 0; 4423 if (IsA16) 4424 Flags |= 1; 4425 if (IsG16) 4426 Flags |= 2; 4427 MI.addOperand(MachineOperand::CreateImm(Flags)); 4428 4429 if (BaseOpcode->Store) { // No TFE for stores? 4430 // TODO: Handle dmask trim 4431 Register VData = MI.getOperand(1).getReg(); 4432 LLT Ty = MRI->getType(VData); 4433 if (!Ty.isVector() || Ty.getElementType() != S16) 4434 return true; 4435 4436 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4437 if (RepackedReg != VData) { 4438 MI.getOperand(1).setReg(RepackedReg); 4439 } 4440 4441 return true; 4442 } 4443 4444 Register DstReg = MI.getOperand(0).getReg(); 4445 LLT Ty = MRI->getType(DstReg); 4446 const LLT EltTy = Ty.getScalarType(); 4447 const bool IsD16 = Ty.getScalarType() == S16; 4448 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4449 4450 // Confirm that the return type is large enough for the dmask specified 4451 if (NumElts < DMaskLanes) 4452 return false; 4453 4454 if (NumElts > 4 || DMaskLanes > 4) 4455 return false; 4456 4457 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4458 const LLT AdjustedTy = 4459 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4460 4461 // The raw dword aligned data component of the load. The only legal cases 4462 // where this matters should be when using the packed D16 format, for 4463 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4464 LLT RoundedTy; 4465 4466 // S32 vector to cover all data, plus TFE result element. 4467 LLT TFETy; 4468 4469 // Register type to use for each loaded component. Will be S32 or V2S16. 4470 LLT RegTy; 4471 4472 if (IsD16 && ST.hasUnpackedD16VMem()) { 4473 RoundedTy = 4474 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4475 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4476 RegTy = S32; 4477 } else { 4478 unsigned EltSize = EltTy.getSizeInBits(); 4479 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4480 unsigned RoundedSize = 32 * RoundedElts; 4481 RoundedTy = LLT::scalarOrVector( 4482 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4483 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4484 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4485 } 4486 4487 // The return type does not need adjustment. 4488 // TODO: Should we change s16 case to s32 or <2 x s16>? 4489 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4490 return true; 4491 4492 Register Dst1Reg; 4493 4494 // Insert after the instruction. 4495 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4496 4497 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4498 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4499 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4500 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4501 4502 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4503 4504 MI.getOperand(0).setReg(NewResultReg); 4505 4506 // In the IR, TFE is supposed to be used with a 2 element struct return 4507 // type. The instruction really returns these two values in one contiguous 4508 // register, with one additional dword beyond the loaded data. Rewrite the 4509 // return type to use a single register result.
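// For the simple s32 case this means, roughly: the load now produces a
// <2 x s32> value, and the unmerge below splits it into the data result and
// the TFE error dword.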
4510 4511 if (IsTFE) { 4512 Dst1Reg = MI.getOperand(1).getReg(); 4513 if (MRI->getType(Dst1Reg) != S32) 4514 return false; 4515 4516 // TODO: Make sure the TFE operand bit is set. 4517 MI.RemoveOperand(1); 4518 4519 // Handle the easy case that requires no repack instructions. 4520 if (Ty == S32) { 4521 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4522 return true; 4523 } 4524 } 4525 4526 // Now figure out how to copy the new result register back into the old 4527 // result. 4528 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4529 4530 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4531 4532 if (ResultNumRegs == 1) { 4533 assert(!IsTFE); 4534 ResultRegs[0] = NewResultReg; 4535 } else { 4536 // We have to repack into a new vector of some kind. 4537 for (int I = 0; I != NumDataRegs; ++I) 4538 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4539 B.buildUnmerge(ResultRegs, NewResultReg); 4540 4541 // Drop the final TFE element to get the data part. The TFE result is 4542 // directly written to the right place already. 4543 if (IsTFE) 4544 ResultRegs.resize(NumDataRegs); 4545 } 4546 4547 // For an s16 scalar result, we form an s32 result with a truncate regardless 4548 // of packed vs. unpacked. 4549 if (IsD16 && !Ty.isVector()) { 4550 B.buildTrunc(DstReg, ResultRegs[0]); 4551 return true; 4552 } 4553 4554 // Avoid a build/concat_vector of 1 entry. 4555 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4556 B.buildBitcast(DstReg, ResultRegs[0]); 4557 return true; 4558 } 4559 4560 assert(Ty.isVector()); 4561 4562 if (IsD16) { 4563 // For packed D16 results with TFE enabled, all the data components are 4564 // S32. Cast back to the expected type. 4565 // 4566 // TODO: We don't really need to use load s32 elements. We would only need one 4567 // cast for the TFE result if a multiple of v2s16 was used. 4568 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4569 for (Register &Reg : ResultRegs) 4570 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4571 } else if (ST.hasUnpackedD16VMem()) { 4572 for (Register &Reg : ResultRegs) 4573 Reg = B.buildTrunc(S16, Reg).getReg(0); 4574 } 4575 } 4576 4577 auto padWithUndef = [&](LLT Ty, int NumElts) { 4578 if (NumElts == 0) 4579 return; 4580 Register Undef = B.buildUndef(Ty).getReg(0); 4581 for (int I = 0; I != NumElts; ++I) 4582 ResultRegs.push_back(Undef); 4583 }; 4584 4585 // Pad out any elements eliminated due to the dmask. 4586 LLT ResTy = MRI->getType(ResultRegs[0]); 4587 if (!ResTy.isVector()) { 4588 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4589 B.buildBuildVector(DstReg, ResultRegs); 4590 return true; 4591 } 4592 4593 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4594 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4595 4596 // Deal with the one annoying legal case. 
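// A <3 x s16> result is padded to three v2s16 registers, concatenated into
// <6 x s16>, and the unused upper half is discarded via the unmerge below.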
4597 const LLT V3S16 = LLT::fixed_vector(3, 16); 4598 if (Ty == V3S16) { 4599 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4600 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4601 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4602 return true; 4603 } 4604 4605 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4606 B.buildConcatVectors(DstReg, ResultRegs); 4607 return true; 4608 } 4609 4610 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4611 LegalizerHelper &Helper, MachineInstr &MI) const { 4612 MachineIRBuilder &B = Helper.MIRBuilder; 4613 GISelChangeObserver &Observer = Helper.Observer; 4614 4615 Register Dst = MI.getOperand(0).getReg(); 4616 LLT Ty = B.getMRI()->getType(Dst); 4617 unsigned Size = Ty.getSizeInBits(); 4618 MachineFunction &MF = B.getMF(); 4619 4620 Observer.changingInstr(MI); 4621 4622 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4623 Ty = getBitcastRegisterType(Ty); 4624 Helper.bitcastDst(MI, Ty, 0); 4625 Dst = MI.getOperand(0).getReg(); 4626 B.setInsertPt(B.getMBB(), MI); 4627 } 4628 4629 // FIXME: We don't really need this intermediate instruction. The intrinsic 4630 // should be fixed to have a memory operand. Since it's readnone, we're not 4631 // allowed to add one. 4632 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4633 MI.RemoveOperand(1); // Remove intrinsic ID 4634 4635 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4636 // TODO: Should this use datalayout alignment? 4637 const unsigned MemSize = (Size + 7) / 8; 4638 const Align MemAlign(4); 4639 MachineMemOperand *MMO = MF.getMachineMemOperand( 4640 MachinePointerInfo(), 4641 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4642 MachineMemOperand::MOInvariant, 4643 MemSize, MemAlign); 4644 MI.addMemOperand(MF, MMO); 4645 4646 // There are no 96-bit result scalar loads, but widening to 128-bit should 4647 // always be legal. We may need to restore this to a 96-bit result if it turns 4648 // out this needs to be converted to a vector load during RegBankSelect. 4649 if (!isPowerOf2_32(Size)) { 4650 if (Ty.isVector()) 4651 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4652 else 4653 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4654 } 4655 4656 Observer.changedInstr(MI); 4657 return true; 4658 } 4659 4660 // TODO: Move to selection 4661 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4662 MachineRegisterInfo &MRI, 4663 MachineIRBuilder &B) const { 4664 if (!ST.isTrapHandlerEnabled() || 4665 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4666 return legalizeTrapEndpgm(MI, MRI, B); 4667 4668 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4669 switch (*HsaAbiVer) { 4670 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4671 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4672 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4673 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4674 return ST.supportsGetDoorbellID() ? 
4675 legalizeTrapHsa(MI, MRI, B) : 4676 legalizeTrapHsaQueuePtr(MI, MRI, B); 4677 } 4678 } 4679 4680 llvm_unreachable("Unknown trap handler"); 4681 } 4682 4683 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4684 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4685 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4686 MI.eraseFromParent(); 4687 return true; 4688 } 4689 4690 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4691 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4692 // Pass queue pointer to trap handler as input, and insert trap instruction 4693 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4694 Register LiveIn = 4695 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4696 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4697 return false; 4698 4699 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4700 B.buildCopy(SGPR01, LiveIn); 4701 B.buildInstr(AMDGPU::S_TRAP) 4702 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4703 .addReg(SGPR01, RegState::Implicit); 4704 4705 MI.eraseFromParent(); 4706 return true; 4707 } 4708 4709 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4710 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4711 B.buildInstr(AMDGPU::S_TRAP) 4712 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4713 MI.eraseFromParent(); 4714 return true; 4715 } 4716 4717 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4718 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4719 // Is non-HSA path or trap-handler disabled? then, report a warning 4720 // accordingly 4721 if (!ST.isTrapHandlerEnabled() || 4722 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4723 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4724 "debugtrap handler not supported", 4725 MI.getDebugLoc(), DS_Warning); 4726 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4727 Ctx.diagnose(NoTrap); 4728 } else { 4729 // Insert debug-trap instruction 4730 B.buildInstr(AMDGPU::S_TRAP) 4731 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4732 } 4733 4734 MI.eraseFromParent(); 4735 return true; 4736 } 4737 4738 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4739 MachineIRBuilder &B) const { 4740 MachineRegisterInfo &MRI = *B.getMRI(); 4741 const LLT S16 = LLT::scalar(16); 4742 const LLT S32 = LLT::scalar(32); 4743 4744 Register DstReg = MI.getOperand(0).getReg(); 4745 Register NodePtr = MI.getOperand(2).getReg(); 4746 Register RayExtent = MI.getOperand(3).getReg(); 4747 Register RayOrigin = MI.getOperand(4).getReg(); 4748 Register RayDir = MI.getOperand(5).getReg(); 4749 Register RayInvDir = MI.getOperand(6).getReg(); 4750 Register TDescr = MI.getOperand(7).getReg(); 4751 4752 if (!ST.hasGFX10_AEncoding()) { 4753 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4754 "intrinsic not supported on subtarget", 4755 MI.getDebugLoc()); 4756 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4757 return false; 4758 } 4759 4760 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4761 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4762 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4763 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4764 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4765 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4766 4767 SmallVector<Register, 12> Ops; 4768 if (Is64) { 4769 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4770 Ops.push_back(Unmerge.getReg(0)); 4771 Ops.push_back(Unmerge.getReg(1)); 4772 } else { 4773 Ops.push_back(NodePtr); 4774 } 4775 Ops.push_back(RayExtent); 4776 4777 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4778 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4779 Ops.push_back(Unmerge.getReg(0)); 4780 Ops.push_back(Unmerge.getReg(1)); 4781 Ops.push_back(Unmerge.getReg(2)); 4782 }; 4783 4784 packLanes(RayOrigin); 4785 if (IsA16) { 4786 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4787 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4788 Register R1 = MRI.createGenericVirtualRegister(S32); 4789 Register R2 = MRI.createGenericVirtualRegister(S32); 4790 Register R3 = MRI.createGenericVirtualRegister(S32); 4791 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4792 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4793 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4794 Ops.push_back(R1); 4795 Ops.push_back(R2); 4796 Ops.push_back(R3); 4797 } else { 4798 packLanes(RayDir); 4799 packLanes(RayInvDir); 4800 } 4801 4802 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4803 .addDef(DstReg) 4804 .addImm(Opcode); 4805 4806 for (Register R : Ops) { 4807 MIB.addUse(R); 4808 } 4809 4810 MIB.addUse(TDescr) 4811 .addImm(IsA16 ? 1 : 0) 4812 .cloneMemRefs(MI); 4813 4814 MI.eraseFromParent(); 4815 return true; 4816 } 4817 4818 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4819 MachineInstr &MI) const { 4820 MachineIRBuilder &B = Helper.MIRBuilder; 4821 MachineRegisterInfo &MRI = *B.getMRI(); 4822 4823 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4824 auto IntrID = MI.getIntrinsicID(); 4825 switch (IntrID) { 4826 case Intrinsic::amdgcn_if: 4827 case Intrinsic::amdgcn_else: { 4828 MachineInstr *Br = nullptr; 4829 MachineBasicBlock *UncondBrTarget = nullptr; 4830 bool Negated = false; 4831 if (MachineInstr *BrCond = 4832 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4833 const SIRegisterInfo *TRI 4834 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4835 4836 Register Def = MI.getOperand(1).getReg(); 4837 Register Use = MI.getOperand(3).getReg(); 4838 4839 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4840 4841 if (Negated) 4842 std::swap(CondBrTarget, UncondBrTarget); 4843 4844 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4845 if (IntrID == Intrinsic::amdgcn_if) { 4846 B.buildInstr(AMDGPU::SI_IF) 4847 .addDef(Def) 4848 .addUse(Use) 4849 .addMBB(UncondBrTarget); 4850 } else { 4851 B.buildInstr(AMDGPU::SI_ELSE) 4852 .addDef(Def) 4853 .addUse(Use) 4854 .addMBB(UncondBrTarget); 4855 } 4856 4857 if (Br) { 4858 Br->getOperand(0).setMBB(CondBrTarget); 4859 } else { 4860 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4861 // since we're swapping branch targets it needs to be reinserted. 
4862 // FIXME: IRTranslator should probably not do this 4863 B.buildBr(*CondBrTarget); 4864 } 4865 4866 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4867 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4868 MI.eraseFromParent(); 4869 BrCond->eraseFromParent(); 4870 return true; 4871 } 4872 4873 return false; 4874 } 4875 case Intrinsic::amdgcn_loop: { 4876 MachineInstr *Br = nullptr; 4877 MachineBasicBlock *UncondBrTarget = nullptr; 4878 bool Negated = false; 4879 if (MachineInstr *BrCond = 4880 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4881 const SIRegisterInfo *TRI 4882 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4883 4884 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4885 Register Reg = MI.getOperand(2).getReg(); 4886 4887 if (Negated) 4888 std::swap(CondBrTarget, UncondBrTarget); 4889 4890 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4891 B.buildInstr(AMDGPU::SI_LOOP) 4892 .addUse(Reg) 4893 .addMBB(UncondBrTarget); 4894 4895 if (Br) 4896 Br->getOperand(0).setMBB(CondBrTarget); 4897 else 4898 B.buildBr(*CondBrTarget); 4899 4900 MI.eraseFromParent(); 4901 BrCond->eraseFromParent(); 4902 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4903 return true; 4904 } 4905 4906 return false; 4907 } 4908 case Intrinsic::amdgcn_kernarg_segment_ptr: 4909 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4910 // This only makes sense to call in a kernel, so just lower to null. 4911 B.buildConstant(MI.getOperand(0).getReg(), 0); 4912 MI.eraseFromParent(); 4913 return true; 4914 } 4915 4916 return legalizePreloadedArgIntrin( 4917 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4918 case Intrinsic::amdgcn_implicitarg_ptr: 4919 return legalizeImplicitArgPtr(MI, MRI, B); 4920 case Intrinsic::amdgcn_workitem_id_x: 4921 return legalizePreloadedArgIntrin(MI, MRI, B, 4922 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4923 case Intrinsic::amdgcn_workitem_id_y: 4924 return legalizePreloadedArgIntrin(MI, MRI, B, 4925 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4926 case Intrinsic::amdgcn_workitem_id_z: 4927 return legalizePreloadedArgIntrin(MI, MRI, B, 4928 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4929 case Intrinsic::amdgcn_workgroup_id_x: 4930 return legalizePreloadedArgIntrin(MI, MRI, B, 4931 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4932 case Intrinsic::amdgcn_workgroup_id_y: 4933 return legalizePreloadedArgIntrin(MI, MRI, B, 4934 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4935 case Intrinsic::amdgcn_workgroup_id_z: 4936 return legalizePreloadedArgIntrin(MI, MRI, B, 4937 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4938 case Intrinsic::amdgcn_dispatch_ptr: 4939 return legalizePreloadedArgIntrin(MI, MRI, B, 4940 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4941 case Intrinsic::amdgcn_queue_ptr: 4942 return legalizePreloadedArgIntrin(MI, MRI, B, 4943 AMDGPUFunctionArgInfo::QUEUE_PTR); 4944 case Intrinsic::amdgcn_implicit_buffer_ptr: 4945 return legalizePreloadedArgIntrin( 4946 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4947 case Intrinsic::amdgcn_dispatch_id: 4948 return legalizePreloadedArgIntrin(MI, MRI, B, 4949 AMDGPUFunctionArgInfo::DISPATCH_ID); 4950 case Intrinsic::amdgcn_fdiv_fast: 4951 return legalizeFDIVFastIntrin(MI, MRI, B); 4952 case Intrinsic::amdgcn_is_shared: 4953 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4954 case Intrinsic::amdgcn_is_private: 4955 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4956 case Intrinsic::amdgcn_wavefrontsize: { 4957 
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
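  // The ray query operands must be repacked into the flat list of 32-bit
  // registers the BVH image instruction expects; see legalizeBVHIntrinsic.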
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}