//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
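/// For example, <5 x s16> (80 bits) is widened to <6 x s16>, while <2 x s16>
/// and <4 x s16> already fill whole 32-bit registers and are left alone.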
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 =
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 534 .legalFor({S32, S16, V2S16}) 535 .clampScalar(0, S16, S32) 536 .clampMaxNumElements(0, S16, 2) 537 .scalarize(0) 538 .widenScalarToNextPow2(0, 32); 539 540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 541 .legalFor({S32, S16, V2S16}) // Clamp modifier 542 .minScalarOrElt(0, S16) 543 .clampMaxNumElements(0, S16, 2) 544 .scalarize(0) 545 .widenScalarToNextPow2(0, 32) 546 .lower(); 547 } else if (ST.has16BitInsts()) { 548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 549 .legalFor({S32, S16}) 550 .clampScalar(0, S16, S32) 551 .scalarize(0) 552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 553 554 // Technically the saturating operations require clamp bit support, but this 555 // was introduced at the same time as 16-bit operations. 
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 557 .legalFor({S32, S16}) // Clamp modifier 558 .minScalar(0, S16) 559 .scalarize(0) 560 .widenScalarToNextPow2(0, 16) 561 .lower(); 562 563 // We're just lowering this, but it helps get a better result to try to 564 // coerce to the desired type first. 565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 566 .minScalar(0, S16) 567 .scalarize(0) 568 .lower(); 569 } else { 570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 571 .legalFor({S32}) 572 .clampScalar(0, S32, S32) 573 .scalarize(0); 574 575 if (ST.hasIntClamp()) { 576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 577 .legalFor({S32}) // Clamp modifier. 578 .scalarize(0) 579 .minScalarOrElt(0, S32) 580 .lower(); 581 } else { 582 // Clamp bit support was added in VI, along with 16-bit operations. 583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 // FIXME: DAG expansion gets better results. The widening uses the smaller 590 // range values and goes for the min/max lowering directly. 591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 592 .minScalar(0, S32) 593 .scalarize(0) 594 .lower(); 595 } 596 597 getActionDefinitionsBuilder( 598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 599 .customFor({S32, S64}) 600 .clampScalar(0, S32, S64) 601 .widenScalarToNextPow2(0, 32) 602 .scalarize(0); 603 604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 605 .legalFor({S32}) 606 .maxScalarOrElt(0, S32); 607 608 if (ST.hasVOP3PInsts()) { 609 Mulh 610 .clampMaxNumElements(0, S8, 2) 611 .lowerFor({V2S8}); 612 } 613 614 Mulh 615 .scalarize(0) 616 .lower(); 617 618 // Report legal for any types we can handle anywhere. For the cases only legal 619 // on the SALU, RegBankSelect will be able to re-legalize. 620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 622 .clampScalar(0, S32, S64) 623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 628 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 630 .legalFor({{S32, S1}, {S32, S32}}) 631 .minScalar(0, S32) 632 // TODO: .scalarize(0) 633 .lower(); 634 635 getActionDefinitionsBuilder(G_BITCAST) 636 // Don't worry about the size constraint. 637 .legalIf(all(isRegisterType(0), isRegisterType(1))) 638 .lower(); 639 640 641 getActionDefinitionsBuilder(G_CONSTANT) 642 .legalFor({S1, S32, S64, S16, GlobalPtr, 643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 644 .legalIf(isPointer(0)) 645 .clampScalar(0, S32, S64) 646 .widenScalarToNextPow2(0); 647 648 getActionDefinitionsBuilder(G_FCONSTANT) 649 .legalFor({S32, S64, S16}) 650 .clampScalar(0, S16, S64); 651 652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 653 .legalIf(isRegisterType(0)) 654 // s1 and s16 are special cases because they have legal operations on 655 // them, but don't really occupy registers in the normal way. 656 .legalFor({S1, S16}) 657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 658 .clampScalarOrElt(0, S32, MaxScalar) 659 .widenScalarToNextPow2(0, 32) 660 .clampMaxNumElements(0, S32, 16); 661 662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 663 664 // If the amount is divergent, we have to do a wave reduction to get the 665 // maximum value, so this is expanded during RegBankSelect. 
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 667 .legalFor({{PrivatePtr, S32}}); 668 669 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 670 .customIf(typeIsNot(0, PrivatePtr)); 671 672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 673 674 auto &FPOpActions = getActionDefinitionsBuilder( 675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 676 .legalFor({S32, S64}); 677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 678 .customFor({S32, S64}); 679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 680 .customFor({S32, S64}); 681 682 if (ST.has16BitInsts()) { 683 if (ST.hasVOP3PInsts()) 684 FPOpActions.legalFor({S16, V2S16}); 685 else 686 FPOpActions.legalFor({S16}); 687 688 TrigActions.customFor({S16}); 689 FDIVActions.customFor({S16}); 690 } 691 692 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 694 695 if (ST.hasVOP3PInsts()) { 696 MinNumMaxNum.customFor(FPTypesPK16) 697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 698 .clampMaxNumElements(0, S16, 2) 699 .clampScalar(0, S16, S64) 700 .scalarize(0); 701 } else if (ST.has16BitInsts()) { 702 MinNumMaxNum.customFor(FPTypes16) 703 .clampScalar(0, S16, S64) 704 .scalarize(0); 705 } else { 706 MinNumMaxNum.customFor(FPTypesBase) 707 .clampScalar(0, S32, S64) 708 .scalarize(0); 709 } 710 711 if (ST.hasVOP3PInsts()) 712 FPOpActions.clampMaxNumElements(0, S16, 2); 713 714 FPOpActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 TrigActions 719 .scalarize(0) 720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 721 722 FDIVActions 723 .scalarize(0) 724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 725 726 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 727 .legalFor(FPTypesPK16) 728 .clampMaxNumElements(0, S16, 2) 729 .scalarize(0) 730 .clampScalar(0, S16, S64); 731 732 if (ST.has16BitInsts()) { 733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 734 .legalFor({S32, S64, S16}) 735 .scalarize(0) 736 .clampScalar(0, S16, S64); 737 } else { 738 getActionDefinitionsBuilder(G_FSQRT) 739 .legalFor({S32, S64}) 740 .scalarize(0) 741 .clampScalar(0, S32, S64); 742 743 if (ST.hasFractBug()) { 744 getActionDefinitionsBuilder(G_FFLOOR) 745 .customFor({S64}) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } else { 750 getActionDefinitionsBuilder(G_FFLOOR) 751 .legalFor({S32, S64}) 752 .scalarize(0) 753 .clampScalar(0, S32, S64); 754 } 755 } 756 757 getActionDefinitionsBuilder(G_FPTRUNC) 758 .legalFor({{S32, S64}, {S16, S32}}) 759 .scalarize(0) 760 .lower(); 761 762 getActionDefinitionsBuilder(G_FPEXT) 763 .legalFor({{S64, S32}, {S32, S16}}) 764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 765 .scalarize(0); 766 767 getActionDefinitionsBuilder(G_FSUB) 768 // Use actual fsub instruction 769 .legalFor({S32}) 770 // Must use fadd + fneg 771 .lowerFor({S64, S16, V2S16}) 772 .scalarize(0) 773 .clampScalar(0, S32, S64); 774 775 // Whether this is legal depends on the floating point mode for the function. 
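  // Note: the mad/mac instructions flush denormals, so the custom lowering
  // keeps G_FMAD only when the function's denormal mode already flushes for
  // the type, and expands it otherwise.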
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 777 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 778 FMad.customFor({S32, S16}); 779 else if (ST.hasMadMacF32Insts()) 780 FMad.customFor({S32}); 781 else if (ST.hasMadF16()) 782 FMad.customFor({S16}); 783 FMad.scalarize(0) 784 .lower(); 785 786 auto &FRem = getActionDefinitionsBuilder(G_FREM); 787 if (ST.has16BitInsts()) { 788 FRem.customFor({S16, S32, S64}); 789 } else { 790 FRem.minScalar(0, S32) 791 .customFor({S32, S64}); 792 } 793 FRem.scalarize(0); 794 795 // TODO: Do we need to clamp maximum bitwidth? 796 getActionDefinitionsBuilder(G_TRUNC) 797 .legalIf(isScalar(0)) 798 .legalFor({{V2S16, V2S32}}) 799 .clampMaxNumElements(0, S16, 2) 800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 801 // situations (like an invalid implicit use), we don't want to infinite loop 802 // in the legalizer. 803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 804 .alwaysLegal(); 805 806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 808 {S32, S1}, {S64, S1}, {S16, S1}}) 809 .scalarize(0) 810 .clampScalar(0, S32, S64) 811 .widenScalarToNextPow2(1, 32); 812 813 // TODO: Split s1->s64 during regbankselect for VALU. 814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 816 .lowerFor({{S32, S64}}) 817 .lowerIf(typeIs(1, S1)) 818 .customFor({{S64, S64}}); 819 if (ST.has16BitInsts()) 820 IToFP.legalFor({{S16, S16}}); 821 IToFP.clampScalar(1, S32, S64) 822 .minScalar(0, S32) 823 .scalarize(0) 824 .widenScalarToNextPow2(1); 825 826 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 827 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 828 .customFor({{S64, S32}, {S64, S64}}) 829 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 830 if (ST.has16BitInsts()) 831 FPToI.legalFor({{S16, S16}}); 832 else 833 FPToI.minScalar(1, S32); 834 835 FPToI.minScalar(0, S32) 836 .widenScalarToNextPow2(0, 32) 837 .scalarize(0) 838 .lower(); 839 840 // Lower roundeven into G_FRINT 841 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 842 .scalarize(0) 843 .lower(); 844 845 if (ST.has16BitInsts()) { 846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 847 .legalFor({S16, S32, S64}) 848 .clampScalar(0, S16, S64) 849 .scalarize(0); 850 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 852 .legalFor({S32, S64}) 853 .clampScalar(0, S32, S64) 854 .scalarize(0); 855 } else { 856 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 857 .legalFor({S32}) 858 .customFor({S64}) 859 .clampScalar(0, S32, S64) 860 .scalarize(0); 861 } 862 863 getActionDefinitionsBuilder(G_PTR_ADD) 864 .legalIf(all(isPointer(0), sameSize(0, 1))) 865 .scalarize(0) 866 .scalarSameSizeAs(1, 0); 867 868 getActionDefinitionsBuilder(G_PTRMASK) 869 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 870 .scalarSameSizeAs(1, 0) 871 .scalarize(0); 872 873 auto &CmpBuilder = 874 getActionDefinitionsBuilder(G_ICMP) 875 // The compare output type differs based on the register bank of the output, 876 // so make both s1 and s32 legal. 877 // 878 // Scalar compares producing output in scc will be promoted to s32, as that 879 // is the allocatable register type that will be needed for the copy from 880 // scc. 
This will be promoted during RegBankSelect, and we assume something 881 // before that won't try to use s32 result types. 882 // 883 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 884 // bank. 885 .legalForCartesianProduct( 886 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 887 .legalForCartesianProduct( 888 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 889 if (ST.has16BitInsts()) { 890 CmpBuilder.legalFor({{S1, S16}}); 891 } 892 893 CmpBuilder 894 .widenScalarToNextPow2(1) 895 .clampScalar(1, S32, S64) 896 .scalarize(0) 897 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 898 899 getActionDefinitionsBuilder(G_FCMP) 900 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 901 .widenScalarToNextPow2(1) 902 .clampScalar(1, S32, S64) 903 .scalarize(0); 904 905 // FIXME: fpow has a selection pattern that should move to custom lowering. 906 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 907 if (ST.has16BitInsts()) 908 Exp2Ops.legalFor({S32, S16}); 909 else 910 Exp2Ops.legalFor({S32}); 911 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 912 Exp2Ops.scalarize(0); 913 914 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 915 if (ST.has16BitInsts()) 916 ExpOps.customFor({{S32}, {S16}}); 917 else 918 ExpOps.customFor({S32}); 919 ExpOps.clampScalar(0, MinScalarFPTy, S32) 920 .scalarize(0); 921 922 getActionDefinitionsBuilder(G_FPOWI) 923 .clampScalar(0, MinScalarFPTy, S32) 924 .lower(); 925 926 // The 64-bit versions produce 32-bit results, but only on the SALU. 927 getActionDefinitionsBuilder(G_CTPOP) 928 .legalFor({{S32, S32}, {S32, S64}}) 929 .clampScalar(0, S32, S32) 930 .clampScalar(1, S32, S64) 931 .scalarize(0) 932 .widenScalarToNextPow2(0, 32) 933 .widenScalarToNextPow2(1, 32); 934 935 // The hardware instructions return a different result on 0 than the generic 936 // instructions expect. The hardware produces -1, but these produce the 937 // bitwidth. 938 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 939 .scalarize(0) 940 .clampScalar(0, S32, S32) 941 .clampScalar(1, S32, S64) 942 .widenScalarToNextPow2(0, 32) 943 .widenScalarToNextPow2(1, 32) 944 .lower(); 945 946 // The 64-bit versions produce 32-bit results, but only on the SALU. 947 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 948 .legalFor({{S32, S32}, {S32, S64}}) 949 .clampScalar(0, S32, S32) 950 .clampScalar(1, S32, S64) 951 .scalarize(0) 952 .widenScalarToNextPow2(0, 32) 953 .widenScalarToNextPow2(1, 32); 954 955 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 956 // RegBankSelect. 957 getActionDefinitionsBuilder(G_BITREVERSE) 958 .legalFor({S32, S64}) 959 .clampScalar(0, S32, S64) 960 .scalarize(0) 961 .widenScalarToNextPow2(0); 962 963 if (ST.has16BitInsts()) { 964 getActionDefinitionsBuilder(G_BSWAP) 965 .legalFor({S16, S32, V2S16}) 966 .clampMaxNumElements(0, S16, 2) 967 // FIXME: Fixing non-power-of-2 before clamp is workaround for 968 // narrowScalar limitation. 
969 .widenScalarToNextPow2(0) 970 .clampScalar(0, S16, S32) 971 .scalarize(0); 972 973 if (ST.hasVOP3PInsts()) { 974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 975 .legalFor({S32, S16, V2S16}) 976 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 977 .clampMaxNumElements(0, S16, 2) 978 .minScalar(0, S16) 979 .widenScalarToNextPow2(0) 980 .scalarize(0) 981 .lower(); 982 } else { 983 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 984 .legalFor({S32, S16}) 985 .widenScalarToNextPow2(0) 986 .minScalar(0, S16) 987 .scalarize(0) 988 .lower(); 989 } 990 } else { 991 // TODO: Should have same legality without v_perm_b32 992 getActionDefinitionsBuilder(G_BSWAP) 993 .legalFor({S32}) 994 .lowerIf(scalarNarrowerThan(0, 32)) 995 // FIXME: Fixing non-power-of-2 before clamp is workaround for 996 // narrowScalar limitation. 997 .widenScalarToNextPow2(0) 998 .maxScalar(0, S32) 999 .scalarize(0) 1000 .lower(); 1001 1002 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1003 .legalFor({S32}) 1004 .minScalar(0, S32) 1005 .widenScalarToNextPow2(0) 1006 .scalarize(0) 1007 .lower(); 1008 } 1009 1010 getActionDefinitionsBuilder(G_INTTOPTR) 1011 // List the common cases 1012 .legalForCartesianProduct(AddrSpaces64, {S64}) 1013 .legalForCartesianProduct(AddrSpaces32, {S32}) 1014 .scalarize(0) 1015 // Accept any address space as long as the size matches 1016 .legalIf(sameSize(0, 1)) 1017 .widenScalarIf(smallerThan(1, 0), 1018 [](const LegalityQuery &Query) { 1019 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1020 }) 1021 .narrowScalarIf(largerThan(1, 0), 1022 [](const LegalityQuery &Query) { 1023 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1024 }); 1025 1026 getActionDefinitionsBuilder(G_PTRTOINT) 1027 // List the common cases 1028 .legalForCartesianProduct(AddrSpaces64, {S64}) 1029 .legalForCartesianProduct(AddrSpaces32, {S32}) 1030 .scalarize(0) 1031 // Accept any address space as long as the size matches 1032 .legalIf(sameSize(0, 1)) 1033 .widenScalarIf(smallerThan(0, 1), 1034 [](const LegalityQuery &Query) { 1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1036 }) 1037 .narrowScalarIf( 1038 largerThan(0, 1), 1039 [](const LegalityQuery &Query) { 1040 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1041 }); 1042 1043 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1044 .scalarize(0) 1045 .custom(); 1046 1047 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1048 bool IsLoad) -> bool { 1049 const LLT DstTy = Query.Types[0]; 1050 1051 // Split vector extloads. 1052 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1053 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1054 1055 if (MemSize < DstTy.getSizeInBits()) 1056 MemSize = std::max(MemSize, AlignBits); 1057 1058 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1059 return true; 1060 1061 const LLT PtrTy = Query.Types[1]; 1062 unsigned AS = PtrTy.getAddressSpace(); 1063 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1064 return true; 1065 1066 // Catch weird sized loads that don't evenly divide into the access sizes 1067 // TODO: May be able to widen depending on alignment etc. 1068 unsigned NumRegs = (MemSize + 31) / 32; 1069 if (NumRegs == 3) { 1070 if (!ST.hasDwordx3LoadStores()) 1071 return true; 1072 } else { 1073 // If the alignment allows, these should have been widened. 
1074 if (!isPowerOf2_32(NumRegs)) 1075 return true; 1076 } 1077 1078 if (AlignBits < MemSize) { 1079 const SITargetLowering *TLI = ST.getTargetLowering(); 1080 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1081 Align(AlignBits / 8)); 1082 } 1083 1084 return false; 1085 }; 1086 1087 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1088 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1089 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1090 1091 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1092 // LDS 1093 // TODO: Unsupported flat for SI. 1094 1095 for (unsigned Op : {G_LOAD, G_STORE}) { 1096 const bool IsStore = Op == G_STORE; 1097 1098 auto &Actions = getActionDefinitionsBuilder(Op); 1099 // Explicitly list some common cases. 1100 // TODO: Does this help compile time at all? 1101 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1102 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1103 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1104 {S64, GlobalPtr, S64, GlobalAlign32}, 1105 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1106 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1107 {S32, GlobalPtr, S8, GlobalAlign8}, 1108 {S32, GlobalPtr, S16, GlobalAlign16}, 1109 1110 {S32, LocalPtr, S32, 32}, 1111 {S64, LocalPtr, S64, 32}, 1112 {V2S32, LocalPtr, V2S32, 32}, 1113 {S32, LocalPtr, S8, 8}, 1114 {S32, LocalPtr, S16, 16}, 1115 {V2S16, LocalPtr, S32, 32}, 1116 1117 {S32, PrivatePtr, S32, 32}, 1118 {S32, PrivatePtr, S8, 8}, 1119 {S32, PrivatePtr, S16, 16}, 1120 {V2S16, PrivatePtr, S32, 32}, 1121 1122 {S32, ConstantPtr, S32, GlobalAlign32}, 1123 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1124 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1125 {S64, ConstantPtr, S64, GlobalAlign32}, 1126 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1127 Actions.legalIf( 1128 [=](const LegalityQuery &Query) -> bool { 1129 return isLoadStoreLegal(ST, Query); 1130 }); 1131 1132 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1133 // 64-bits. 1134 // 1135 // TODO: Should generalize bitcast action into coerce, which will also cover 1136 // inserting addrspacecasts. 1137 Actions.customIf(typeIs(1, Constant32Ptr)); 1138 1139 // Turn any illegal element vectors into something easier to deal 1140 // with. These will ultimately produce 32-bit scalar shifts to extract the 1141 // parts anyway. 1142 // 1143 // For odd 16-bit element vectors, prefer to split those into pieces with 1144 // 16-bit vector parts. 1145 Actions.bitcastIf( 1146 [=](const LegalityQuery &Query) -> bool { 1147 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1148 Query.MMODescrs[0].MemoryTy); 1149 }, bitcastToRegisterType(0)); 1150 1151 if (!IsStore) { 1152 // Widen suitably aligned loads by loading extra bytes. The standard 1153 // legalization actions can't properly express widening memory operands. 
1154 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1155 return shouldWidenLoad(ST, Query, G_LOAD); 1156 }); 1157 } 1158 1159 // FIXME: load/store narrowing should be moved to lower action 1160 Actions 1161 .narrowScalarIf( 1162 [=](const LegalityQuery &Query) -> bool { 1163 return !Query.Types[0].isVector() && 1164 needToSplitMemOp(Query, Op == G_LOAD); 1165 }, 1166 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1167 const LLT DstTy = Query.Types[0]; 1168 const LLT PtrTy = Query.Types[1]; 1169 1170 const unsigned DstSize = DstTy.getSizeInBits(); 1171 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1172 1173 // Split extloads. 1174 if (DstSize > MemSize) 1175 return std::make_pair(0, LLT::scalar(MemSize)); 1176 1177 if (!isPowerOf2_32(DstSize)) { 1178 // We're probably decomposing an odd sized store. Try to split 1179 // to the widest type. TODO: Account for alignment. As-is it 1180 // should be OK, since the new parts will be further legalized. 1181 unsigned FloorSize = PowerOf2Floor(DstSize); 1182 return std::make_pair(0, LLT::scalar(FloorSize)); 1183 } 1184 1185 if (DstSize > 32 && (DstSize % 32 != 0)) { 1186 // FIXME: Need a way to specify non-extload of larger size if 1187 // suitably aligned. 1188 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1189 } 1190 1191 unsigned MaxSize = maxSizeForAddrSpace(ST, 1192 PtrTy.getAddressSpace(), 1193 Op == G_LOAD); 1194 if (MemSize > MaxSize) 1195 return std::make_pair(0, LLT::scalar(MaxSize)); 1196 1197 unsigned Align = Query.MMODescrs[0].AlignInBits; 1198 return std::make_pair(0, LLT::scalar(Align)); 1199 }) 1200 .fewerElementsIf( 1201 [=](const LegalityQuery &Query) -> bool { 1202 return Query.Types[0].isVector() && 1203 needToSplitMemOp(Query, Op == G_LOAD); 1204 }, 1205 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1206 const LLT DstTy = Query.Types[0]; 1207 const LLT PtrTy = Query.Types[1]; 1208 1209 LLT EltTy = DstTy.getElementType(); 1210 unsigned MaxSize = maxSizeForAddrSpace(ST, 1211 PtrTy.getAddressSpace(), 1212 Op == G_LOAD); 1213 1214 // FIXME: Handle widened to power of 2 results better. This ends 1215 // up scalarizing. 1216 // FIXME: 3 element stores scalarized on SI 1217 1218 // Split if it's too large for the address space. 1219 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1220 if (MemSize > MaxSize) { 1221 unsigned NumElts = DstTy.getNumElements(); 1222 unsigned EltSize = EltTy.getSizeInBits(); 1223 1224 if (MaxSize % EltSize == 0) { 1225 return std::make_pair( 1226 0, LLT::scalarOrVector( 1227 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1228 } 1229 1230 unsigned NumPieces = MemSize / MaxSize; 1231 1232 // FIXME: Refine when odd breakdowns handled 1233 // The scalars will need to be re-legalized. 1234 if (NumPieces == 1 || NumPieces >= NumElts || 1235 NumElts % NumPieces != 0) 1236 return std::make_pair(0, EltTy); 1237 1238 return std::make_pair( 1239 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1240 } 1241 1242 // FIXME: We could probably handle weird extending loads better. 1243 if (DstTy.getSizeInBits() > MemSize) 1244 return std::make_pair(0, EltTy); 1245 1246 unsigned EltSize = EltTy.getSizeInBits(); 1247 unsigned DstSize = DstTy.getSizeInBits(); 1248 if (!isPowerOf2_32(DstSize)) { 1249 // We're probably decomposing an odd sized store. Try to split 1250 // to the widest type. TODO: Account for alignment. As-is it 1251 // should be OK, since the new parts will be further legalized. 
1252 unsigned FloorSize = PowerOf2Floor(DstSize); 1253 return std::make_pair( 1254 0, LLT::scalarOrVector( 1255 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1256 } 1257 1258 // Need to split because of alignment. 1259 unsigned Align = Query.MMODescrs[0].AlignInBits; 1260 if (EltSize > Align && 1261 (EltSize / Align < DstTy.getNumElements())) { 1262 return std::make_pair( 1263 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1264 } 1265 1266 // May need relegalization for the scalars. 1267 return std::make_pair(0, EltTy); 1268 }) 1269 .lowerIfMemSizeNotPow2() 1270 .minScalar(0, S32) 1271 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1272 .widenScalarToNextPow2(0) 1273 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1274 .lower(); 1275 } 1276 1277 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1278 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1279 {S32, GlobalPtr, S16, 2 * 8}, 1280 {S32, LocalPtr, S8, 8}, 1281 {S32, LocalPtr, S16, 16}, 1282 {S32, PrivatePtr, S8, 8}, 1283 {S32, PrivatePtr, S16, 16}, 1284 {S32, ConstantPtr, S8, 8}, 1285 {S32, ConstantPtr, S16, 2 * 8}}) 1286 .legalIf( 1287 [=](const LegalityQuery &Query) -> bool { 1288 return isLoadStoreLegal(ST, Query); 1289 }); 1290 1291 if (ST.hasFlatAddressSpace()) { 1292 ExtLoads.legalForTypesWithMemDesc( 1293 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1294 } 1295 1296 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1297 // 64-bits. 1298 // 1299 // TODO: Should generalize bitcast action into coerce, which will also cover 1300 // inserting addrspacecasts. 1301 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1302 1303 ExtLoads.clampScalar(0, S32, S32) 1304 .widenScalarToNextPow2(0) 1305 .unsupportedIfMemSizeNotPow2() 1306 .lower(); 1307 1308 auto &Atomics = getActionDefinitionsBuilder( 1309 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1310 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1311 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1312 G_ATOMICRMW_UMIN}) 1313 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1314 {S64, GlobalPtr}, {S64, LocalPtr}, 1315 {S32, RegionPtr}, {S64, RegionPtr}}); 1316 if (ST.hasFlatAddressSpace()) { 1317 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1318 } 1319 1320 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1321 if (ST.hasLDSFPAtomics()) { 1322 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1323 if (ST.hasGFX90AInsts()) 1324 Atomic.legalFor({{S64, LocalPtr}}); 1325 } 1326 if (ST.hasAtomicFaddInsts()) 1327 Atomic.legalFor({{S32, GlobalPtr}}); 1328 1329 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1330 // demarshalling 1331 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1332 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1333 {S32, FlatPtr}, {S64, FlatPtr}}) 1334 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1335 {S32, RegionPtr}, {S64, RegionPtr}}); 1336 // TODO: Pointer types, any 32-bit or 64-bit vector 1337 1338 // Condition should be s32 for scalar, s1 for vector. 
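  // As with G_ICMP above, a scalar condition produced in scc is promoted to
  // s32 (the allocatable type for the copy from scc), while conditions in the
  // VCC bank use s1.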
1339 getActionDefinitionsBuilder(G_SELECT) 1340 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1341 LocalPtr, FlatPtr, PrivatePtr, 1342 LLT::fixed_vector(2, LocalPtr), 1343 LLT::fixed_vector(2, PrivatePtr)}, 1344 {S1, S32}) 1345 .clampScalar(0, S16, S64) 1346 .scalarize(1) 1347 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1348 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1349 .clampMaxNumElements(0, S32, 2) 1350 .clampMaxNumElements(0, LocalPtr, 2) 1351 .clampMaxNumElements(0, PrivatePtr, 2) 1352 .scalarize(0) 1353 .widenScalarToNextPow2(0) 1354 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1355 1356 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1357 // be more flexible with the shift amount type. 1358 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1359 .legalFor({{S32, S32}, {S64, S32}}); 1360 if (ST.has16BitInsts()) { 1361 if (ST.hasVOP3PInsts()) { 1362 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1363 .clampMaxNumElements(0, S16, 2); 1364 } else 1365 Shifts.legalFor({{S16, S16}}); 1366 1367 // TODO: Support 16-bit shift amounts for all types 1368 Shifts.widenScalarIf( 1369 [=](const LegalityQuery &Query) { 1370 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1371 // 32-bit amount. 1372 const LLT ValTy = Query.Types[0]; 1373 const LLT AmountTy = Query.Types[1]; 1374 return ValTy.getSizeInBits() <= 16 && 1375 AmountTy.getSizeInBits() < 16; 1376 }, changeTo(1, S16)); 1377 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1378 Shifts.clampScalar(1, S32, S32); 1379 Shifts.clampScalar(0, S16, S64); 1380 Shifts.widenScalarToNextPow2(0, 16); 1381 1382 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1383 .minScalar(0, S16) 1384 .scalarize(0) 1385 .lower(); 1386 } else { 1387 // Make sure we legalize the shift amount type first, as the general 1388 // expansion for the shifted type will produce much worse code if it hasn't 1389 // been truncated already. 1390 Shifts.clampScalar(1, S32, S32); 1391 Shifts.clampScalar(0, S32, S64); 1392 Shifts.widenScalarToNextPow2(0, 32); 1393 1394 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1395 .minScalar(0, S32) 1396 .scalarize(0) 1397 .lower(); 1398 } 1399 Shifts.scalarize(0); 1400 1401 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1402 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1403 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1404 unsigned IdxTypeIdx = 2; 1405 1406 getActionDefinitionsBuilder(Op) 1407 .customIf([=](const LegalityQuery &Query) { 1408 const LLT EltTy = Query.Types[EltTypeIdx]; 1409 const LLT VecTy = Query.Types[VecTypeIdx]; 1410 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1411 const unsigned EltSize = EltTy.getSizeInBits(); 1412 return (EltSize == 32 || EltSize == 64) && 1413 VecTy.getSizeInBits() % 32 == 0 && 1414 VecTy.getSizeInBits() <= MaxRegisterSize && 1415 IdxTy.getSizeInBits() == 32; 1416 }) 1417 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1418 bitcastToVectorElement32(VecTypeIdx)) 1419 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1420 .bitcastIf( 1421 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1422 [=](const LegalityQuery &Query) { 1423 // For > 64-bit element types, try to turn this into a 64-bit 1424 // element vector since we may be able to do better indexing 1425 // if this is scalar. If not, fall back to 32. 
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1521 if (ST.hasVOP3PInsts()) { 1522 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1523 .customFor({V2S16, V2S16}) 1524 .lower(); 1525 } else 1526 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1527 1528 // Merge/Unmerge 1529 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1530 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1531 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1532 1533 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1534 const LLT Ty = Query.Types[TypeIdx]; 1535 if (Ty.isVector()) { 1536 const LLT &EltTy = Ty.getElementType(); 1537 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1538 return true; 1539 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1540 return true; 1541 } 1542 return false; 1543 }; 1544 1545 auto &Builder = getActionDefinitionsBuilder(Op) 1546 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1547 .lowerFor({{S16, V2S16}}) 1548 .lowerIf([=](const LegalityQuery &Query) { 1549 const LLT BigTy = Query.Types[BigTyIdx]; 1550 return BigTy.getSizeInBits() == 32; 1551 }) 1552 // Try to widen to s16 first for small types. 1553 // TODO: Only do this on targets with legal s16 shifts 1554 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1555 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1556 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1557 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1558 elementTypeIs(1, S16)), 1559 changeTo(1, V2S16)) 1560 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1561 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1562 // valid. 1563 .clampScalar(LitTyIdx, S32, S512) 1564 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1565 // Break up vectors with weird elements into scalars 1566 .fewerElementsIf( 1567 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1568 scalarize(0)) 1569 .fewerElementsIf( 1570 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1571 scalarize(1)) 1572 .clampScalar(BigTyIdx, S32, MaxScalar); 1573 1574 if (Op == G_MERGE_VALUES) { 1575 Builder.widenScalarIf( 1576 // TODO: Use 16-bit shifts if legal for 8-bit values? 1577 [=](const LegalityQuery &Query) { 1578 const LLT Ty = Query.Types[LitTyIdx]; 1579 return Ty.getSizeInBits() < 32; 1580 }, 1581 changeTo(LitTyIdx, S32)); 1582 } 1583 1584 Builder.widenScalarIf( 1585 [=](const LegalityQuery &Query) { 1586 const LLT Ty = Query.Types[BigTyIdx]; 1587 return !isPowerOf2_32(Ty.getSizeInBits()) && 1588 Ty.getSizeInBits() % 16 != 0; 1589 }, 1590 [=](const LegalityQuery &Query) { 1591 // Pick the next power of 2, or a multiple of 64 over 128. 1592 // Whichever is smaller. 1593 const LLT &Ty = Query.Types[BigTyIdx]; 1594 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1595 if (NewSizeInBits >= 256) { 1596 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1597 if (RoundedTo < NewSizeInBits) 1598 NewSizeInBits = RoundedTo; 1599 } 1600 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1601 }) 1602 // Any vectors left are the wrong size. Scalarize them. 1603 .scalarize(0) 1604 .scalarize(1); 1605 } 1606 1607 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1608 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case
TargetOpcode::G_FMAXNUM_IEEE: 1721 return legalizeMinNumMaxNum(Helper, MI); 1722 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1723 return legalizeExtractVectorElt(MI, MRI, B); 1724 case TargetOpcode::G_INSERT_VECTOR_ELT: 1725 return legalizeInsertVectorElt(MI, MRI, B); 1726 case TargetOpcode::G_SHUFFLE_VECTOR: 1727 return legalizeShuffleVector(MI, MRI, B); 1728 case TargetOpcode::G_FSIN: 1729 case TargetOpcode::G_FCOS: 1730 return legalizeSinCos(MI, MRI, B); 1731 case TargetOpcode::G_GLOBAL_VALUE: 1732 return legalizeGlobalValue(MI, MRI, B); 1733 case TargetOpcode::G_LOAD: 1734 case TargetOpcode::G_SEXTLOAD: 1735 case TargetOpcode::G_ZEXTLOAD: 1736 return legalizeLoad(Helper, MI); 1737 case TargetOpcode::G_FMAD: 1738 return legalizeFMad(MI, MRI, B); 1739 case TargetOpcode::G_FDIV: 1740 return legalizeFDIV(MI, MRI, B); 1741 case TargetOpcode::G_UDIV: 1742 case TargetOpcode::G_UREM: 1743 case TargetOpcode::G_UDIVREM: 1744 return legalizeUnsignedDIV_REM(MI, MRI, B); 1745 case TargetOpcode::G_SDIV: 1746 case TargetOpcode::G_SREM: 1747 case TargetOpcode::G_SDIVREM: 1748 return legalizeSignedDIV_REM(MI, MRI, B); 1749 case TargetOpcode::G_ATOMIC_CMPXCHG: 1750 return legalizeAtomicCmpXChg(MI, MRI, B); 1751 case TargetOpcode::G_FLOG: 1752 return legalizeFlog(MI, B, numbers::ln2f); 1753 case TargetOpcode::G_FLOG10: 1754 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1755 case TargetOpcode::G_FEXP: 1756 return legalizeFExp(MI, B); 1757 case TargetOpcode::G_FPOW: 1758 return legalizeFPow(MI, B); 1759 case TargetOpcode::G_FFLOOR: 1760 return legalizeFFloor(MI, MRI, B); 1761 case TargetOpcode::G_BUILD_VECTOR: 1762 return legalizeBuildVector(MI, MRI, B); 1763 default: 1764 return false; 1765 } 1766 1767 llvm_unreachable("expected switch to return"); 1768 } 1769 1770 Register AMDGPULegalizerInfo::getSegmentAperture( 1771 unsigned AS, 1772 MachineRegisterInfo &MRI, 1773 MachineIRBuilder &B) const { 1774 MachineFunction &MF = B.getMF(); 1775 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1776 const LLT S32 = LLT::scalar(32); 1777 1778 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1779 1780 if (ST.hasApertureRegs()) { 1781 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1782 // getreg. 1783 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1784 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1785 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1786 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1787 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1788 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1789 unsigned Encoding = 1790 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1791 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1792 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1793 1794 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1795 1796 B.buildInstr(AMDGPU::S_GETREG_B32) 1797 .addDef(GetReg) 1798 .addImm(Encoding); 1799 MRI.setType(GetReg, S32); 1800 1801 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1802 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1803 } 1804 1805 Register QueuePtr = MRI.createGenericVirtualRegister( 1806 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1807 1808 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1809 return Register(); 1810 1811 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1812 // private_segment_aperture_base_hi. 1813 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 
0x40 : 0x44; 1814 1815 // TODO: can we be smarter about machine pointer info? 1816 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1817 MachineMemOperand *MMO = MF.getMachineMemOperand( 1818 PtrInfo, 1819 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1820 MachineMemOperand::MOInvariant, 1821 4, commonAlignment(Align(64), StructOffset)); 1822 1823 Register LoadAddr; 1824 1825 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1826 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1827 } 1828 1829 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1830 MachineInstr &MI, MachineRegisterInfo &MRI, 1831 MachineIRBuilder &B) const { 1832 MachineFunction &MF = B.getMF(); 1833 1834 const LLT S32 = LLT::scalar(32); 1835 Register Dst = MI.getOperand(0).getReg(); 1836 Register Src = MI.getOperand(1).getReg(); 1837 1838 LLT DstTy = MRI.getType(Dst); 1839 LLT SrcTy = MRI.getType(Src); 1840 unsigned DestAS = DstTy.getAddressSpace(); 1841 unsigned SrcAS = SrcTy.getAddressSpace(); 1842 1843 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1844 // vector element. 1845 assert(!DstTy.isVector()); 1846 1847 const AMDGPUTargetMachine &TM 1848 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1849 1850 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1851 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1852 return true; 1853 } 1854 1855 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1856 // Truncate. 1857 B.buildExtract(Dst, Src, 0); 1858 MI.eraseFromParent(); 1859 return true; 1860 } 1861 1862 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1863 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1864 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1865 1866 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1867 // another. Merge operands are required to be the same type, but creating an 1868 // extra ptrtoint would be kind of pointless. 1869 auto HighAddr = B.buildConstant( 1870 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1871 B.buildMerge(Dst, {Src, HighAddr}); 1872 MI.eraseFromParent(); 1873 return true; 1874 } 1875 1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1877 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1878 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1879 unsigned NullVal = TM.getNullPointerValue(DestAS); 1880 1881 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1882 auto FlatNull = B.buildConstant(SrcTy, 0); 1883 1884 // Extract low 32-bits of the pointer. 1885 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1886 1887 auto CmpRes = 1888 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1889 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1890 1891 MI.eraseFromParent(); 1892 return true; 1893 } 1894 1895 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1896 return false; 1897 1898 if (!ST.hasFlatAddressSpace()) 1899 return false; 1900 1901 auto SegmentNull = 1902 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1903 auto FlatNull = 1904 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1905 1906 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1907 if (!ApertureReg.isValid()) 1908 return false; 1909 1910 auto CmpRes = 1911 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1912 1913 // Coerce the type of the low half of the result so we can use merge_values. 
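// Scalar sketch of the flat address assembled by the merge below, assuming
// the aperture register holds the high 32 bits of the flat address for this
// segment (illustrative only):
//
//   uint64_t segmentToFlat(uint32_t SegmentOffset, uint32_t ApertureHi) {
//     return ((uint64_t)ApertureHi << 32) | SegmentOffset;
//   }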
1914 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1915 1916 // TODO: Should we allow mismatched types but matching sizes in merges to 1917 // avoid the ptrtoint? 1918 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1919 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1920 1921 MI.eraseFromParent(); 1922 return true; 1923 } 1924 1925 bool AMDGPULegalizerInfo::legalizeFrint( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 MachineIRBuilder &B) const { 1928 Register Src = MI.getOperand(1).getReg(); 1929 LLT Ty = MRI.getType(Src); 1930 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1931 1932 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1933 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1934 1935 auto C1 = B.buildFConstant(Ty, C1Val); 1936 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1937 1938 // TODO: Should this propagate fast-math-flags? 1939 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1940 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1941 1942 auto C2 = B.buildFConstant(Ty, C2Val); 1943 auto Fabs = B.buildFAbs(Ty, Src); 1944 1945 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1946 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1947 MI.eraseFromParent(); 1948 return true; 1949 } 1950 1951 bool AMDGPULegalizerInfo::legalizeFceil( 1952 MachineInstr &MI, MachineRegisterInfo &MRI, 1953 MachineIRBuilder &B) const { 1954 1955 const LLT S1 = LLT::scalar(1); 1956 const LLT S64 = LLT::scalar(64); 1957 1958 Register Src = MI.getOperand(1).getReg(); 1959 assert(MRI.getType(Src) == S64); 1960 1961 // result = trunc(src) 1962 // if (src > 0.0 && src != result) 1963 // result += 1.0 1964 1965 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1966 1967 const auto Zero = B.buildFConstant(S64, 0.0); 1968 const auto One = B.buildFConstant(S64, 1.0); 1969 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1970 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1971 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1972 auto Add = B.buildSelect(S64, And, One, Zero); 1973 1974 // TODO: Should this propagate fast-math-flags? 
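// Scalar model of this expansion, i.e. the trunc/compares above and the final
// add below (a sketch for clarity, assuming ordinary IEEE doubles):
//
//   double ceilViaTrunc(double Src) {
//     double T = trunc(Src);
//     return T + ((Src > 0.0 && Src != T) ? 1.0 : 0.0);
//   }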
1975 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1976 return true; 1977 } 1978 1979 bool AMDGPULegalizerInfo::legalizeFrem( 1980 MachineInstr &MI, MachineRegisterInfo &MRI, 1981 MachineIRBuilder &B) const { 1982 Register DstReg = MI.getOperand(0).getReg(); 1983 Register Src0Reg = MI.getOperand(1).getReg(); 1984 Register Src1Reg = MI.getOperand(2).getReg(); 1985 auto Flags = MI.getFlags(); 1986 LLT Ty = MRI.getType(DstReg); 1987 1988 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1989 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1990 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1991 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1992 MI.eraseFromParent(); 1993 return true; 1994 } 1995 1996 static MachineInstrBuilder extractF64Exponent(Register Hi, 1997 MachineIRBuilder &B) { 1998 const unsigned FractBits = 52; 1999 const unsigned ExpBits = 11; 2000 LLT S32 = LLT::scalar(32); 2001 2002 auto Const0 = B.buildConstant(S32, FractBits - 32); 2003 auto Const1 = B.buildConstant(S32, ExpBits); 2004 2005 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2006 .addUse(Hi) 2007 .addUse(Const0.getReg(0)) 2008 .addUse(Const1.getReg(0)); 2009 2010 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, 2015 MachineIRBuilder &B) const { 2016 const LLT S1 = LLT::scalar(1); 2017 const LLT S32 = LLT::scalar(32); 2018 const LLT S64 = LLT::scalar(64); 2019 2020 Register Src = MI.getOperand(1).getReg(); 2021 assert(MRI.getType(Src) == S64); 2022 2023 // TODO: Should this use extract since the low half is unused? 2024 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2025 Register Hi = Unmerge.getReg(1); 2026 2027 // Extract the upper half, since this is where we will find the sign and 2028 // exponent. 2029 auto Exp = extractF64Exponent(Hi, B); 2030 2031 const unsigned FractBits = 52; 2032 2033 // Extract the sign bit. 2034 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2035 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2036 2037 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2038 2039 const auto Zero32 = B.buildConstant(S32, 0); 2040 2041 // Extend back to 64-bits. 2042 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2043 2044 auto Shr = B.buildAShr(S64, FractMask, Exp); 2045 auto Not = B.buildNot(S64, Shr); 2046 auto Tmp0 = B.buildAnd(S64, Src, Not); 2047 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2048 2049 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2050 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2051 2052 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2053 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2054 MI.eraseFromParent(); 2055 return true; 2056 } 2057 2058 bool AMDGPULegalizerInfo::legalizeITOFP( 2059 MachineInstr &MI, MachineRegisterInfo &MRI, 2060 MachineIRBuilder &B, bool Signed) const { 2061 2062 Register Dst = MI.getOperand(0).getReg(); 2063 Register Src = MI.getOperand(1).getReg(); 2064 2065 const LLT S64 = LLT::scalar(64); 2066 const LLT S32 = LLT::scalar(32); 2067 2068 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2069 2070 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2071 2072 auto CvtHi = Signed ? 
2073 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2074 B.buildUITOFP(S64, Unmerge.getReg(1)); 2075 2076 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2077 2078 auto ThirtyTwo = B.buildConstant(S32, 32); 2079 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2080 .addUse(CvtHi.getReg(0)) 2081 .addUse(ThirtyTwo.getReg(0)); 2082 2083 // TODO: Should this propagate fast-math-flags? 2084 B.buildFAdd(Dst, LdExp, CvtLo); 2085 MI.eraseFromParent(); 2086 return true; 2087 } 2088 2089 // TODO: Copied from DAG implementation. Verify logic and document how this 2090 // actually works. 2091 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2092 MachineRegisterInfo &MRI, 2093 MachineIRBuilder &B, 2094 bool Signed) const { 2095 2096 Register Dst = MI.getOperand(0).getReg(); 2097 Register Src = MI.getOperand(1).getReg(); 2098 2099 const LLT S64 = LLT::scalar(64); 2100 const LLT S32 = LLT::scalar(32); 2101 2102 const LLT SrcLT = MRI.getType(Src); 2103 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2104 2105 unsigned Flags = MI.getFlags(); 2106 2107 // The basic idea of converting a floating point number into a pair of 32-bit 2108 // integers is illustrated as follows: 2109 // 2110 // tf := trunc(val); 2111 // hif := floor(tf * 2^-32); 2112 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2113 // hi := fptoi(hif); 2114 // lo := fptoi(lof); 2115 // 2116 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2117 MachineInstrBuilder Sign; 2118 if (Signed && SrcLT == S32) { 2119 // However, a 32-bit floating point number has only 23 bits mantissa and 2120 // it's not enough to hold all the significant bits of `lof` if val is 2121 // negative. To avoid the loss of precision, We need to take the absolute 2122 // value after truncating and flip the result back based on the original 2123 // signedness. 2124 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2125 Trunc = B.buildFAbs(S32, Trunc, Flags); 2126 } 2127 MachineInstrBuilder K0, K1; 2128 if (SrcLT == S64) { 2129 K0 = B.buildFConstant(S64, 2130 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2131 K1 = B.buildFConstant(S64, 2132 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2133 } else { 2134 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2135 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2136 } 2137 2138 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2139 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2140 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2141 2142 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2143 : B.buildFPTOUI(S32, FloorMul); 2144 auto Lo = B.buildFPTOUI(S32, Fma); 2145 2146 if (Signed && SrcLT == S32) { 2147 // Flip the result based on the signedness, which is either all 0s or 1s. 
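// Scalar sketch of the conditional negation performed below, once Sign has
// been widened to a 64-bit all-ones/all-zeros mask (helper name is
// illustrative only):
//
//   int64_t applySign(uint64_t Magnitude, int64_t Sign) {
//     return ((int64_t)Magnitude ^ Sign) - Sign;  // negates iff Sign == -1
//   }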
2148 Sign = B.buildMerge(S64, {Sign, Sign}); 2149 // r := xor({lo, hi}, sign) - sign; 2150 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2151 } else 2152 B.buildMerge(Dst, {Lo, Hi}); 2153 MI.eraseFromParent(); 2154 2155 return true; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2159 MachineInstr &MI) const { 2160 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2161 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2162 2163 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2164 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2165 2166 // With ieee_mode disabled, the instructions have the correct behavior 2167 // already for G_FMINNUM/G_FMAXNUM 2168 if (!MFI->getMode().IEEE) 2169 return !IsIEEEOp; 2170 2171 if (IsIEEEOp) 2172 return true; 2173 2174 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2175 } 2176 2177 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2178 MachineInstr &MI, MachineRegisterInfo &MRI, 2179 MachineIRBuilder &B) const { 2180 // TODO: Should move some of this into LegalizerHelper. 2181 2182 // TODO: Promote dynamic indexing of s16 to s32 2183 2184 // FIXME: Artifact combiner probably should have replaced the truncated 2185 // constant before this, so we shouldn't need 2186 // getConstantVRegValWithLookThrough. 2187 Optional<ValueAndVReg> MaybeIdxVal = 2188 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2189 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2190 return true; 2191 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2192 2193 Register Dst = MI.getOperand(0).getReg(); 2194 Register Vec = MI.getOperand(1).getReg(); 2195 2196 LLT VecTy = MRI.getType(Vec); 2197 LLT EltTy = VecTy.getElementType(); 2198 assert(EltTy == MRI.getType(Dst)); 2199 2200 if (IdxVal < VecTy.getNumElements()) 2201 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2202 else 2203 B.buildUndef(Dst); 2204 2205 MI.eraseFromParent(); 2206 return true; 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2210 MachineInstr &MI, MachineRegisterInfo &MRI, 2211 MachineIRBuilder &B) const { 2212 // TODO: Should move some of this into LegalizerHelper. 2213 2214 // TODO: Promote dynamic indexing of s16 to s32 2215 2216 // FIXME: Artifact combiner probably should have replaced the truncated 2217 // constant before this, so we shouldn't need 2218 // getConstantVRegValWithLookThrough. 2219 Optional<ValueAndVReg> MaybeIdxVal = 2220 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2221 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2222 return true; 2223 2224 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2225 Register Dst = MI.getOperand(0).getReg(); 2226 Register Vec = MI.getOperand(1).getReg(); 2227 Register Ins = MI.getOperand(2).getReg(); 2228 2229 LLT VecTy = MRI.getType(Vec); 2230 LLT EltTy = VecTy.getElementType(); 2231 assert(EltTy == MRI.getType(Ins)); 2232 2233 if (IdxVal < VecTy.getNumElements()) 2234 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2235 else 2236 B.buildUndef(Dst); 2237 2238 MI.eraseFromParent(); 2239 return true; 2240 } 2241 2242 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2243 MachineInstr &MI, MachineRegisterInfo &MRI, 2244 MachineIRBuilder &B) const { 2245 const LLT V2S16 = LLT::fixed_vector(2, 16); 2246 2247 Register Dst = MI.getOperand(0).getReg(); 2248 Register Src0 = MI.getOperand(1).getReg(); 2249 LLT DstTy = MRI.getType(Dst); 2250 LLT SrcTy = MRI.getType(Src0); 2251 2252 if (SrcTy == V2S16 && DstTy == V2S16 && 2253 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2254 return true; 2255 2256 MachineIRBuilder HelperBuilder(MI); 2257 GISelObserverWrapper DummyObserver; 2258 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2259 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2260 } 2261 2262 bool AMDGPULegalizerInfo::legalizeSinCos( 2263 MachineInstr &MI, MachineRegisterInfo &MRI, 2264 MachineIRBuilder &B) const { 2265 2266 Register DstReg = MI.getOperand(0).getReg(); 2267 Register SrcReg = MI.getOperand(1).getReg(); 2268 LLT Ty = MRI.getType(DstReg); 2269 unsigned Flags = MI.getFlags(); 2270 2271 Register TrigVal; 2272 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2273 if (ST.hasTrigReducedRange()) { 2274 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2275 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2276 .addUse(MulVal.getReg(0)) 2277 .setMIFlags(Flags).getReg(0); 2278 } else 2279 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2280 2281 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2282 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2283 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2284 .addUse(TrigVal) 2285 .setMIFlags(Flags); 2286 MI.eraseFromParent(); 2287 return true; 2288 } 2289 2290 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2291 MachineIRBuilder &B, 2292 const GlobalValue *GV, 2293 int64_t Offset, 2294 unsigned GAFlags) const { 2295 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2296 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2297 // to the following code sequence: 2298 // 2299 // For constant address space: 2300 // s_getpc_b64 s[0:1] 2301 // s_add_u32 s0, s0, $symbol 2302 // s_addc_u32 s1, s1, 0 2303 // 2304 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2305 // a fixup or relocation is emitted to replace $symbol with a literal 2306 // constant, which is a pc-relative offset from the encoding of the $symbol 2307 // operand to the global variable. 
2308 // 2309 // For global address space: 2310 // s_getpc_b64 s[0:1] 2311 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2312 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2313 // 2314 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2315 // fixups or relocations are emitted to replace $symbol@*@lo and 2316 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2317 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2318 // operand to the global variable. 2319 // 2320 // What we want here is an offset from the value returned by s_getpc 2321 // (which is the address of the s_add_u32 instruction) to the global 2322 // variable, but since the encoding of $symbol starts 4 bytes after the start 2323 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2324 // small. This requires us to add 4 to the global variable offset in order to 2325 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2326 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2327 // instruction. 2328 2329 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2330 2331 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2332 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2333 2334 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2335 .addDef(PCReg); 2336 2337 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2338 if (GAFlags == SIInstrInfo::MO_NONE) 2339 MIB.addImm(0); 2340 else 2341 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2342 2343 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2344 2345 if (PtrTy.getSizeInBits() == 32) 2346 B.buildExtract(DstReg, PCReg, 0); 2347 return true; 2348 } 2349 2350 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2351 MachineInstr &MI, MachineRegisterInfo &MRI, 2352 MachineIRBuilder &B) const { 2353 Register DstReg = MI.getOperand(0).getReg(); 2354 LLT Ty = MRI.getType(DstReg); 2355 unsigned AS = Ty.getAddressSpace(); 2356 2357 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2358 MachineFunction &MF = B.getMF(); 2359 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2360 2361 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2362 if (!MFI->isModuleEntryFunction() && 2363 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2364 const Function &Fn = MF.getFunction(); 2365 DiagnosticInfoUnsupported BadLDSDecl( 2366 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2367 DS_Warning); 2368 Fn.getContext().diagnose(BadLDSDecl); 2369 2370 // We currently don't have a way to correctly allocate LDS objects that 2371 // aren't directly associated with a kernel. We do force inlining of 2372 // functions that use local objects. However, if these dead functions are 2373 // not eliminated, we don't want a compile time error. Just emit a warning 2374 // and a trap, since there should be no callable path here. 2375 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2376 B.buildUndef(DstReg); 2377 MI.eraseFromParent(); 2378 return true; 2379 } 2380 2381 // TODO: We could emit code to handle the initialization somewhere. 
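// The dynamic-LDS case special-cased further below corresponds to source
// declarations like the following (HIP shown; the variable name is
// illustrative):
//
//   extern __shared__ float DynShared[];  // alloc size is 0 at compile time
//
// Such globals are sized at dispatch time and placed after all statically
// allocated LDS, so their address comes from the static group segment size.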
2382 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2383 const SITargetLowering *TLI = ST.getTargetLowering(); 2384 if (!TLI->shouldUseLDSConstAddress(GV)) { 2385 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2386 return true; // Leave in place. 2387 } 2388 2389 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2390 Type *Ty = GV->getValueType(); 2391 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2392 // zero-sized type in other languages to declare the dynamic shared 2393 // memory whose size is not known at compile time. They will be 2394 // allocated by the runtime and placed directly after the statically 2395 // allocated ones. They all share the same offset. 2396 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2397 // Adjust alignment for that dynamic shared memory array. 2398 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2399 LLT S32 = LLT::scalar(32); 2400 auto Sz = 2401 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2402 B.buildIntToPtr(DstReg, Sz); 2403 MI.eraseFromParent(); 2404 return true; 2405 } 2406 } 2407 2408 B.buildConstant( 2409 DstReg, 2410 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 const Function &Fn = MF.getFunction(); 2416 DiagnosticInfoUnsupported BadInit( 2417 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2418 Fn.getContext().diagnose(BadInit); 2419 return true; 2420 } 2421 2422 const SITargetLowering *TLI = ST.getTargetLowering(); 2423 2424 if (TLI->shouldEmitFixup(GV)) { 2425 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2426 MI.eraseFromParent(); 2427 return true; 2428 } 2429 2430 if (TLI->shouldEmitPCReloc(GV)) { 2431 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2432 MI.eraseFromParent(); 2433 return true; 2434 } 2435 2436 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2437 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2438 2439 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2440 MachinePointerInfo::getGOT(MF), 2441 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2442 MachineMemOperand::MOInvariant, 2443 8 /*Size*/, Align(8)); 2444 2445 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2446 2447 if (Ty.getSizeInBits() == 32) { 2448 // Truncate if this is a 32-bit constant address.
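// Conceptually (a sketch; GOTAddr is assumed to address the 8-byte GOT slot
// created for GV above):
//
//   uint64_t FlatAddr = *(const uint64_t *)GOTAddr;  // absolute address of GV
//   Dst = (uint32_t)FlatAddr;                        // low half for 32-bit AS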
2449 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2450 B.buildExtract(DstReg, Load, 0); 2451 } else 2452 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2453 2454 MI.eraseFromParent(); 2455 return true; 2456 } 2457 2458 static LLT widenToNextPowerOf2(LLT Ty) { 2459 if (Ty.isVector()) 2460 return Ty.changeElementCount( 2461 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2462 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2463 } 2464 2465 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2466 MachineInstr &MI) const { 2467 MachineIRBuilder &B = Helper.MIRBuilder; 2468 MachineRegisterInfo &MRI = *B.getMRI(); 2469 GISelChangeObserver &Observer = Helper.Observer; 2470 2471 Register PtrReg = MI.getOperand(1).getReg(); 2472 LLT PtrTy = MRI.getType(PtrReg); 2473 unsigned AddrSpace = PtrTy.getAddressSpace(); 2474 2475 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2476 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2477 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2478 Observer.changingInstr(MI); 2479 MI.getOperand(1).setReg(Cast.getReg(0)); 2480 Observer.changedInstr(MI); 2481 return true; 2482 } 2483 2484 if (MI.getOpcode() != AMDGPU::G_LOAD) 2485 return false; 2486 2487 Register ValReg = MI.getOperand(0).getReg(); 2488 LLT ValTy = MRI.getType(ValReg); 2489 2490 MachineMemOperand *MMO = *MI.memoperands_begin(); 2491 const unsigned ValSize = ValTy.getSizeInBits(); 2492 const LLT MemTy = MMO->getMemoryType(); 2493 const Align MemAlign = MMO->getAlign(); 2494 const unsigned MemSize = MemTy.getSizeInBits(); 2495 const unsigned AlignInBits = 8 * MemAlign.value(); 2496 2497 // Widen non-power-of-2 loads to the alignment if needed 2498 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2499 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2500 2501 // This was already the correct extending load result type, so just adjust 2502 // the memory type. 2503 if (WideMemSize == ValSize) { 2504 MachineFunction &MF = B.getMF(); 2505 2506 MachineMemOperand *WideMMO = 2507 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2508 Observer.changingInstr(MI); 2509 MI.setMemRefs(MF, {WideMMO}); 2510 Observer.changedInstr(MI); 2511 return true; 2512 } 2513 2514 // Don't bother handling an edge case that should probably never be produced. 2515 if (ValSize > WideMemSize) 2516 return false; 2517 2518 LLT WideTy = widenToNextPowerOf2(ValTy); 2519 2520 Register WideLoad; 2521 if (!WideTy.isVector()) { 2522 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2523 B.buildTrunc(ValReg, WideLoad).getReg(0); 2524 } else { 2525 // Extract the subvector. 2526 2527 if (isRegisterType(ValTy)) { 2528 // If this is a case where G_EXTRACT is legal, use it. 2529 // (e.g. <3 x s32> -> <4 x s32>) 2530 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2531 B.buildExtract(ValReg, WideLoad, 0); 2532 } else { 2533 // For cases where the widened type isn't a nice register value, unmerge 2534 // from a widened register (e.g.
<3 x s16> -> <4 x s16>) 2535 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2536 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2537 B.setInsertPt(B.getMBB(), MI.getIterator()); 2538 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2539 } 2540 } 2541 2542 MI.eraseFromParent(); 2543 return true; 2544 } 2545 2546 return false; 2547 } 2548 2549 bool AMDGPULegalizerInfo::legalizeFMad( 2550 MachineInstr &MI, MachineRegisterInfo &MRI, 2551 MachineIRBuilder &B) const { 2552 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2553 assert(Ty.isScalar()); 2554 2555 MachineFunction &MF = B.getMF(); 2556 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2557 2558 // TODO: Always legal with future ftz flag. 2559 // FIXME: Do we need just output? 2560 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2561 return true; 2562 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2563 return true; 2564 2565 MachineIRBuilder HelperBuilder(MI); 2566 GISelObserverWrapper DummyObserver; 2567 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2568 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2569 } 2570 2571 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2572 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2573 Register DstReg = MI.getOperand(0).getReg(); 2574 Register PtrReg = MI.getOperand(1).getReg(); 2575 Register CmpVal = MI.getOperand(2).getReg(); 2576 Register NewVal = MI.getOperand(3).getReg(); 2577 2578 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2579 "this should not have been custom lowered"); 2580 2581 LLT ValTy = MRI.getType(CmpVal); 2582 LLT VecTy = LLT::fixed_vector(2, ValTy); 2583 2584 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2585 2586 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2587 .addDef(DstReg) 2588 .addUse(PtrReg) 2589 .addUse(PackedVal) 2590 .setMemRefs(MI.memoperands()); 2591 2592 MI.eraseFromParent(); 2593 return true; 2594 } 2595 2596 bool AMDGPULegalizerInfo::legalizeFlog( 2597 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2598 Register Dst = MI.getOperand(0).getReg(); 2599 Register Src = MI.getOperand(1).getReg(); 2600 LLT Ty = B.getMRI()->getType(Dst); 2601 unsigned Flags = MI.getFlags(); 2602 2603 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2604 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2605 2606 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2607 MI.eraseFromParent(); 2608 return true; 2609 } 2610 2611 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2612 MachineIRBuilder &B) const { 2613 Register Dst = MI.getOperand(0).getReg(); 2614 Register Src = MI.getOperand(1).getReg(); 2615 unsigned Flags = MI.getFlags(); 2616 LLT Ty = B.getMRI()->getType(Dst); 2617 2618 auto K = B.buildFConstant(Ty, numbers::log2e); 2619 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2620 B.buildFExp2(Dst, Mul, Flags); 2621 MI.eraseFromParent(); 2622 return true; 2623 } 2624 2625 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2626 MachineIRBuilder &B) const { 2627 Register Dst = MI.getOperand(0).getReg(); 2628 Register Src0 = MI.getOperand(1).getReg(); 2629 Register Src1 = MI.getOperand(2).getReg(); 2630 unsigned Flags = MI.getFlags(); 2631 LLT Ty = B.getMRI()->getType(Dst); 2632 const LLT S16 = LLT::scalar(16); 2633 const LLT S32 = LLT::scalar(32); 2634 2635 if (Ty == S32) { 2636 auto Log = B.buildFLog2(S32, Src0, Flags); 2637 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2638 .addUse(Log.getReg(0)) 2639 .addUse(Src1) 2640 .setMIFlags(Flags); 2641 B.buildFExp2(Dst, Mul, Flags); 2642 } else if (Ty == S16) { 2643 // There's no f16 fmul_legacy, so we need to convert for it. 2644 auto Log = B.buildFLog2(S16, Src0, Flags); 2645 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2646 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2647 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2648 .addUse(Ext0.getReg(0)) 2649 .addUse(Ext1.getReg(0)) 2650 .setMIFlags(Flags); 2651 2652 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2653 } else 2654 return false; 2655 2656 MI.eraseFromParent(); 2657 return true; 2658 } 2659 2660 // Find a source register, ignoring any possible source modifiers. 2661 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2662 Register ModSrc = OrigSrc; 2663 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2664 ModSrc = SrcFNeg->getOperand(1).getReg(); 2665 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2666 ModSrc = SrcFAbs->getOperand(1).getReg(); 2667 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2668 ModSrc = SrcFAbs->getOperand(1).getReg(); 2669 return ModSrc; 2670 } 2671 2672 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2673 MachineRegisterInfo &MRI, 2674 MachineIRBuilder &B) const { 2675 2676 const LLT S1 = LLT::scalar(1); 2677 const LLT S64 = LLT::scalar(64); 2678 Register Dst = MI.getOperand(0).getReg(); 2679 Register OrigSrc = MI.getOperand(1).getReg(); 2680 unsigned Flags = MI.getFlags(); 2681 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2682 "this should not have been custom lowered"); 2683 2684 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2685 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2686 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2687 // V_FRACT bug is: 2688 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2689 // 2690 // Convert floor(x) to (x - fract(x)) 2691 2692 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2693 .addUse(OrigSrc) 2694 .setMIFlags(Flags); 2695 2696 // Give source modifier matching some assistance before obscuring a foldable 2697 // pattern. 2698 2699 // TODO: We can avoid the neg on the fract? The input sign to fract 2700 // shouldn't matter? 2701 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2702 2703 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2704 2705 Register Min = MRI.createGenericVirtualRegister(S64); 2706 2707 // We don't need to concern ourselves with the snan handling difference, so 2708 // use the one which will directly select. 
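// Scalar model of the whole workaround (illustrative; "fract" stands for the
// hardware V_FRACT result and the constant is the largest double below 1.0):
//
//   double floorViaFract(double X) {
//     double F = fmin(fract(X), 0x1.fffffffffffffp-1);  // clamp buggy fract
//     if (isnan(X))
//       F = X;                                          // keep NaN behavior
//     return X - F;
//   }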
2709 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2710 if (MFI->getMode().IEEE) 2711 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2712 else 2713 B.buildFMinNum(Min, Fract, Const, Flags); 2714 2715 Register CorrectedFract = Min; 2716 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2717 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2718 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2719 } 2720 2721 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2722 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2723 2724 MI.eraseFromParent(); 2725 return true; 2726 } 2727 2728 // Turn an illegal packed v2s16 build vector into bit operations. 2729 // TODO: This should probably be a bitcast action in LegalizerHelper. 2730 bool AMDGPULegalizerInfo::legalizeBuildVector( 2731 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2732 Register Dst = MI.getOperand(0).getReg(); 2733 const LLT S32 = LLT::scalar(32); 2734 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2735 2736 Register Src0 = MI.getOperand(1).getReg(); 2737 Register Src1 = MI.getOperand(2).getReg(); 2738 assert(MRI.getType(Src0) == LLT::scalar(16)); 2739 2740 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2741 B.buildBitcast(Dst, Merge); 2742 2743 MI.eraseFromParent(); 2744 return true; 2745 } 2746 2747 // Check that this is a G_XOR x, -1 2748 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2749 if (MI.getOpcode() != TargetOpcode::G_XOR) 2750 return false; 2751 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2752 return ConstVal && *ConstVal == -1; 2753 } 2754 2755 // Return the use branch instruction, otherwise null if the usage is invalid. 2756 static MachineInstr * 2757 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2758 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2759 Register CondDef = MI.getOperand(0).getReg(); 2760 if (!MRI.hasOneNonDBGUse(CondDef)) 2761 return nullptr; 2762 2763 MachineBasicBlock *Parent = MI.getParent(); 2764 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2765 2766 if (isNot(MRI, *UseMI)) { 2767 Register NegatedCond = UseMI->getOperand(0).getReg(); 2768 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2769 return nullptr; 2770 2771 // We're deleting the def of this value, so we need to remove it. 2772 UseMI->eraseFromParent(); 2773 2774 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2775 Negated = true; 2776 } 2777 2778 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2779 return nullptr; 2780 2781 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2782 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2783 if (Next == Parent->end()) { 2784 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2785 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2786 return nullptr; 2787 UncondBrTarget = &*NextMBB; 2788 } else { 2789 if (Next->getOpcode() != AMDGPU::G_BR) 2790 return nullptr; 2791 Br = &*Next; 2792 UncondBrTarget = Br->getOperand(0).getMBB(); 2793 } 2794 2795 return UseMI; 2796 } 2797 2798 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2799 const ArgDescriptor *Arg, 2800 const TargetRegisterClass *ArgRC, 2801 LLT ArgTy) const { 2802 MCRegister SrcReg = Arg->getRegister(); 2803 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2804 assert(DstReg.isVirtual() && "Virtual register expected"); 2805 2806 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2807 ArgTy); 2808 if (Arg->isMasked()) { 2809 // TODO: Should we try to emit this once in the entry block? 2810 const LLT S32 = LLT::scalar(32); 2811 const unsigned Mask = Arg->getMask(); 2812 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2813 2814 Register AndMaskSrc = LiveIn; 2815 2816 if (Shift != 0) { 2817 auto ShiftAmt = B.buildConstant(S32, Shift); 2818 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2819 } 2820 2821 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2822 } else { 2823 B.buildCopy(DstReg, LiveIn); 2824 } 2825 2826 return true; 2827 } 2828 2829 bool AMDGPULegalizerInfo::loadInputValue( 2830 Register DstReg, MachineIRBuilder &B, 2831 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2832 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2833 const ArgDescriptor *Arg; 2834 const TargetRegisterClass *ArgRC; 2835 LLT ArgTy; 2836 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2837 2838 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2839 return false; // TODO: Handle these 2840 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2841 } 2842 2843 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2844 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2845 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2846 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2847 return false; 2848 2849 MI.eraseFromParent(); 2850 return true; 2851 } 2852 2853 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2854 MachineRegisterInfo &MRI, 2855 MachineIRBuilder &B) const { 2856 Register Dst = MI.getOperand(0).getReg(); 2857 LLT DstTy = MRI.getType(Dst); 2858 LLT S16 = LLT::scalar(16); 2859 LLT S32 = LLT::scalar(32); 2860 LLT S64 = LLT::scalar(64); 2861 2862 if (DstTy == S16) 2863 return legalizeFDIV16(MI, MRI, B); 2864 if (DstTy == S32) 2865 return legalizeFDIV32(MI, MRI, B); 2866 if (DstTy == S64) 2867 return legalizeFDIV64(MI, MRI, B); 2868 2869 return false; 2870 } 2871 2872 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2873 Register DstDivReg, 2874 Register DstRemReg, 2875 Register X, 2876 Register Y) const { 2877 const LLT S1 = LLT::scalar(1); 2878 const LLT S32 = LLT::scalar(32); 2879 2880 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2881 // algorithm used here. 2882 2883 // Initial estimate of inv(y). 2884 auto FloatY = B.buildUITOFP(S32, Y); 2885 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2886 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2887 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2888 auto Z = B.buildFPTOUI(S32, ScaledY); 2889 2890 // One round of UNR. 
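// Scalar equivalent of the refinement below (a sketch; Z is the initial
// reciprocal estimate computed above):
//
//   uint32_t NegYZ = (0u - Y) * Z;                     // low 32 bits of -Y*Z
//   Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);      // Z += umulh(Z, NegYZ)
//   uint32_t Q = (uint32_t)(((uint64_t)X * Z) >> 32);  // quotient estimate
//   uint32_t R = X - Q * Y;
//   if (R >= Y) { ++Q; R -= Y; }                       // first refinement
//   if (R >= Y) { ++Q; R -= Y; }                       // second refinement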
2891 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2892 auto NegYZ = B.buildMul(S32, NegY, Z); 2893 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2894 2895 // Quotient/remainder estimate. 2896 auto Q = B.buildUMulH(S32, X, Z); 2897 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2898 2899 // First quotient/remainder refinement. 2900 auto One = B.buildConstant(S32, 1); 2901 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2902 if (DstDivReg) 2903 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2904 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2905 2906 // Second quotient/remainder refinement. 2907 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2908 if (DstDivReg) 2909 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2910 2911 if (DstRemReg) 2912 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2913 } 2914 2915 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2916 // 2917 // Return lo, hi of result 2918 // 2919 // %cvt.lo = G_UITOFP Val.lo 2920 // %cvt.hi = G_UITOFP Val.hi 2921 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2922 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2923 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2924 // %mul2 = G_FMUL %mul1, 2**(-32) 2925 // %trunc = G_INTRINSIC_TRUNC %mul2 2926 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2927 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2928 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2929 Register Val) { 2930 const LLT S32 = LLT::scalar(32); 2931 auto Unmerge = B.buildUnmerge(S32, Val); 2932 2933 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2934 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2935 2936 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2937 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2938 2939 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2940 auto Mul1 = 2941 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2942 2943 // 2**(-32) 2944 auto Mul2 = 2945 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2946 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2947 2948 // -(2**32) 2949 auto Mad2 = B.buildFMAD(S32, Trunc, 2950 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2951 2952 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2953 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2954 2955 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2956 } 2957 2958 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2959 Register DstDivReg, 2960 Register DstRemReg, 2961 Register Numer, 2962 Register Denom) const { 2963 const LLT S32 = LLT::scalar(32); 2964 const LLT S64 = LLT::scalar(64); 2965 const LLT S1 = LLT::scalar(1); 2966 Register RcpLo, RcpHi; 2967 2968 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2969 2970 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2971 2972 auto Zero64 = B.buildConstant(S64, 0); 2973 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2974 2975 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2976 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2977 2978 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2979 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2980 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2981 2982 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2983 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2984 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2985 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2986 2987 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2988
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2989 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2990 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2991 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2992 2993 auto Zero32 = B.buildConstant(S32, 0); 2994 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2995 auto Add2_HiC = 2996 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2997 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2998 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2999 3000 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3001 Register NumerLo = UnmergeNumer.getReg(0); 3002 Register NumerHi = UnmergeNumer.getReg(1); 3003 3004 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3005 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3006 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3007 Register Mul3_Lo = UnmergeMul3.getReg(0); 3008 Register Mul3_Hi = UnmergeMul3.getReg(1); 3009 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3010 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3011 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3012 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3013 3014 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3015 Register DenomLo = UnmergeDenom.getReg(0); 3016 Register DenomHi = UnmergeDenom.getReg(1); 3017 3018 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3019 auto C1 = B.buildSExt(S32, CmpHi); 3020 3021 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3022 auto C2 = B.buildSExt(S32, CmpLo); 3023 3024 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3025 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3026 3027 // TODO: Here and below portions of the code can be enclosed into if/endif. 3028 // Currently control flow is unconditional and we have 4 selects after 3029 // potential endif to substitute PHIs. 3030 3031 // if C3 != 0 ... 
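// (C3, and C6 below, are -1/0 masks encoding a 64-bit unsigned comparison
// assembled from the 32-bit halves, roughly:
//   Mask = (SubHi == DenomHi) ? (SubLo >= DenomLo ? -1 : 0)
//                             : (SubHi >= DenomHi ? -1 : 0);
// The selects at the end consume these masks instead of real control flow.)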
3032 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3033 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3034 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3035 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3036 3037 auto One64 = B.buildConstant(S64, 1); 3038 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3039 3040 auto C4 = 3041 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3042 auto C5 = 3043 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3044 auto C6 = B.buildSelect( 3045 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3046 3047 // if (C6 != 0) 3048 auto Add4 = B.buildAdd(S64, Add3, One64); 3049 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3050 3051 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3052 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3053 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3054 3055 // endif C6 3056 // endif C3 3057 3058 if (DstDivReg) { 3059 auto Sel1 = B.buildSelect( 3060 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3061 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3062 Sel1, MulHi3); 3063 } 3064 3065 if (DstRemReg) { 3066 auto Sel2 = B.buildSelect( 3067 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3068 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3069 Sel2, Sub1); 3070 } 3071 } 3072 3073 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3074 MachineRegisterInfo &MRI, 3075 MachineIRBuilder &B) const { 3076 Register DstDivReg, DstRemReg; 3077 switch (MI.getOpcode()) { 3078 default: 3079 llvm_unreachable("Unexpected opcode!"); 3080 case AMDGPU::G_UDIV: { 3081 DstDivReg = MI.getOperand(0).getReg(); 3082 break; 3083 } 3084 case AMDGPU::G_UREM: { 3085 DstRemReg = MI.getOperand(0).getReg(); 3086 break; 3087 } 3088 case AMDGPU::G_UDIVREM: { 3089 DstDivReg = MI.getOperand(0).getReg(); 3090 DstRemReg = MI.getOperand(1).getReg(); 3091 break; 3092 } 3093 } 3094 3095 const LLT S64 = LLT::scalar(64); 3096 const LLT S32 = LLT::scalar(32); 3097 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3098 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3099 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3100 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3101 3102 if (Ty == S32) 3103 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3104 else if (Ty == S64) 3105 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3106 else 3107 return false; 3108 3109 MI.eraseFromParent(); 3110 return true; 3111 } 3112 3113 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3114 MachineRegisterInfo &MRI, 3115 MachineIRBuilder &B) const { 3116 const LLT S64 = LLT::scalar(64); 3117 const LLT S32 = LLT::scalar(32); 3118 3119 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3120 if (Ty != S32 && Ty != S64) 3121 return false; 3122 3123 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3124 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3125 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3126 3127 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3128 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3129 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3130 3131 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3132 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3133 3134 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3135 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3136 3137 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3138 switch (MI.getOpcode()) { 3139 default: 3140 llvm_unreachable("Unexpected opcode!"); 3141 case AMDGPU::G_SDIV: { 3142 DstDivReg = MI.getOperand(0).getReg(); 3143 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3144 break; 3145 } 3146 case AMDGPU::G_SREM: { 3147 DstRemReg = MI.getOperand(0).getReg(); 3148 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3149 break; 3150 } 3151 case AMDGPU::G_SDIVREM: { 3152 DstDivReg = MI.getOperand(0).getReg(); 3153 DstRemReg = MI.getOperand(1).getReg(); 3154 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3155 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3156 break; 3157 } 3158 } 3159 3160 if (Ty == S32) 3161 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3162 else 3163 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3164 3165 if (DstDivReg) { 3166 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3167 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3168 B.buildSub(DstDivReg, SignXor, Sign); 3169 } 3170 3171 if (DstRemReg) { 3172 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3173 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3174 B.buildSub(DstRemReg, SignXor, Sign); 3175 } 3176 3177 MI.eraseFromParent(); 3178 return true; 3179 } 3180 3181 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3182 MachineRegisterInfo &MRI, 3183 MachineIRBuilder &B) const { 3184 Register Res = MI.getOperand(0).getReg(); 3185 Register LHS = MI.getOperand(1).getReg(); 3186 Register RHS = MI.getOperand(2).getReg(); 3187 uint16_t Flags = MI.getFlags(); 3188 LLT ResTy = MRI.getType(Res); 3189 3190 const MachineFunction &MF = B.getMF(); 3191 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3192 MI.getFlag(MachineInstr::FmAfn); 3193 3194 if (!AllowInaccurateRcp) 3195 return false; 3196 3197 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3198 // 1 / x -> RCP(x) 3199 if (CLHS->isExactlyValue(1.0)) { 3200 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3201 .addUse(RHS) 3202 .setMIFlags(Flags); 3203 3204 MI.eraseFromParent(); 3205 return true; 3206 } 3207 3208 // -1 / x -> RCP( FNEG(x) ) 3209 if (CLHS->isExactlyValue(-1.0)) { 3210 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3211 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3212 .addUse(FNeg.getReg(0)) 3213 .setMIFlags(Flags); 3214 3215 MI.eraseFromParent(); 3216 return true; 3217 } 3218 } 3219 3220 // x / y -> x * (1.0 / y) 3221 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3222 .addUse(RHS) 3223 .setMIFlags(Flags); 3224 B.buildFMul(Res, LHS, RCP, Flags); 3225 3226 MI.eraseFromParent(); 3227 return true; 3228 } 3229 3230 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3231 MachineRegisterInfo &MRI, 3232 MachineIRBuilder &B) const { 3233 Register Res = MI.getOperand(0).getReg(); 3234 Register X = MI.getOperand(1).getReg(); 3235 Register Y = MI.getOperand(2).getReg(); 3236 uint16_t Flags = MI.getFlags(); 3237 LLT ResTy = MRI.getType(Res); 3238 3239 const MachineFunction &MF = B.getMF(); 3240 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3241 MI.getFlag(MachineInstr::FmAfn); 3242 3243 if (!AllowInaccurateRcp) 3244 return false; 3245 3246 auto NegY = B.buildFNeg(ResTy, Y); 3247 auto One = B.buildFConstant(ResTy, 1.0); 3248 3249 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3250 .addUse(Y) 3251 .setMIFlags(Flags); 3252 3253 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3254 R = B.buildFMA(ResTy, Tmp0, R, R); 3255 3256 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3257 R = B.buildFMA(ResTy, Tmp1, R, R); 3258 3259 auto Ret = B.buildFMul(ResTy, X, R); 3260 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3261 3262 B.buildFMA(Res, Tmp2, R, Ret); 3263 MI.eraseFromParent(); 3264 return true; 3265 } 3266 3267 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3268 MachineRegisterInfo &MRI, 3269 MachineIRBuilder &B) const { 3270 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3271 return true; 3272 3273 Register Res = MI.getOperand(0).getReg(); 3274 Register LHS = MI.getOperand(1).getReg(); 3275 Register RHS = MI.getOperand(2).getReg(); 3276 3277 uint16_t Flags = MI.getFlags(); 3278 3279 LLT S16 = LLT::scalar(16); 3280 LLT S32 = LLT::scalar(32); 3281 3282 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3283 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3284 3285 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3286 .addUse(RHSExt.getReg(0)) 3287 .setMIFlags(Flags); 3288 3289 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3290 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3291 3292 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3293 .addUse(RDst.getReg(0)) 3294 .addUse(RHS) 3295 .addUse(LHS) 3296 .setMIFlags(Flags); 3297 3298 MI.eraseFromParent(); 3299 return true; 3300 } 3301 3302 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3303 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3304 static void toggleSPDenormMode(bool Enable, 3305 MachineIRBuilder &B, 3306 const GCNSubtarget &ST, 3307 AMDGPU::SIModeRegisterDefaults Mode) { 3308 // Set SP denorm mode to this value. 3309 unsigned SPDenormMode = 3310 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3311 3312 if (ST.hasDenormModeInst()) { 3313 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3314 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3315 3316 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3317 B.buildInstr(AMDGPU::S_DENORM_MODE) 3318 .addImm(NewDenormModeValue); 3319 3320 } else { 3321 // Select FP32 bit field in mode register. 
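// The bit-field immediate built below should correspond to
// hwreg(HW_REG_MODE, 4, 2), i.e. offset 4, width 2: the FP32 denorm bits
// MODE[5:4]. Roughly the equivalent assembly (illustrative):
//
//   s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), <SPDenormMode>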
3322 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3323 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3324 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3325 3326 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3327 .addImm(SPDenormMode) 3328 .addImm(SPDenormModeBitField); 3329 } 3330 } 3331 3332 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3333 MachineRegisterInfo &MRI, 3334 MachineIRBuilder &B) const { 3335 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3336 return true; 3337 3338 Register Res = MI.getOperand(0).getReg(); 3339 Register LHS = MI.getOperand(1).getReg(); 3340 Register RHS = MI.getOperand(2).getReg(); 3341 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3342 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3343 3344 uint16_t Flags = MI.getFlags(); 3345 3346 LLT S32 = LLT::scalar(32); 3347 LLT S1 = LLT::scalar(1); 3348 3349 auto One = B.buildFConstant(S32, 1.0f); 3350 3351 auto DenominatorScaled = 3352 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3353 .addUse(LHS) 3354 .addUse(RHS) 3355 .addImm(0) 3356 .setMIFlags(Flags); 3357 auto NumeratorScaled = 3358 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3359 .addUse(LHS) 3360 .addUse(RHS) 3361 .addImm(1) 3362 .setMIFlags(Flags); 3363 3364 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3365 .addUse(DenominatorScaled.getReg(0)) 3366 .setMIFlags(Flags); 3367 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3368 3369 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3370 // aren't modeled as reading it. 3371 if (!Mode.allFP32Denormals()) 3372 toggleSPDenormMode(true, B, ST, Mode); 3373 3374 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3375 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3376 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3377 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3378 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3379 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3380 3381 if (!Mode.allFP32Denormals()) 3382 toggleSPDenormMode(false, B, ST, Mode); 3383 3384 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3385 .addUse(Fma4.getReg(0)) 3386 .addUse(Fma1.getReg(0)) 3387 .addUse(Fma3.getReg(0)) 3388 .addUse(NumeratorScaled.getReg(1)) 3389 .setMIFlags(Flags); 3390 3391 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3392 .addUse(Fmas.getReg(0)) 3393 .addUse(RHS) 3394 .addUse(LHS) 3395 .setMIFlags(Flags); 3396 3397 MI.eraseFromParent(); 3398 return true; 3399 } 3400 3401 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3402 MachineRegisterInfo &MRI, 3403 MachineIRBuilder &B) const { 3404 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3405 return true; 3406 3407 Register Res = MI.getOperand(0).getReg(); 3408 Register LHS = MI.getOperand(1).getReg(); 3409 Register RHS = MI.getOperand(2).getReg(); 3410 3411 uint16_t Flags = MI.getFlags(); 3412 3413 LLT S64 = LLT::scalar(64); 3414 LLT S1 = LLT::scalar(1); 3415 3416 auto One = B.buildFConstant(S64, 1.0); 3417 3418 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3419 .addUse(LHS) 3420 .addUse(RHS) 3421 .addImm(0) 3422 .setMIFlags(Flags); 3423 3424 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3425 3426 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3427 .addUse(DivScale0.getReg(0)) 3428 .setMIFlags(Flags); 3429 3430 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3431 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3432 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3433 3434 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3435 .addUse(LHS) 3436 .addUse(RHS) 3437 .addImm(1) 3438 .setMIFlags(Flags); 3439 3440 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3441 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3442 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3443 3444 Register Scale; 3445 if (!ST.hasUsableDivScaleConditionOutput()) { 3446 // Workaround a hardware bug on SI where the condition output from div_scale 3447 // is not usable. 3448 3449 LLT S32 = LLT::scalar(32); 3450 3451 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3452 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3453 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3454 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3455 3456 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3457 Scale1Unmerge.getReg(1)); 3458 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3459 Scale0Unmerge.getReg(1)); 3460 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3461 } else { 3462 Scale = DivScale1.getReg(1); 3463 } 3464 3465 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3466 .addUse(Fma4.getReg(0)) 3467 .addUse(Fma3.getReg(0)) 3468 .addUse(Mul.getReg(0)) 3469 .addUse(Scale) 3470 .setMIFlags(Flags); 3471 3472 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3473 .addUse(Fmas.getReg(0)) 3474 .addUse(RHS) 3475 .addUse(LHS) 3476 .setMIFlags(Flags); 3477 3478 MI.eraseFromParent(); 3479 return true; 3480 } 3481 3482 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3483 MachineRegisterInfo &MRI, 3484 MachineIRBuilder &B) const { 3485 Register Res = MI.getOperand(0).getReg(); 3486 Register LHS = MI.getOperand(2).getReg(); 3487 Register RHS = MI.getOperand(3).getReg(); 3488 uint16_t Flags = MI.getFlags(); 3489 3490 LLT S32 = LLT::scalar(32); 3491 LLT S1 = LLT::scalar(1); 3492 3493 auto Abs = B.buildFAbs(S32, RHS, Flags); 3494 const APFloat C0Val(1.0f); 3495 3496 auto C0 = B.buildConstant(S32, 0x6f800000); 3497 auto C1 = B.buildConstant(S32, 0x2f800000); 3498 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3499 3500 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3501 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3502 3503 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3504 3505 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3506 .addUse(Mul0.getReg(0)) 3507 .setMIFlags(Flags); 3508 3509 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3510 3511 B.buildFMul(Res, Sel, Mul1, Flags); 3512 3513 MI.eraseFromParent(); 3514 return true; 3515 } 3516 3517 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3518 // FIXME: Why do we handle this one but not other removed instructions? 3519 // 3520 // Reciprocal square root. The clamp prevents infinite results, clamping 3521 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3522 // +-max_float. 
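// In other words, the lowering below is roughly
//   fmax(fmin(rsq(x), +max_float), -max_float)
// using the IEEE or non-IEEE min/max variants depending on the function's
// floating-point mode.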
3523 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3524 MachineRegisterInfo &MRI, 3525 MachineIRBuilder &B) const { 3526 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3527 return true; 3528 3529 Register Dst = MI.getOperand(0).getReg(); 3530 Register Src = MI.getOperand(2).getReg(); 3531 auto Flags = MI.getFlags(); 3532 3533 LLT Ty = MRI.getType(Dst); 3534 3535 const fltSemantics *FltSemantics; 3536 if (Ty == LLT::scalar(32)) 3537 FltSemantics = &APFloat::IEEEsingle(); 3538 else if (Ty == LLT::scalar(64)) 3539 FltSemantics = &APFloat::IEEEdouble(); 3540 else 3541 return false; 3542 3543 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3544 .addUse(Src) 3545 .setMIFlags(Flags); 3546 3547 // We don't need to concern ourselves with the snan handling difference, since 3548 // the rsq quieted (or not) so use the one which will directly select. 3549 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3550 const bool UseIEEE = MFI->getMode().IEEE; 3551 3552 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3553 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3554 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3555 3556 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3557 3558 if (UseIEEE) 3559 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3560 else 3561 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3562 MI.eraseFromParent(); 3563 return true; 3564 } 3565 3566 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3567 switch (IID) { 3568 case Intrinsic::amdgcn_ds_fadd: 3569 return AMDGPU::G_ATOMICRMW_FADD; 3570 case Intrinsic::amdgcn_ds_fmin: 3571 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3572 case Intrinsic::amdgcn_ds_fmax: 3573 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3574 default: 3575 llvm_unreachable("not a DS FP intrinsic"); 3576 } 3577 } 3578 3579 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3580 MachineInstr &MI, 3581 Intrinsic::ID IID) const { 3582 GISelChangeObserver &Observer = Helper.Observer; 3583 Observer.changingInstr(MI); 3584 3585 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3586 3587 // The remaining operands were used to set fields in the MemOperand on 3588 // construction. 3589 for (int I = 6; I > 3; --I) 3590 MI.RemoveOperand(I); 3591 3592 MI.RemoveOperand(1); // Remove the intrinsic ID. 
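  // At this point the instruction has the shape of a generic FP atomicrmw:
  // one result def, the pointer operand, and the data operand, with the
  // original memory operand still attached.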
3593 Observer.changedInstr(MI); 3594 return true; 3595 } 3596 3597 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3598 MachineRegisterInfo &MRI, 3599 MachineIRBuilder &B) const { 3600 uint64_t Offset = 3601 ST.getTargetLowering()->getImplicitParameterOffset( 3602 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3603 LLT DstTy = MRI.getType(DstReg); 3604 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3605 3606 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3607 if (!loadInputValue(KernargPtrReg, B, 3608 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3609 return false; 3610 3611 // FIXME: This should be nuw 3612 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3613 return true; 3614 } 3615 3616 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3617 MachineRegisterInfo &MRI, 3618 MachineIRBuilder &B) const { 3619 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3620 if (!MFI->isEntryFunction()) { 3621 return legalizePreloadedArgIntrin(MI, MRI, B, 3622 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3623 } 3624 3625 Register DstReg = MI.getOperand(0).getReg(); 3626 if (!getImplicitArgPtr(DstReg, MRI, B)) 3627 return false; 3628 3629 MI.eraseFromParent(); 3630 return true; 3631 } 3632 3633 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3634 MachineRegisterInfo &MRI, 3635 MachineIRBuilder &B, 3636 unsigned AddrSpace) const { 3637 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3638 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3639 Register Hi32 = Unmerge.getReg(1); 3640 3641 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3642 MI.eraseFromParent(); 3643 return true; 3644 } 3645 3646 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3647 // offset (the offset that is included in bounds checking and swizzling, to be 3648 // split between the instruction's voffset and immoffset fields) and soffset 3649 // (the offset that is excluded from bounds checking and swizzling, to go in 3650 // the instruction's soffset field). This function takes the first kind of 3651 // offset and figures out how to split it between voffset and immoffset. 3652 std::tuple<Register, unsigned, unsigned> 3653 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3654 Register OrigOffset) const { 3655 const unsigned MaxImm = 4095; 3656 Register BaseReg; 3657 unsigned TotalConstOffset; 3658 const LLT S32 = LLT::scalar(32); 3659 MachineRegisterInfo &MRI = *B.getMRI(); 3660 3661 std::tie(BaseReg, TotalConstOffset) = 3662 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3663 3664 unsigned ImmOffset = TotalConstOffset; 3665 3666 // If BaseReg is a pointer, convert it to int. 3667 if (MRI.getType(BaseReg).isPointer()) 3668 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3669 3670 // If the immediate value is too big for the immoffset field, put the value 3671 // and -4096 into the immoffset field so that the value that is copied/added 3672 // for the voffset field is a multiple of 4096, and it stands more chance 3673 // of being CSEd with the copy/add for another similar load/store. 3674 // However, do not do that rounding down to a multiple of 4096 if that is a 3675 // negative number, as it appears to be illegal to have a negative offset 3676 // in the vgpr, even if adding the immediate offset makes it positive. 
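  // For illustration (hypothetical values): a total constant offset of 5000
  // becomes Overflow = 4096 (folded into the voffset add below) and
  // ImmOffset = 904 (the instruction immediate). If the rounded-down overflow
  // would be negative as a signed 32-bit value, the entire constant is moved
  // to the voffset instead and ImmOffset is cleared.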
3677 unsigned Overflow = ImmOffset & ~MaxImm; 3678 ImmOffset -= Overflow; 3679 if ((int32_t)Overflow < 0) { 3680 Overflow += ImmOffset; 3681 ImmOffset = 0; 3682 } 3683 3684 if (Overflow != 0) { 3685 if (!BaseReg) { 3686 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3687 } else { 3688 auto OverflowVal = B.buildConstant(S32, Overflow); 3689 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3690 } 3691 } 3692 3693 if (!BaseReg) 3694 BaseReg = B.buildConstant(S32, 0).getReg(0); 3695 3696 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3697 } 3698 3699 /// Handle register layout difference for f16 images for some subtargets. 3700 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3701 MachineRegisterInfo &MRI, 3702 Register Reg, 3703 bool ImageStore) const { 3704 const LLT S16 = LLT::scalar(16); 3705 const LLT S32 = LLT::scalar(32); 3706 LLT StoreVT = MRI.getType(Reg); 3707 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3708 3709 if (ST.hasUnpackedD16VMem()) { 3710 auto Unmerge = B.buildUnmerge(S16, Reg); 3711 3712 SmallVector<Register, 4> WideRegs; 3713 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3714 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3715 3716 int NumElts = StoreVT.getNumElements(); 3717 3718 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3719 .getReg(0); 3720 } 3721 3722 if (ImageStore && ST.hasImageStoreD16Bug()) { 3723 if (StoreVT.getNumElements() == 2) { 3724 SmallVector<Register, 4> PackedRegs; 3725 Reg = B.buildBitcast(S32, Reg).getReg(0); 3726 PackedRegs.push_back(Reg); 3727 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3728 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3729 .getReg(0); 3730 } 3731 3732 if (StoreVT.getNumElements() == 3) { 3733 SmallVector<Register, 4> PackedRegs; 3734 auto Unmerge = B.buildUnmerge(S16, Reg); 3735 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3736 PackedRegs.push_back(Unmerge.getReg(I)); 3737 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3738 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3739 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3740 } 3741 3742 if (StoreVT.getNumElements() == 4) { 3743 SmallVector<Register, 4> PackedRegs; 3744 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3745 auto Unmerge = B.buildUnmerge(S32, Reg); 3746 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3747 PackedRegs.push_back(Unmerge.getReg(I)); 3748 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3749 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3750 .getReg(0); 3751 } 3752 3753 llvm_unreachable("invalid data type"); 3754 } 3755 3756 return Reg; 3757 } 3758 3759 Register AMDGPULegalizerInfo::fixStoreSourceType( 3760 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3761 MachineRegisterInfo *MRI = B.getMRI(); 3762 LLT Ty = MRI->getType(VData); 3763 3764 const LLT S16 = LLT::scalar(16); 3765 3766 // Fixup illegal register types for i8 stores. 
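  // For example, an s8 store source is anyext'd to s32 here; the byte/short
  // buffer store opcodes selected later only write the low bits, so the high
  // bits are don't-care.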
3767 if (Ty == LLT::scalar(8) || Ty == S16) { 3768 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3769 return AnyExt; 3770 } 3771 3772 if (Ty.isVector()) { 3773 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3774 if (IsFormat) 3775 return handleD16VData(B, *MRI, VData); 3776 } 3777 } 3778 3779 return VData; 3780 } 3781 3782 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3783 MachineRegisterInfo &MRI, 3784 MachineIRBuilder &B, 3785 bool IsTyped, 3786 bool IsFormat) const { 3787 Register VData = MI.getOperand(1).getReg(); 3788 LLT Ty = MRI.getType(VData); 3789 LLT EltTy = Ty.getScalarType(); 3790 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3791 const LLT S32 = LLT::scalar(32); 3792 3793 VData = fixStoreSourceType(B, VData, IsFormat); 3794 Register RSrc = MI.getOperand(2).getReg(); 3795 3796 MachineMemOperand *MMO = *MI.memoperands_begin(); 3797 const int MemSize = MMO->getSize(); 3798 3799 unsigned ImmOffset; 3800 unsigned TotalOffset; 3801 3802 // The typed intrinsics add an immediate after the registers. 3803 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3804 3805 // The struct intrinsic variants add one additional operand over raw. 3806 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3807 Register VIndex; 3808 int OpOffset = 0; 3809 if (HasVIndex) { 3810 VIndex = MI.getOperand(3).getReg(); 3811 OpOffset = 1; 3812 } 3813 3814 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3815 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3816 3817 unsigned Format = 0; 3818 if (IsTyped) { 3819 Format = MI.getOperand(5 + OpOffset).getImm(); 3820 ++OpOffset; 3821 } 3822 3823 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3824 3825 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3826 if (TotalOffset != 0) 3827 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3828 3829 unsigned Opc; 3830 if (IsTyped) { 3831 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3832 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3833 } else if (IsFormat) { 3834 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3835 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3836 } else { 3837 switch (MemSize) { 3838 case 1: 3839 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3840 break; 3841 case 2: 3842 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3843 break; 3844 default: 3845 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3846 break; 3847 } 3848 } 3849 3850 if (!VIndex) 3851 VIndex = B.buildConstant(S32, 0).getReg(0); 3852 3853 auto MIB = B.buildInstr(Opc) 3854 .addUse(VData) // vdata 3855 .addUse(RSrc) // rsrc 3856 .addUse(VIndex) // vindex 3857 .addUse(VOffset) // voffset 3858 .addUse(SOffset) // soffset 3859 .addImm(ImmOffset); // offset(imm) 3860 3861 if (IsTyped) 3862 MIB.addImm(Format); 3863 3864 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3865 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3866 .addMemOperand(MMO); 3867 3868 MI.eraseFromParent(); 3869 return true; 3870 } 3871 3872 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3873 MachineRegisterInfo &MRI, 3874 MachineIRBuilder &B, 3875 bool IsFormat, 3876 bool IsTyped) const { 3877 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
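  // Operand layout assumed below (raw form): dst, intrinsic id, rsrc, voffset,
  // soffset, [format for typed], aux. The struct variants insert an extra
  // vindex operand after rsrc, which is what the NumVIndexOps check detects.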
3878 MachineMemOperand *MMO = *MI.memoperands_begin(); 3879 const int MemSize = MMO->getSize(); 3880 const LLT S32 = LLT::scalar(32); 3881 3882 Register Dst = MI.getOperand(0).getReg(); 3883 Register RSrc = MI.getOperand(2).getReg(); 3884 3885 // The typed intrinsics add an immediate after the registers. 3886 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3887 3888 // The struct intrinsic variants add one additional operand over raw. 3889 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3890 Register VIndex; 3891 int OpOffset = 0; 3892 if (HasVIndex) { 3893 VIndex = MI.getOperand(3).getReg(); 3894 OpOffset = 1; 3895 } 3896 3897 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3898 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3899 3900 unsigned Format = 0; 3901 if (IsTyped) { 3902 Format = MI.getOperand(5 + OpOffset).getImm(); 3903 ++OpOffset; 3904 } 3905 3906 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3907 unsigned ImmOffset; 3908 unsigned TotalOffset; 3909 3910 LLT Ty = MRI.getType(Dst); 3911 LLT EltTy = Ty.getScalarType(); 3912 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3913 const bool Unpacked = ST.hasUnpackedD16VMem(); 3914 3915 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3916 if (TotalOffset != 0) 3917 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3918 3919 unsigned Opc; 3920 3921 if (IsTyped) { 3922 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3923 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3924 } else if (IsFormat) { 3925 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3926 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3927 } else { 3928 switch (MemSize) { 3929 case 1: 3930 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3931 break; 3932 case 2: 3933 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3934 break; 3935 default: 3936 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3937 break; 3938 } 3939 } 3940 3941 Register LoadDstReg; 3942 3943 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3944 LLT UnpackedTy = Ty.changeElementSize(32); 3945 3946 if (IsExtLoad) 3947 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3948 else if (Unpacked && IsD16 && Ty.isVector()) 3949 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3950 else 3951 LoadDstReg = Dst; 3952 3953 if (!VIndex) 3954 VIndex = B.buildConstant(S32, 0).getReg(0); 3955 3956 auto MIB = B.buildInstr(Opc) 3957 .addDef(LoadDstReg) // vdata 3958 .addUse(RSrc) // rsrc 3959 .addUse(VIndex) // vindex 3960 .addUse(VOffset) // voffset 3961 .addUse(SOffset) // soffset 3962 .addImm(ImmOffset); // offset(imm) 3963 3964 if (IsTyped) 3965 MIB.addImm(Format); 3966 3967 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3968 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3969 .addMemOperand(MMO); 3970 3971 if (LoadDstReg != Dst) { 3972 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3973 3974 // Widen result for extending loads was widened. 
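    // That is, the load above was emitted with a widened (32-bit or unpacked)
    // result type; truncate or repack it back into the original result type.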
3975 if (IsExtLoad) 3976 B.buildTrunc(Dst, LoadDstReg); 3977 else { 3978 // Repack to original 16-bit vector result 3979 // FIXME: G_TRUNC should work, but legalization currently fails 3980 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3981 SmallVector<Register, 4> Repack; 3982 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3983 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3984 B.buildMerge(Dst, Repack); 3985 } 3986 } 3987 3988 MI.eraseFromParent(); 3989 return true; 3990 } 3991 3992 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3993 MachineIRBuilder &B, 3994 bool IsInc) const { 3995 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3996 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3997 B.buildInstr(Opc) 3998 .addDef(MI.getOperand(0).getReg()) 3999 .addUse(MI.getOperand(2).getReg()) 4000 .addUse(MI.getOperand(3).getReg()) 4001 .cloneMemRefs(MI); 4002 MI.eraseFromParent(); 4003 return true; 4004 } 4005 4006 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4007 switch (IntrID) { 4008 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4009 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4010 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4011 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4012 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4013 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4014 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4015 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4016 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4017 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4018 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4019 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4020 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4021 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4022 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4023 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4024 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4025 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4026 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4027 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4028 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4029 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4030 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4031 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4032 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4033 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4034 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4035 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4036 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4037 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4038 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4039 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4040 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4041 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4042 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4043 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4044 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4045 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4046 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4047 case Intrinsic::amdgcn_buffer_atomic_fadd: 4048 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4049 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4050 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4051 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4052 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4053 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4054 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4055 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4056 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4057 default: 4058 llvm_unreachable("unhandled atomic opcode"); 4059 } 4060 } 4061 4062 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4063 MachineIRBuilder &B, 4064 Intrinsic::ID IID) const { 4065 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4066 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4067 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4068 4069 Register Dst; 4070 4071 int OpOffset = 0; 4072 if (HasReturn) { 4073 // A few FP atomics do not support return values. 4074 Dst = MI.getOperand(0).getReg(); 4075 } else { 4076 OpOffset = -1; 4077 } 4078 4079 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4080 Register CmpVal; 4081 4082 if (IsCmpSwap) { 4083 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4084 ++OpOffset; 4085 } 4086 4087 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4088 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4089 4090 // The struct intrinsic variants add one additional operand over raw. 4091 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4092 Register VIndex; 4093 if (HasVIndex) { 4094 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4095 ++OpOffset; 4096 } 4097 4098 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4099 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4100 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4101 4102 MachineMemOperand *MMO = *MI.memoperands_begin(); 4103 4104 unsigned ImmOffset; 4105 unsigned TotalOffset; 4106 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4107 if (TotalOffset != 0) 4108 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4109 4110 if (!VIndex) 4111 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4112 4113 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4114 4115 if (HasReturn) 4116 MIB.addDef(Dst); 4117 4118 MIB.addUse(VData); // vdata 4119 4120 if (IsCmpSwap) 4121 MIB.addReg(CmpVal); 4122 4123 MIB.addUse(RSrc) // rsrc 4124 .addUse(VIndex) // vindex 4125 .addUse(VOffset) // voffset 4126 .addUse(SOffset) // soffset 4127 .addImm(ImmOffset) // offset(imm) 4128 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4129 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4130 .addMemOperand(MMO); 4131 4132 MI.eraseFromParent(); 4133 return true; 4134 } 4135 4136 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4137 /// vector with s16 typed elements. 4138 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4139 SmallVectorImpl<Register> &PackedAddrs, 4140 unsigned ArgOffset, 4141 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4142 bool IsA16, bool IsG16) { 4143 const LLT S16 = LLT::scalar(16); 4144 const LLT V2S16 = LLT::fixed_vector(2, 16); 4145 auto EndIdx = Intr->VAddrEnd; 4146 4147 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4148 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4149 if (!SrcOp.isReg()) 4150 continue; // _L to _LZ may have eliminated this. 
    Register AddrReg = SrcOp.getReg();

    if (I < Intr->GradientStart) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
               (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
4237 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4238 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4239 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4240 4241 const unsigned NumDefs = MI.getNumExplicitDefs(); 4242 const unsigned ArgOffset = NumDefs + 1; 4243 bool IsTFE = NumDefs == 2; 4244 // We are only processing the operands of d16 image operations on subtargets 4245 // that use the unpacked register layout, or need to repack the TFE result. 4246 4247 // TODO: Do we need to guard against already legalized intrinsics? 4248 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4249 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4250 4251 MachineRegisterInfo *MRI = B.getMRI(); 4252 const LLT S32 = LLT::scalar(32); 4253 const LLT S16 = LLT::scalar(16); 4254 const LLT V2S16 = LLT::fixed_vector(2, 16); 4255 4256 unsigned DMask = 0; 4257 4258 // Check for 16 bit addresses and pack if true. 4259 LLT GradTy = 4260 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4261 LLT AddrTy = 4262 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4263 const bool IsG16 = GradTy == S16; 4264 const bool IsA16 = AddrTy == S16; 4265 4266 int DMaskLanes = 0; 4267 if (!BaseOpcode->Atomic) { 4268 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4269 if (BaseOpcode->Gather4) { 4270 DMaskLanes = 4; 4271 } else if (DMask != 0) { 4272 DMaskLanes = countPopulation(DMask); 4273 } else if (!IsTFE && !BaseOpcode->Store) { 4274 // If dmask is 0, this is a no-op load. This can be eliminated. 4275 B.buildUndef(MI.getOperand(0)); 4276 MI.eraseFromParent(); 4277 return true; 4278 } 4279 } 4280 4281 Observer.changingInstr(MI); 4282 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4283 4284 unsigned NewOpcode = NumDefs == 0 ? 4285 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4286 4287 // Track that we legalized this 4288 MI.setDesc(B.getTII().get(NewOpcode)); 4289 4290 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4291 // dmask to be at least 1 otherwise the instruction will fail 4292 if (IsTFE && DMask == 0) { 4293 DMask = 0x1; 4294 DMaskLanes = 1; 4295 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4296 } 4297 4298 if (BaseOpcode->Atomic) { 4299 Register VData0 = MI.getOperand(2).getReg(); 4300 LLT Ty = MRI->getType(VData0); 4301 4302 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4303 if (Ty.isVector()) 4304 return false; 4305 4306 if (BaseOpcode->AtomicX2) { 4307 Register VData1 = MI.getOperand(3).getReg(); 4308 // The two values are packed in one register. 4309 LLT PackedTy = LLT::fixed_vector(2, Ty); 4310 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4311 MI.getOperand(2).setReg(Concat.getReg(0)); 4312 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4313 } 4314 } 4315 4316 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4317 4318 // Optimize _L to _LZ when _L is zero 4319 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4320 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4321 const ConstantFP *ConstantLod; 4322 4323 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4324 m_GFCst(ConstantLod))) { 4325 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4326 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
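        // For example, an image.sample.l with a constant lod of 0.0 (or a
        // negative lod) is rewritten to the corresponding image.sample.lz
        // intrinsic and the now-dead lod operand is dropped.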
4327 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4328 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4329 Intr->Dim); 4330 4331 // The starting indexes should remain in the same place. 4332 --CorrectedNumVAddrs; 4333 4334 MI.getOperand(MI.getNumExplicitDefs()) 4335 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4336 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4337 Intr = NewImageDimIntr; 4338 } 4339 } 4340 } 4341 4342 // Optimize _mip away, when 'lod' is zero 4343 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4344 int64_t ConstantLod; 4345 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4346 m_ICst(ConstantLod))) { 4347 if (ConstantLod == 0) { 4348 // TODO: Change intrinsic opcode and remove operand instead or replacing 4349 // it with 0, as the _L to _LZ handling is done above. 4350 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4351 --CorrectedNumVAddrs; 4352 } 4353 } 4354 } 4355 4356 // Rewrite the addressing register layout before doing anything else. 4357 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4358 // 16 bit gradients are supported, but are tied to the A16 control 4359 // so both gradients and addresses must be 16 bit 4360 return false; 4361 } 4362 4363 if (IsA16 && !ST.hasA16()) { 4364 // A16 not supported 4365 return false; 4366 } 4367 4368 if (IsA16 || IsG16) { 4369 if (Intr->NumVAddrs > 1) { 4370 SmallVector<Register, 4> PackedRegs; 4371 4372 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4373 IsG16); 4374 4375 // See also below in the non-a16 branch 4376 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4377 4378 if (!UseNSA && PackedRegs.size() > 1) { 4379 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4380 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4381 PackedRegs[0] = Concat.getReg(0); 4382 PackedRegs.resize(1); 4383 } 4384 4385 const unsigned NumPacked = PackedRegs.size(); 4386 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4387 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4388 if (!SrcOp.isReg()) { 4389 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4390 continue; 4391 } 4392 4393 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4394 4395 if (I - Intr->VAddrStart < NumPacked) 4396 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4397 else 4398 SrcOp.setReg(AMDGPU::NoRegister); 4399 } 4400 } 4401 } else { 4402 // If the register allocator cannot place the address registers contiguously 4403 // without introducing moves, then using the non-sequential address encoding 4404 // is always preferable, since it saves VALU instructions and is usually a 4405 // wash in terms of code size or even better. 4406 // 4407 // However, we currently have no way of hinting to the register allocator 4408 // that MIMG addresses should be placed contiguously when it is possible to 4409 // do so, so force non-NSA for the common 2-address case as a heuristic. 4410 // 4411 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4412 // allocation when possible. 
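    // Hence the >= 3 threshold below: the one- and two-address cases always
    // use the sequential (non-NSA) encoding.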
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
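  // For example (assuming a full dmask), a TFE load of a plain s32 produces a
  // <2 x s32> here (data dword + error dword), while a packed d16 <4 x s16>
  // load with TFE produces a <3 x s32> (two packed data dwords + error dword).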
4508 4509 if (IsTFE) { 4510 Dst1Reg = MI.getOperand(1).getReg(); 4511 if (MRI->getType(Dst1Reg) != S32) 4512 return false; 4513 4514 // TODO: Make sure the TFE operand bit is set. 4515 MI.RemoveOperand(1); 4516 4517 // Handle the easy case that requires no repack instructions. 4518 if (Ty == S32) { 4519 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4520 return true; 4521 } 4522 } 4523 4524 // Now figure out how to copy the new result register back into the old 4525 // result. 4526 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4527 4528 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4529 4530 if (ResultNumRegs == 1) { 4531 assert(!IsTFE); 4532 ResultRegs[0] = NewResultReg; 4533 } else { 4534 // We have to repack into a new vector of some kind. 4535 for (int I = 0; I != NumDataRegs; ++I) 4536 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4537 B.buildUnmerge(ResultRegs, NewResultReg); 4538 4539 // Drop the final TFE element to get the data part. The TFE result is 4540 // directly written to the right place already. 4541 if (IsTFE) 4542 ResultRegs.resize(NumDataRegs); 4543 } 4544 4545 // For an s16 scalar result, we form an s32 result with a truncate regardless 4546 // of packed vs. unpacked. 4547 if (IsD16 && !Ty.isVector()) { 4548 B.buildTrunc(DstReg, ResultRegs[0]); 4549 return true; 4550 } 4551 4552 // Avoid a build/concat_vector of 1 entry. 4553 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4554 B.buildBitcast(DstReg, ResultRegs[0]); 4555 return true; 4556 } 4557 4558 assert(Ty.isVector()); 4559 4560 if (IsD16) { 4561 // For packed D16 results with TFE enabled, all the data components are 4562 // S32. Cast back to the expected type. 4563 // 4564 // TODO: We don't really need to use load s32 elements. We would only need one 4565 // cast for the TFE result if a multiple of v2s16 was used. 4566 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4567 for (Register &Reg : ResultRegs) 4568 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4569 } else if (ST.hasUnpackedD16VMem()) { 4570 for (Register &Reg : ResultRegs) 4571 Reg = B.buildTrunc(S16, Reg).getReg(0); 4572 } 4573 } 4574 4575 auto padWithUndef = [&](LLT Ty, int NumElts) { 4576 if (NumElts == 0) 4577 return; 4578 Register Undef = B.buildUndef(Ty).getReg(0); 4579 for (int I = 0; I != NumElts; ++I) 4580 ResultRegs.push_back(Undef); 4581 }; 4582 4583 // Pad out any elements eliminated due to the dmask. 4584 LLT ResTy = MRI->getType(ResultRegs[0]); 4585 if (!ResTy.isVector()) { 4586 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4587 B.buildBuildVector(DstReg, ResultRegs); 4588 return true; 4589 } 4590 4591 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4592 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4593 4594 // Deal with the one annoying legal case. 
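  // That is, for a <3 x s16> result the data arrives as v2s16 pieces; pad with
  // undef up to one piece more than needed, concatenate to <6 x s16>, and
  // unmerge that into the <3 x s16> result plus a dead <3 x s16> half.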
4595 const LLT V3S16 = LLT::fixed_vector(3, 16); 4596 if (Ty == V3S16) { 4597 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4598 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4599 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4600 return true; 4601 } 4602 4603 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4604 B.buildConcatVectors(DstReg, ResultRegs); 4605 return true; 4606 } 4607 4608 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4609 LegalizerHelper &Helper, MachineInstr &MI) const { 4610 MachineIRBuilder &B = Helper.MIRBuilder; 4611 GISelChangeObserver &Observer = Helper.Observer; 4612 4613 Register Dst = MI.getOperand(0).getReg(); 4614 LLT Ty = B.getMRI()->getType(Dst); 4615 unsigned Size = Ty.getSizeInBits(); 4616 MachineFunction &MF = B.getMF(); 4617 4618 Observer.changingInstr(MI); 4619 4620 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4621 Ty = getBitcastRegisterType(Ty); 4622 Helper.bitcastDst(MI, Ty, 0); 4623 Dst = MI.getOperand(0).getReg(); 4624 B.setInsertPt(B.getMBB(), MI); 4625 } 4626 4627 // FIXME: We don't really need this intermediate instruction. The intrinsic 4628 // should be fixed to have a memory operand. Since it's readnone, we're not 4629 // allowed to add one. 4630 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4631 MI.RemoveOperand(1); // Remove intrinsic ID 4632 4633 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4634 // TODO: Should this use datalayout alignment? 4635 const unsigned MemSize = (Size + 7) / 8; 4636 const Align MemAlign(4); 4637 MachineMemOperand *MMO = MF.getMachineMemOperand( 4638 MachinePointerInfo(), 4639 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4640 MachineMemOperand::MOInvariant, 4641 MemSize, MemAlign); 4642 MI.addMemOperand(MF, MMO); 4643 4644 // There are no 96-bit result scalar loads, but widening to 128-bit should 4645 // always be legal. We may need to restore this to a 96-bit result if it turns 4646 // out this needs to be converted to a vector load during RegBankSelect. 4647 if (!isPowerOf2_32(Size)) { 4648 if (Ty.isVector()) 4649 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4650 else 4651 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4652 } 4653 4654 Observer.changedInstr(MI); 4655 return true; 4656 } 4657 4658 // TODO: Move to selection 4659 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4660 MachineRegisterInfo &MRI, 4661 MachineIRBuilder &B) const { 4662 if (!ST.isTrapHandlerEnabled() || 4663 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4664 return legalizeTrapEndpgm(MI, MRI, B); 4665 4666 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4667 switch (*HsaAbiVer) { 4668 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4669 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4670 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4671 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4672 return ST.supportsGetDoorbellID() ? 
4673 legalizeTrapHsa(MI, MRI, B) : 4674 legalizeTrapHsaQueuePtr(MI, MRI, B); 4675 } 4676 } 4677 4678 llvm_unreachable("Unknown trap handler"); 4679 } 4680 4681 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4682 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4683 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4684 MI.eraseFromParent(); 4685 return true; 4686 } 4687 4688 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4690 // Pass queue pointer to trap handler as input, and insert trap instruction 4691 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4692 Register LiveIn = 4693 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4694 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4695 return false; 4696 4697 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4698 B.buildCopy(SGPR01, LiveIn); 4699 B.buildInstr(AMDGPU::S_TRAP) 4700 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4701 .addReg(SGPR01, RegState::Implicit); 4702 4703 MI.eraseFromParent(); 4704 return true; 4705 } 4706 4707 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4708 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4709 B.buildInstr(AMDGPU::S_TRAP) 4710 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4711 MI.eraseFromParent(); 4712 return true; 4713 } 4714 4715 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4716 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4717 // Is non-HSA path or trap-handler disabled? then, report a warning 4718 // accordingly 4719 if (!ST.isTrapHandlerEnabled() || 4720 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4721 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4722 "debugtrap handler not supported", 4723 MI.getDebugLoc(), DS_Warning); 4724 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4725 Ctx.diagnose(NoTrap); 4726 } else { 4727 // Insert debug-trap instruction 4728 B.buildInstr(AMDGPU::S_TRAP) 4729 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4730 } 4731 4732 MI.eraseFromParent(); 4733 return true; 4734 } 4735 4736 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4737 MachineIRBuilder &B) const { 4738 MachineRegisterInfo &MRI = *B.getMRI(); 4739 const LLT S16 = LLT::scalar(16); 4740 const LLT S32 = LLT::scalar(32); 4741 4742 Register DstReg = MI.getOperand(0).getReg(); 4743 Register NodePtr = MI.getOperand(2).getReg(); 4744 Register RayExtent = MI.getOperand(3).getReg(); 4745 Register RayOrigin = MI.getOperand(4).getReg(); 4746 Register RayDir = MI.getOperand(5).getReg(); 4747 Register RayInvDir = MI.getOperand(6).getReg(); 4748 Register TDescr = MI.getOperand(7).getReg(); 4749 4750 if (!ST.hasGFX10_AEncoding()) { 4751 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4752 "intrinsic not supported on subtarget", 4753 MI.getDebugLoc()); 4754 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4755 return false; 4756 } 4757 4758 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4759 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4760 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4761 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4762 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4763 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4764 4765 SmallVector<Register, 12> Ops; 4766 if (Is64) { 4767 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4768 Ops.push_back(Unmerge.getReg(0)); 4769 Ops.push_back(Unmerge.getReg(1)); 4770 } else { 4771 Ops.push_back(NodePtr); 4772 } 4773 Ops.push_back(RayExtent); 4774 4775 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4776 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4777 Ops.push_back(Unmerge.getReg(0)); 4778 Ops.push_back(Unmerge.getReg(1)); 4779 Ops.push_back(Unmerge.getReg(2)); 4780 }; 4781 4782 packLanes(RayOrigin); 4783 if (IsA16) { 4784 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4785 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4786 Register R1 = MRI.createGenericVirtualRegister(S32); 4787 Register R2 = MRI.createGenericVirtualRegister(S32); 4788 Register R3 = MRI.createGenericVirtualRegister(S32); 4789 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4790 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4791 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4792 Ops.push_back(R1); 4793 Ops.push_back(R2); 4794 Ops.push_back(R3); 4795 } else { 4796 packLanes(RayDir); 4797 packLanes(RayInvDir); 4798 } 4799 4800 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4801 .addDef(DstReg) 4802 .addImm(Opcode); 4803 4804 for (Register R : Ops) { 4805 MIB.addUse(R); 4806 } 4807 4808 MIB.addUse(TDescr) 4809 .addImm(IsA16 ? 1 : 0) 4810 .cloneMemRefs(MI); 4811 4812 MI.eraseFromParent(); 4813 return true; 4814 } 4815 4816 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4817 MachineInstr &MI) const { 4818 MachineIRBuilder &B = Helper.MIRBuilder; 4819 MachineRegisterInfo &MRI = *B.getMRI(); 4820 4821 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4822 auto IntrID = MI.getIntrinsicID(); 4823 switch (IntrID) { 4824 case Intrinsic::amdgcn_if: 4825 case Intrinsic::amdgcn_else: { 4826 MachineInstr *Br = nullptr; 4827 MachineBasicBlock *UncondBrTarget = nullptr; 4828 bool Negated = false; 4829 if (MachineInstr *BrCond = 4830 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4831 const SIRegisterInfo *TRI 4832 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4833 4834 Register Def = MI.getOperand(1).getReg(); 4835 Register Use = MI.getOperand(3).getReg(); 4836 4837 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4838 4839 if (Negated) 4840 std::swap(CondBrTarget, UncondBrTarget); 4841 4842 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4843 if (IntrID == Intrinsic::amdgcn_if) { 4844 B.buildInstr(AMDGPU::SI_IF) 4845 .addDef(Def) 4846 .addUse(Use) 4847 .addMBB(UncondBrTarget); 4848 } else { 4849 B.buildInstr(AMDGPU::SI_ELSE) 4850 .addDef(Def) 4851 .addUse(Use) 4852 .addMBB(UncondBrTarget); 4853 } 4854 4855 if (Br) { 4856 Br->getOperand(0).setMBB(CondBrTarget); 4857 } else { 4858 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4859 // since we're swapping branch targets it needs to be reinserted. 
4860 // FIXME: IRTranslator should probably not do this 4861 B.buildBr(*CondBrTarget); 4862 } 4863 4864 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4865 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4866 MI.eraseFromParent(); 4867 BrCond->eraseFromParent(); 4868 return true; 4869 } 4870 4871 return false; 4872 } 4873 case Intrinsic::amdgcn_loop: { 4874 MachineInstr *Br = nullptr; 4875 MachineBasicBlock *UncondBrTarget = nullptr; 4876 bool Negated = false; 4877 if (MachineInstr *BrCond = 4878 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4879 const SIRegisterInfo *TRI 4880 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4881 4882 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4883 Register Reg = MI.getOperand(2).getReg(); 4884 4885 if (Negated) 4886 std::swap(CondBrTarget, UncondBrTarget); 4887 4888 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4889 B.buildInstr(AMDGPU::SI_LOOP) 4890 .addUse(Reg) 4891 .addMBB(UncondBrTarget); 4892 4893 if (Br) 4894 Br->getOperand(0).setMBB(CondBrTarget); 4895 else 4896 B.buildBr(*CondBrTarget); 4897 4898 MI.eraseFromParent(); 4899 BrCond->eraseFromParent(); 4900 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4901 return true; 4902 } 4903 4904 return false; 4905 } 4906 case Intrinsic::amdgcn_kernarg_segment_ptr: 4907 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4908 // This only makes sense to call in a kernel, so just lower to null. 4909 B.buildConstant(MI.getOperand(0).getReg(), 0); 4910 MI.eraseFromParent(); 4911 return true; 4912 } 4913 4914 return legalizePreloadedArgIntrin( 4915 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4916 case Intrinsic::amdgcn_implicitarg_ptr: 4917 return legalizeImplicitArgPtr(MI, MRI, B); 4918 case Intrinsic::amdgcn_workitem_id_x: 4919 return legalizePreloadedArgIntrin(MI, MRI, B, 4920 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4921 case Intrinsic::amdgcn_workitem_id_y: 4922 return legalizePreloadedArgIntrin(MI, MRI, B, 4923 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4924 case Intrinsic::amdgcn_workitem_id_z: 4925 return legalizePreloadedArgIntrin(MI, MRI, B, 4926 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4927 case Intrinsic::amdgcn_workgroup_id_x: 4928 return legalizePreloadedArgIntrin(MI, MRI, B, 4929 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4930 case Intrinsic::amdgcn_workgroup_id_y: 4931 return legalizePreloadedArgIntrin(MI, MRI, B, 4932 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4933 case Intrinsic::amdgcn_workgroup_id_z: 4934 return legalizePreloadedArgIntrin(MI, MRI, B, 4935 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4936 case Intrinsic::amdgcn_dispatch_ptr: 4937 return legalizePreloadedArgIntrin(MI, MRI, B, 4938 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4939 case Intrinsic::amdgcn_queue_ptr: 4940 return legalizePreloadedArgIntrin(MI, MRI, B, 4941 AMDGPUFunctionArgInfo::QUEUE_PTR); 4942 case Intrinsic::amdgcn_implicit_buffer_ptr: 4943 return legalizePreloadedArgIntrin( 4944 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4945 case Intrinsic::amdgcn_dispatch_id: 4946 return legalizePreloadedArgIntrin(MI, MRI, B, 4947 AMDGPUFunctionArgInfo::DISPATCH_ID); 4948 case Intrinsic::amdgcn_fdiv_fast: 4949 return legalizeFDIVFastIntrin(MI, MRI, B); 4950 case Intrinsic::amdgcn_is_shared: 4951 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4952 case Intrinsic::amdgcn_is_private: 4953 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4954 case Intrinsic::amdgcn_wavefrontsize: { 4955 
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
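  // The BVH intrinsic needs its node pointer, ray extent, origin and direction
  // operands repacked into 32-bit lanes; see legalizeBVHIntrinsic above.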
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}