//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
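    // Everything else (including flat) is conservatively capped at 128 bits
    // per access here.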
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
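  // Otherwise, only bitcast non-extending vector accesses whose element type
  // is not already a register-compatible element type, as long as the total
  // size still fits in registers.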
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
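    // Checking has16BitInsts() is therefore sufficient here.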
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want the legalizer
    // to loop infinitely.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
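              // Narrow extending loads to the memory size first; the remaining
              // extension is re-legalized afterwards.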
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
    .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
MRI, B); 1744 case TargetOpcode::G_UDIV: 1745 case TargetOpcode::G_UREM: 1746 case TargetOpcode::G_UDIVREM: 1747 return legalizeUnsignedDIV_REM(MI, MRI, B); 1748 case TargetOpcode::G_SDIV: 1749 case TargetOpcode::G_SREM: 1750 case TargetOpcode::G_SDIVREM: 1751 return legalizeSignedDIV_REM(MI, MRI, B); 1752 case TargetOpcode::G_ATOMIC_CMPXCHG: 1753 return legalizeAtomicCmpXChg(MI, MRI, B); 1754 case TargetOpcode::G_FLOG: 1755 return legalizeFlog(MI, B, numbers::ln2f); 1756 case TargetOpcode::G_FLOG10: 1757 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1758 case TargetOpcode::G_FEXP: 1759 return legalizeFExp(MI, B); 1760 case TargetOpcode::G_FPOW: 1761 return legalizeFPow(MI, B); 1762 case TargetOpcode::G_FFLOOR: 1763 return legalizeFFloor(MI, MRI, B); 1764 case TargetOpcode::G_BUILD_VECTOR: 1765 return legalizeBuildVector(MI, MRI, B); 1766 case TargetOpcode::G_CTLZ: 1767 case TargetOpcode::G_CTTZ: 1768 return legalizeCTLZ_CTTZ(MI, MRI, B); 1769 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1770 return legalizeFPTruncRound(MI, B); 1771 default: 1772 return false; 1773 } 1774 1775 llvm_unreachable("expected switch to return"); 1776 } 1777 1778 Register AMDGPULegalizerInfo::getSegmentAperture( 1779 unsigned AS, 1780 MachineRegisterInfo &MRI, 1781 MachineIRBuilder &B) const { 1782 MachineFunction &MF = B.getMF(); 1783 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1784 const LLT S32 = LLT::scalar(32); 1785 1786 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1787 1788 if (ST.hasApertureRegs()) { 1789 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1790 // getreg. 1791 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1792 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1793 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1794 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1795 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1796 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1797 unsigned Encoding = 1798 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1799 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1800 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1801 1802 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1803 1804 B.buildInstr(AMDGPU::S_GETREG_B32) 1805 .addDef(GetReg) 1806 .addImm(Encoding); 1807 MRI.setType(GetReg, S32); 1808 1809 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1810 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1811 } 1812 1813 // TODO: can we be smarter about machine pointer info? 1814 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1815 Register LoadAddr = MRI.createGenericVirtualRegister( 1816 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1817 // For code object version 5, private_base and shared_base are passed through 1818 // implicit kernargs. 1819 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1820 AMDGPUTargetLowering::ImplicitParameter Param = 1821 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1822 : AMDGPUTargetLowering::PRIVATE_BASE; 1823 uint64_t Offset = 1824 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1825 1826 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1827 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1828 1829 if (!loadInputValue(KernargPtrReg, B, 1830 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1831 return Register(); 1832 1833 MachineMemOperand *MMO = MF.getMachineMemOperand( 1834 PtrInfo, 1835 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1836 MachineMemOperand::MOInvariant, 1837 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1838 1839 // Pointer address 1840 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1841 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1842 // Load address 1843 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1844 } 1845 1846 Register QueuePtr = MRI.createGenericVirtualRegister( 1847 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1848 1849 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1850 return Register(); 1851 1852 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1853 // private_segment_aperture_base_hi. 1854 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1855 1856 MachineMemOperand *MMO = MF.getMachineMemOperand( 1857 PtrInfo, 1858 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1859 MachineMemOperand::MOInvariant, 1860 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1861 1862 B.buildPtrAdd(LoadAddr, QueuePtr, 1863 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1864 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1865 } 1866 1867 /// Return true if the value is a known valid address, such that a null check is 1868 /// not necessary. 1869 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1870 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1871 MachineInstr *Def = MRI.getVRegDef(Val); 1872 switch (Def->getOpcode()) { 1873 case AMDGPU::G_FRAME_INDEX: 1874 case AMDGPU::G_GLOBAL_VALUE: 1875 case AMDGPU::G_BLOCK_ADDR: 1876 return true; 1877 case AMDGPU::G_CONSTANT: { 1878 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1879 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1880 } 1881 default: 1882 return false; 1883 } 1884 1885 return false; 1886 } 1887 1888 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1889 MachineInstr &MI, MachineRegisterInfo &MRI, 1890 MachineIRBuilder &B) const { 1891 MachineFunction &MF = B.getMF(); 1892 1893 const LLT S32 = LLT::scalar(32); 1894 Register Dst = MI.getOperand(0).getReg(); 1895 Register Src = MI.getOperand(1).getReg(); 1896 1897 LLT DstTy = MRI.getType(Dst); 1898 LLT SrcTy = MRI.getType(Src); 1899 unsigned DestAS = DstTy.getAddressSpace(); 1900 unsigned SrcAS = SrcTy.getAddressSpace(); 1901 1902 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1903 // vector element. 1904 assert(!DstTy.isVector()); 1905 1906 const AMDGPUTargetMachine &TM 1907 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1908 1909 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1910 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1911 return true; 1912 } 1913 1914 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1915 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1916 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1917 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1918 // Extract low 32-bits of the pointer. 
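// Illustrative sketch only (register names are placeholders, not generated
// output): when the source might be null, the cast is expanded further down to
// roughly
//   %lo:_(p3)  = G_EXTRACT %src(p0), 0
//   %ne:_(s1)  = G_ICMP intpred(ne), %src(p0), %flat_null
//   %dst:_(p3) = G_SELECT %ne, %lo, %segment_null
// For a source that isKnownNonNull proves valid, only the low-half extract
// emitted next is needed.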
1919 B.buildExtract(Dst, Src, 0); 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 unsigned NullVal = TM.getNullPointerValue(DestAS); 1925 1926 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1927 auto FlatNull = B.buildConstant(SrcTy, 0); 1928 1929 // Extract low 32-bits of the pointer. 1930 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1931 1932 auto CmpRes = 1933 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1934 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1935 1936 MI.eraseFromParent(); 1937 return true; 1938 } 1939 1940 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1941 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1942 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1943 if (!ST.hasFlatAddressSpace()) 1944 return false; 1945 1946 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1947 if (!ApertureReg.isValid()) 1948 return false; 1949 1950 // Coerce the type of the low half of the result so we can use merge_values. 1951 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1952 1953 // TODO: Should we allow mismatched types but matching sizes in merges to 1954 // avoid the ptrtoint? 1955 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1956 1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1958 B.buildCopy(Dst, BuildPtr); 1959 MI.eraseFromParent(); 1960 return true; 1961 } 1962 1963 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1964 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1965 1966 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1967 SegmentNull.getReg(0)); 1968 1969 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1970 1971 MI.eraseFromParent(); 1972 return true; 1973 } 1974 1975 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1976 SrcTy.getSizeInBits() == 64) { 1977 // Truncate. 1978 B.buildExtract(Dst, Src, 0); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 1984 DstTy.getSizeInBits() == 64) { 1985 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1986 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1987 1988 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1989 // another. Merge operands are required to be the same type, but creating an 1990 // extra ptrtoint would be kind of pointless. 1991 auto HighAddr = B.buildConstant( 1992 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1993 B.buildMerge(Dst, {Src, HighAddr}); 1994 MI.eraseFromParent(); 1995 return true; 1996 } 1997 1998 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 1999 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2000 2001 LLVMContext &Ctx = MF.getFunction().getContext(); 2002 Ctx.diagnose(InvalidAddrSpaceCast); 2003 B.buildUndef(Dst); 2004 MI.eraseFromParent(); 2005 return true; 2006 } 2007 2008 bool AMDGPULegalizerInfo::legalizeFrint( 2009 MachineInstr &MI, MachineRegisterInfo &MRI, 2010 MachineIRBuilder &B) const { 2011 Register Src = MI.getOperand(1).getReg(); 2012 LLT Ty = MRI.getType(Src); 2013 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2014 2015 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2016 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2017 2018 auto C1 = B.buildFConstant(Ty, C1Val); 2019 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2020 2021 // TODO: Should this propagate fast-math-flags? 
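// Informal note on the trick used here (the constants are the ones built
// above; nothing new is assumed): adding and then subtracting
// copysign(0x1.0p+52, src) forces rounding at integer granularity under the
// default round-to-nearest-even mode:
//   tmp = (src + copysign(2^52, src)) - copysign(2^52, src)
// Magnitudes above 0x1.fffffffffffffp+51 are already integral, so the final
// select below returns the original source for them.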
2022 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2023 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2024 2025 auto C2 = B.buildFConstant(Ty, C2Val); 2026 auto Fabs = B.buildFAbs(Ty, Src); 2027 2028 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2029 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2030 MI.eraseFromParent(); 2031 return true; 2032 } 2033 2034 bool AMDGPULegalizerInfo::legalizeFceil( 2035 MachineInstr &MI, MachineRegisterInfo &MRI, 2036 MachineIRBuilder &B) const { 2037 2038 const LLT S1 = LLT::scalar(1); 2039 const LLT S64 = LLT::scalar(64); 2040 2041 Register Src = MI.getOperand(1).getReg(); 2042 assert(MRI.getType(Src) == S64); 2043 2044 // result = trunc(src) 2045 // if (src > 0.0 && src != result) 2046 // result += 1.0 2047 2048 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2049 2050 const auto Zero = B.buildFConstant(S64, 0.0); 2051 const auto One = B.buildFConstant(S64, 1.0); 2052 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2053 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2054 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2055 auto Add = B.buildSelect(S64, And, One, Zero); 2056 2057 // TODO: Should this propagate fast-math-flags? 2058 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2059 MI.eraseFromParent(); 2060 return true; 2061 } 2062 2063 bool AMDGPULegalizerInfo::legalizeFrem( 2064 MachineInstr &MI, MachineRegisterInfo &MRI, 2065 MachineIRBuilder &B) const { 2066 Register DstReg = MI.getOperand(0).getReg(); 2067 Register Src0Reg = MI.getOperand(1).getReg(); 2068 Register Src1Reg = MI.getOperand(2).getReg(); 2069 auto Flags = MI.getFlags(); 2070 LLT Ty = MRI.getType(DstReg); 2071 2072 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2073 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2074 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2075 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2076 MI.eraseFromParent(); 2077 return true; 2078 } 2079 2080 static MachineInstrBuilder extractF64Exponent(Register Hi, 2081 MachineIRBuilder &B) { 2082 const unsigned FractBits = 52; 2083 const unsigned ExpBits = 11; 2084 LLT S32 = LLT::scalar(32); 2085 2086 auto Const0 = B.buildConstant(S32, FractBits - 32); 2087 auto Const1 = B.buildConstant(S32, ExpBits); 2088 2089 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2090 .addUse(Hi) 2091 .addUse(Const0.getReg(0)) 2092 .addUse(Const1.getReg(0)); 2093 2094 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2095 } 2096 2097 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2098 MachineInstr &MI, MachineRegisterInfo &MRI, 2099 MachineIRBuilder &B) const { 2100 const LLT S1 = LLT::scalar(1); 2101 const LLT S32 = LLT::scalar(32); 2102 const LLT S64 = LLT::scalar(64); 2103 2104 Register Src = MI.getOperand(1).getReg(); 2105 assert(MRI.getType(Src) == S64); 2106 2107 // TODO: Should this use extract since the low half is unused? 2108 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2109 Register Hi = Unmerge.getReg(1); 2110 2111 // Extract the upper half, since this is where we will find the sign and 2112 // exponent. 2113 auto Exp = extractF64Exponent(Hi, B); 2114 2115 const unsigned FractBits = 52; 2116 2117 // Extract the sign bit. 2118 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2119 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2120 2121 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2122 2123 const auto Zero32 = B.buildConstant(S32, 0); 2124 2125 // Extend back to 64-bits. 
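// Informal summary of the remaining steps (a sketch, not a normative
// description):
//   exp  = unbiased exponent of src
//   keep = src & ~(fract_mask >> exp)     // clear the fractional bits
//   res  = exp < 0  ? copysign(0.0, src)  // |src| < 1.0, only the sign survives
//        : exp > 51 ? src                 // already an integer
//        : keep
// The sign-only value is built next by merging zero with the isolated sign bit.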
2126 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2127 2128 auto Shr = B.buildAShr(S64, FractMask, Exp); 2129 auto Not = B.buildNot(S64, Shr); 2130 auto Tmp0 = B.buildAnd(S64, Src, Not); 2131 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2132 2133 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2134 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2135 2136 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2137 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2138 MI.eraseFromParent(); 2139 return true; 2140 } 2141 2142 bool AMDGPULegalizerInfo::legalizeITOFP( 2143 MachineInstr &MI, MachineRegisterInfo &MRI, 2144 MachineIRBuilder &B, bool Signed) const { 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src = MI.getOperand(1).getReg(); 2148 2149 const LLT S64 = LLT::scalar(64); 2150 const LLT S32 = LLT::scalar(32); 2151 2152 assert(MRI.getType(Src) == S64); 2153 2154 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2155 auto ThirtyTwo = B.buildConstant(S32, 32); 2156 2157 if (MRI.getType(Dst) == S64) { 2158 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2159 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2160 2161 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2162 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2163 .addUse(CvtHi.getReg(0)) 2164 .addUse(ThirtyTwo.getReg(0)); 2165 2166 // TODO: Should this propagate fast-math-flags? 2167 B.buildFAdd(Dst, LdExp, CvtLo); 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 assert(MRI.getType(Dst) == S32); 2173 2174 auto One = B.buildConstant(S32, 1); 2175 2176 MachineInstrBuilder ShAmt; 2177 if (Signed) { 2178 auto ThirtyOne = B.buildConstant(S32, 31); 2179 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2180 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2181 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2182 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2183 /*HasSideEffects=*/false) 2184 .addUse(Unmerge.getReg(1)); 2185 auto LS2 = B.buildSub(S32, LS, One); 2186 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2187 } else 2188 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2189 auto Norm = B.buildShl(S64, Src, ShAmt); 2190 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2191 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2192 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2193 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2194 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2195 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2196 /*HasSideEffects=*/false) 2197 .addUse(FVal.getReg(0)) 2198 .addUse(Scale.getReg(0)); 2199 MI.eraseFromParent(); 2200 return true; 2201 } 2202 2203 // TODO: Copied from DAG implementation. Verify logic and document how this 2204 // actually works. 
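// Worked example of the split implemented below (illustrative only; the
// constants are the 2^-32 / -2^32 values built in the function): converting
// val = 2^40 + 5 from f64,
//   tf  = trunc(val)          = 0x10000000005
//   hif = floor(tf * 2^-32)   = 0x100
//   lof = fma(hif, -2^32, tf) = 0x5
// so hi = fptoui(hif) = 0x100, lo = fptoui(lof) = 0x5, and the merged
// {lo, hi} pair reassembles 0x10000000005.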
2205 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2206 MachineRegisterInfo &MRI, 2207 MachineIRBuilder &B, 2208 bool Signed) const { 2209 2210 Register Dst = MI.getOperand(0).getReg(); 2211 Register Src = MI.getOperand(1).getReg(); 2212 2213 const LLT S64 = LLT::scalar(64); 2214 const LLT S32 = LLT::scalar(32); 2215 2216 const LLT SrcLT = MRI.getType(Src); 2217 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2218 2219 unsigned Flags = MI.getFlags(); 2220 2221 // The basic idea of converting a floating point number into a pair of 32-bit 2222 // integers is illustrated as follows: 2223 // 2224 // tf := trunc(val); 2225 // hif := floor(tf * 2^-32); 2226 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2227 // hi := fptoi(hif); 2228 // lo := fptoi(lof); 2229 // 2230 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2231 MachineInstrBuilder Sign; 2232 if (Signed && SrcLT == S32) { 2233 // However, a 32-bit floating point number has only 23 bits mantissa and 2234 // it's not enough to hold all the significant bits of `lof` if val is 2235 // negative. To avoid the loss of precision, We need to take the absolute 2236 // value after truncating and flip the result back based on the original 2237 // signedness. 2238 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2239 Trunc = B.buildFAbs(S32, Trunc, Flags); 2240 } 2241 MachineInstrBuilder K0, K1; 2242 if (SrcLT == S64) { 2243 K0 = B.buildFConstant(S64, 2244 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2245 K1 = B.buildFConstant(S64, 2246 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2247 } else { 2248 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2249 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2250 } 2251 2252 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2253 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2254 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2255 2256 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2257 : B.buildFPTOUI(S32, FloorMul); 2258 auto Lo = B.buildFPTOUI(S32, Fma); 2259 2260 if (Signed && SrcLT == S32) { 2261 // Flip the result based on the signedness, which is either all 0s or 1s. 2262 Sign = B.buildMerge(S64, {Sign, Sign}); 2263 // r := xor({lo, hi}, sign) - sign; 2264 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2265 } else 2266 B.buildMerge(Dst, {Lo, Hi}); 2267 MI.eraseFromParent(); 2268 2269 return true; 2270 } 2271 2272 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2273 MachineInstr &MI) const { 2274 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2275 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2276 2277 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2278 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2279 2280 // With ieee_mode disabled, the instructions have the correct behavior 2281 // already for G_FMINNUM/G_FMAXNUM 2282 if (!MFI->getMode().IEEE) 2283 return !IsIEEEOp; 2284 2285 if (IsIEEEOp) 2286 return true; 2287 2288 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2289 } 2290 2291 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2292 MachineInstr &MI, MachineRegisterInfo &MRI, 2293 MachineIRBuilder &B) const { 2294 // TODO: Should move some of this into LegalizerHelper. 
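// For a constant in-range index the extract is rewritten into an unmerge plus
// a copy, e.g. (register names are placeholders):
//   %e0, %e1, %e2, %e3 = G_UNMERGE_VALUES %vec(<4 x s32>)
//   %dst = COPY %e2                       ; index == 2
// Constant out-of-range indices become G_IMPLICIT_DEF, and dynamic indices are
// left untouched for the selector's register-indexing path.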
2295 2296 // TODO: Promote dynamic indexing of s16 to s32 2297 2298 // FIXME: Artifact combiner probably should have replaced the truncated 2299 // constant before this, so we shouldn't need 2300 // getIConstantVRegValWithLookThrough. 2301 Optional<ValueAndVReg> MaybeIdxVal = 2302 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2303 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2304 return true; 2305 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2306 2307 Register Dst = MI.getOperand(0).getReg(); 2308 Register Vec = MI.getOperand(1).getReg(); 2309 2310 LLT VecTy = MRI.getType(Vec); 2311 LLT EltTy = VecTy.getElementType(); 2312 assert(EltTy == MRI.getType(Dst)); 2313 2314 if (IdxVal < VecTy.getNumElements()) { 2315 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2316 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2317 } else { 2318 B.buildUndef(Dst); 2319 } 2320 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2326 MachineInstr &MI, MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 // TODO: Should move some of this into LegalizerHelper. 2329 2330 // TODO: Promote dynamic indexing of s16 to s32 2331 2332 // FIXME: Artifact combiner probably should have replaced the truncated 2333 // constant before this, so we shouldn't need 2334 // getIConstantVRegValWithLookThrough. 2335 Optional<ValueAndVReg> MaybeIdxVal = 2336 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2337 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2338 return true; 2339 2340 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2341 Register Dst = MI.getOperand(0).getReg(); 2342 Register Vec = MI.getOperand(1).getReg(); 2343 Register Ins = MI.getOperand(2).getReg(); 2344 2345 LLT VecTy = MRI.getType(Vec); 2346 LLT EltTy = VecTy.getElementType(); 2347 assert(EltTy == MRI.getType(Ins)); 2348 (void)Ins; 2349 2350 unsigned NumElts = VecTy.getNumElements(); 2351 if (IdxVal < NumElts) { 2352 SmallVector<Register, 8> SrcRegs; 2353 for (unsigned i = 0; i < NumElts; ++i) 2354 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2355 B.buildUnmerge(SrcRegs, Vec); 2356 2357 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2358 B.buildMerge(Dst, SrcRegs); 2359 } else { 2360 B.buildUndef(Dst); 2361 } 2362 2363 MI.eraseFromParent(); 2364 return true; 2365 } 2366 2367 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2368 MachineInstr &MI, MachineRegisterInfo &MRI, 2369 MachineIRBuilder &B) const { 2370 const LLT V2S16 = LLT::fixed_vector(2, 16); 2371 2372 Register Dst = MI.getOperand(0).getReg(); 2373 Register Src0 = MI.getOperand(1).getReg(); 2374 LLT DstTy = MRI.getType(Dst); 2375 LLT SrcTy = MRI.getType(Src0); 2376 2377 if (SrcTy == V2S16 && DstTy == V2S16 && 2378 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2379 return true; 2380 2381 MachineIRBuilder HelperBuilder(MI); 2382 GISelObserverWrapper DummyObserver; 2383 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2384 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2385 } 2386 2387 bool AMDGPULegalizerInfo::legalizeSinCos( 2388 MachineInstr &MI, MachineRegisterInfo &MRI, 2389 MachineIRBuilder &B) const { 2390 2391 Register DstReg = MI.getOperand(0).getReg(); 2392 Register SrcReg = MI.getOperand(1).getReg(); 2393 LLT Ty = MRI.getType(DstReg); 2394 unsigned Flags = MI.getFlags(); 2395 2396 Register TrigVal; 2397 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2398 if (ST.hasTrigReducedRange()) { 2399 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2400 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2401 .addUse(MulVal.getReg(0)) 2402 .setMIFlags(Flags).getReg(0); 2403 } else 2404 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2405 2406 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2407 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2408 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2409 .addUse(TrigVal) 2410 .setMIFlags(Flags); 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2416 MachineIRBuilder &B, 2417 const GlobalValue *GV, 2418 int64_t Offset, 2419 unsigned GAFlags) const { 2420 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2421 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2422 // to the following code sequence: 2423 // 2424 // For constant address space: 2425 // s_getpc_b64 s[0:1] 2426 // s_add_u32 s0, s0, $symbol 2427 // s_addc_u32 s1, s1, 0 2428 // 2429 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2430 // a fixup or relocation is emitted to replace $symbol with a literal 2431 // constant, which is a pc-relative offset from the encoding of the $symbol 2432 // operand to the global variable. 2433 // 2434 // For global address space: 2435 // s_getpc_b64 s[0:1] 2436 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2437 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2438 // 2439 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2440 // fixups or relocations are emitted to replace $symbol@*@lo and 2441 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2442 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2443 // operand to the global variable. 2444 // 2445 // What we want here is an offset from the value returned by s_getpc 2446 // (which is the address of the s_add_u32 instruction) to the global 2447 // variable, but since the encoding of $symbol starts 4 bytes after the start 2448 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2449 // small. This requires us to add 4 to the global variable offset in order to 2450 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2451 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2452 // instruction. 2453 2454 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2455 2456 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2457 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2458 2459 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2460 .addDef(PCReg); 2461 2462 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2463 if (GAFlags == SIInstrInfo::MO_NONE) 2464 MIB.addImm(0); 2465 else 2466 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2467 2468 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2469 2470 if (PtrTy.getSizeInBits() == 32) 2471 B.buildExtract(DstReg, PCReg, 0); 2472 return true; 2473 } 2474 2475 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2476 MachineInstr &MI, MachineRegisterInfo &MRI, 2477 MachineIRBuilder &B) const { 2478 Register DstReg = MI.getOperand(0).getReg(); 2479 LLT Ty = MRI.getType(DstReg); 2480 unsigned AS = Ty.getAddressSpace(); 2481 2482 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2483 MachineFunction &MF = B.getMF(); 2484 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2485 2486 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2487 if (!MFI->isModuleEntryFunction() && 2488 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2489 const Function &Fn = MF.getFunction(); 2490 DiagnosticInfoUnsupported BadLDSDecl( 2491 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2492 DS_Warning); 2493 Fn.getContext().diagnose(BadLDSDecl); 2494 2495 // We currently don't have a way to correctly allocate LDS objects that 2496 // aren't directly associated with a kernel. We do force inlining of 2497 // functions that use local objects. However, if these dead functions are 2498 // not eliminated, we don't want a compile time error. Just emit a warning 2499 // and a trap, since there should be no callable path here. 2500 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2501 B.buildUndef(DstReg); 2502 MI.eraseFromParent(); 2503 return true; 2504 } 2505 2506 // TODO: We could emit code to handle the initialization somewhere. 2507 // We ignore the initializer for now and legalize it to allow selection. 2508 // The initializer will anyway get errored out during assembly emission. 2509 const SITargetLowering *TLI = ST.getTargetLowering(); 2510 if (!TLI->shouldUseLDSConstAddress(GV)) { 2511 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2512 return true; // Leave in place; 2513 } 2514 2515 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2516 Type *Ty = GV->getValueType(); 2517 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2518 // zero-sized type in other languages to declare the dynamic shared 2519 // memory which size is not known at the compile time. They will be 2520 // allocated by the runtime and placed directly after the static 2521 // allocated ones. They all share the same offset. 2522 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2523 // Adjust alignment for that dynamic shared memory array. 
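// Besides the alignment update just below, the "address" of a dynamic LDS
// array is simply the total statically allocated LDS size, since the runtime
// places the dynamic region directly after it. Sketch (illustrative only):
//   %size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize)
//   %dst:_(p3)   = G_INTTOPTR %size(s32)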
2524 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2525 LLT S32 = LLT::scalar(32); 2526 auto Sz = 2527 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2528 B.buildIntToPtr(DstReg, Sz); 2529 MI.eraseFromParent(); 2530 return true; 2531 } 2532 } 2533 2534 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2535 *cast<GlobalVariable>(GV))); 2536 MI.eraseFromParent(); 2537 return true; 2538 } 2539 2540 const SITargetLowering *TLI = ST.getTargetLowering(); 2541 2542 if (TLI->shouldEmitFixup(GV)) { 2543 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2544 MI.eraseFromParent(); 2545 return true; 2546 } 2547 2548 if (TLI->shouldEmitPCReloc(GV)) { 2549 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2550 MI.eraseFromParent(); 2551 return true; 2552 } 2553 2554 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2555 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2556 2557 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2558 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2559 MachinePointerInfo::getGOT(MF), 2560 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2561 MachineMemOperand::MOInvariant, 2562 LoadTy, Align(8)); 2563 2564 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2565 2566 if (Ty.getSizeInBits() == 32) { 2567 // Truncate if this is a 32-bit constant address. 2568 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2569 B.buildExtract(DstReg, Load, 0); 2570 } else 2571 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2572 2573 MI.eraseFromParent(); 2574 return true; 2575 } 2576 2577 static LLT widenToNextPowerOf2(LLT Ty) { 2578 if (Ty.isVector()) 2579 return Ty.changeElementCount( 2580 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2581 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2582 } 2583 2584 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2585 MachineInstr &MI) const { 2586 MachineIRBuilder &B = Helper.MIRBuilder; 2587 MachineRegisterInfo &MRI = *B.getMRI(); 2588 GISelChangeObserver &Observer = Helper.Observer; 2589 2590 Register PtrReg = MI.getOperand(1).getReg(); 2591 LLT PtrTy = MRI.getType(PtrReg); 2592 unsigned AddrSpace = PtrTy.getAddressSpace(); 2593 2594 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2595 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2596 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2597 Observer.changingInstr(MI); 2598 MI.getOperand(1).setReg(Cast.getReg(0)); 2599 Observer.changedInstr(MI); 2600 return true; 2601 } 2602 2603 if (MI.getOpcode() != AMDGPU::G_LOAD) 2604 return false; 2605 2606 Register ValReg = MI.getOperand(0).getReg(); 2607 LLT ValTy = MRI.getType(ValReg); 2608 2609 MachineMemOperand *MMO = *MI.memoperands_begin(); 2610 const unsigned ValSize = ValTy.getSizeInBits(); 2611 const LLT MemTy = MMO->getMemoryType(); 2612 const Align MemAlign = MMO->getAlign(); 2613 const unsigned MemSize = MemTy.getSizeInBits(); 2614 const uint64_t AlignInBits = 8 * MemAlign.value(); 2615 2616 // Widen non-power-of-2 loads to the alignment if needed 2617 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2618 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2619 2620 // This was already the correct extending load result type, so just adjust 2621 // the memory type. 
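// Two outcomes are possible below; both are sketched here informally with
// placeholder operands. If the rounded-up memory size already matches the
// result size (an extending load), only the MachineMemOperand is widened.
// Otherwise the load result itself is widened and trimmed back, e.g. for a
// <3 x s32> result:
//   %wide:_(<4 x s32>) = G_LOAD %ptr(p1)
//   %val:_(<3 x s32>)  = G_EXTRACT %wide(<4 x s32>), 0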
2622 if (WideMemSize == ValSize) { 2623 MachineFunction &MF = B.getMF(); 2624 2625 MachineMemOperand *WideMMO = 2626 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2627 Observer.changingInstr(MI); 2628 MI.setMemRefs(MF, {WideMMO}); 2629 Observer.changedInstr(MI); 2630 return true; 2631 } 2632 2633 // Don't bother handling edge case that should probably never be produced. 2634 if (ValSize > WideMemSize) 2635 return false; 2636 2637 LLT WideTy = widenToNextPowerOf2(ValTy); 2638 2639 Register WideLoad; 2640 if (!WideTy.isVector()) { 2641 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2642 B.buildTrunc(ValReg, WideLoad).getReg(0); 2643 } else { 2644 // Extract the subvector. 2645 2646 if (isRegisterType(ValTy)) { 2647 // If this a case where G_EXTRACT is legal, use it. 2648 // (e.g. <3 x s32> -> <4 x s32>) 2649 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2650 B.buildExtract(ValReg, WideLoad, 0); 2651 } else { 2652 // For cases where the widened type isn't a nice register value, unmerge 2653 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2654 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2655 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2656 } 2657 } 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 return false; 2664 } 2665 2666 bool AMDGPULegalizerInfo::legalizeFMad( 2667 MachineInstr &MI, MachineRegisterInfo &MRI, 2668 MachineIRBuilder &B) const { 2669 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2670 assert(Ty.isScalar()); 2671 2672 MachineFunction &MF = B.getMF(); 2673 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2674 2675 // TODO: Always legal with future ftz flag. 2676 // FIXME: Do we need just output? 2677 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2678 return true; 2679 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2680 return true; 2681 2682 MachineIRBuilder HelperBuilder(MI); 2683 GISelObserverWrapper DummyObserver; 2684 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2685 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2686 } 2687 2688 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2690 Register DstReg = MI.getOperand(0).getReg(); 2691 Register PtrReg = MI.getOperand(1).getReg(); 2692 Register CmpVal = MI.getOperand(2).getReg(); 2693 Register NewVal = MI.getOperand(3).getReg(); 2694 2695 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2696 "this should not have been custom lowered"); 2697 2698 LLT ValTy = MRI.getType(CmpVal); 2699 LLT VecTy = LLT::fixed_vector(2, ValTy); 2700 2701 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2702 2703 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2704 .addDef(DstReg) 2705 .addUse(PtrReg) 2706 .addUse(PackedVal) 2707 .setMemRefs(MI.memoperands()); 2708 2709 MI.eraseFromParent(); 2710 return true; 2711 } 2712 2713 bool AMDGPULegalizerInfo::legalizeFlog( 2714 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2715 Register Dst = MI.getOperand(0).getReg(); 2716 Register Src = MI.getOperand(1).getReg(); 2717 LLT Ty = B.getMRI()->getType(Dst); 2718 unsigned Flags = MI.getFlags(); 2719 2720 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2721 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2722 2723 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2724 
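// Net effect (the constants come from the callers in legalizeCustom above):
//   G_FLOG   x  ->  log2(x) * ln(2)
//   G_FLOG10 x  ->  log2(x) * (ln(2)/ln(10))  ==  log2(x) * log10(2)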
MI.eraseFromParent(); 2725 return true; 2726 } 2727 2728 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2729 MachineIRBuilder &B) const { 2730 Register Dst = MI.getOperand(0).getReg(); 2731 Register Src = MI.getOperand(1).getReg(); 2732 unsigned Flags = MI.getFlags(); 2733 LLT Ty = B.getMRI()->getType(Dst); 2734 2735 auto K = B.buildFConstant(Ty, numbers::log2e); 2736 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2737 B.buildFExp2(Dst, Mul, Flags); 2738 MI.eraseFromParent(); 2739 return true; 2740 } 2741 2742 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2743 MachineIRBuilder &B) const { 2744 Register Dst = MI.getOperand(0).getReg(); 2745 Register Src0 = MI.getOperand(1).getReg(); 2746 Register Src1 = MI.getOperand(2).getReg(); 2747 unsigned Flags = MI.getFlags(); 2748 LLT Ty = B.getMRI()->getType(Dst); 2749 const LLT S16 = LLT::scalar(16); 2750 const LLT S32 = LLT::scalar(32); 2751 2752 if (Ty == S32) { 2753 auto Log = B.buildFLog2(S32, Src0, Flags); 2754 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2755 .addUse(Log.getReg(0)) 2756 .addUse(Src1) 2757 .setMIFlags(Flags); 2758 B.buildFExp2(Dst, Mul, Flags); 2759 } else if (Ty == S16) { 2760 // There's no f16 fmul_legacy, so we need to convert for it. 2761 auto Log = B.buildFLog2(S16, Src0, Flags); 2762 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2763 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2764 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2765 .addUse(Ext0.getReg(0)) 2766 .addUse(Ext1.getReg(0)) 2767 .setMIFlags(Flags); 2768 2769 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2770 } else 2771 return false; 2772 2773 MI.eraseFromParent(); 2774 return true; 2775 } 2776 2777 // Find a source register, ignoring any possible source modifiers. 2778 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2779 Register ModSrc = OrigSrc; 2780 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2781 ModSrc = SrcFNeg->getOperand(1).getReg(); 2782 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2783 ModSrc = SrcFAbs->getOperand(1).getReg(); 2784 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2785 ModSrc = SrcFAbs->getOperand(1).getReg(); 2786 return ModSrc; 2787 } 2788 2789 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2790 MachineRegisterInfo &MRI, 2791 MachineIRBuilder &B) const { 2792 2793 const LLT S1 = LLT::scalar(1); 2794 const LLT S64 = LLT::scalar(64); 2795 Register Dst = MI.getOperand(0).getReg(); 2796 Register OrigSrc = MI.getOperand(1).getReg(); 2797 unsigned Flags = MI.getFlags(); 2798 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2799 "this should not have been custom lowered"); 2800 2801 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2802 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2803 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2804 // V_FRACT bug is: 2805 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2806 // 2807 // Convert floor(x) to (x - fract(x)) 2808 2809 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2810 .addUse(OrigSrc) 2811 .setMIFlags(Flags); 2812 2813 // Give source modifier matching some assistance before obscuring a foldable 2814 // pattern. 2815 2816 // TODO: We can avoid the neg on the fract? The input sign to fract 2817 // shouldn't matter? 
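// Note (informal): ModSrc below only looks through a leading fneg/fabs so the
// NaN check compares the unmodified value and the modifier stays foldable at
// selection time. The clamp constant 0x3fefffffffffffff is the largest f64
// strictly below 1.0 (1 - 2^-53), as required by the V_FRACT bug workaround
// described above.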
2818 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2819 2820 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2821 2822 Register Min = MRI.createGenericVirtualRegister(S64); 2823 2824 // We don't need to concern ourselves with the snan handling difference, so 2825 // use the one which will directly select. 2826 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2827 if (MFI->getMode().IEEE) 2828 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2829 else 2830 B.buildFMinNum(Min, Fract, Const, Flags); 2831 2832 Register CorrectedFract = Min; 2833 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2834 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2835 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2836 } 2837 2838 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2839 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2840 2841 MI.eraseFromParent(); 2842 return true; 2843 } 2844 2845 // Turn an illegal packed v2s16 build vector into bit operations. 2846 // TODO: This should probably be a bitcast action in LegalizerHelper. 2847 bool AMDGPULegalizerInfo::legalizeBuildVector( 2848 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2849 Register Dst = MI.getOperand(0).getReg(); 2850 const LLT S32 = LLT::scalar(32); 2851 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2852 2853 Register Src0 = MI.getOperand(1).getReg(); 2854 Register Src1 = MI.getOperand(2).getReg(); 2855 assert(MRI.getType(Src0) == LLT::scalar(16)); 2856 2857 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2858 B.buildBitcast(Dst, Merge); 2859 2860 MI.eraseFromParent(); 2861 return true; 2862 } 2863 2864 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2865 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2866 // case with a single min instruction instead of a compare+select. 2867 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2868 MachineRegisterInfo &MRI, 2869 MachineIRBuilder &B) const { 2870 Register Dst = MI.getOperand(0).getReg(); 2871 Register Src = MI.getOperand(1).getReg(); 2872 LLT DstTy = MRI.getType(Dst); 2873 LLT SrcTy = MRI.getType(Src); 2874 2875 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2876 ? AMDGPU::G_AMDGPU_FFBH_U32 2877 : AMDGPU::G_AMDGPU_FFBL_B32; 2878 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2879 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2880 2881 MI.eraseFromParent(); 2882 return true; 2883 } 2884 2885 // Check that this is a G_XOR x, -1 2886 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2887 if (MI.getOpcode() != TargetOpcode::G_XOR) 2888 return false; 2889 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2890 return ConstVal && *ConstVal == -1; 2891 } 2892 2893 // Return the use branch instruction, otherwise null if the usage is invalid. 
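// Accepted shape, informally (the exact opcodes below are a sketch, e.g. for
// llvm.amdgcn.if):
//   %cc:_(s1), %mask:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), ...
//   G_BRCOND %cc(s1), %bb.then
//   G_BR %bb.else                       ; or fall through to the next block
// A single negating G_XOR %cc, -1 between the intrinsic and the branch is also
// tolerated; it is erased and reported through 'Negated'.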
2894 static MachineInstr * 2895 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2896 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2897 Register CondDef = MI.getOperand(0).getReg(); 2898 if (!MRI.hasOneNonDBGUse(CondDef)) 2899 return nullptr; 2900 2901 MachineBasicBlock *Parent = MI.getParent(); 2902 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2903 2904 if (isNot(MRI, *UseMI)) { 2905 Register NegatedCond = UseMI->getOperand(0).getReg(); 2906 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2907 return nullptr; 2908 2909 // We're deleting the def of this value, so we need to remove it. 2910 eraseInstr(*UseMI, MRI); 2911 2912 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2913 Negated = true; 2914 } 2915 2916 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2917 return nullptr; 2918 2919 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2920 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2921 if (Next == Parent->end()) { 2922 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2923 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2924 return nullptr; 2925 UncondBrTarget = &*NextMBB; 2926 } else { 2927 if (Next->getOpcode() != AMDGPU::G_BR) 2928 return nullptr; 2929 Br = &*Next; 2930 UncondBrTarget = Br->getOperand(0).getMBB(); 2931 } 2932 2933 return UseMI; 2934 } 2935 2936 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2937 const ArgDescriptor *Arg, 2938 const TargetRegisterClass *ArgRC, 2939 LLT ArgTy) const { 2940 MCRegister SrcReg = Arg->getRegister(); 2941 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2942 assert(DstReg.isVirtual() && "Virtual register expected"); 2943 2944 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2945 *ArgRC, B.getDebugLoc(), ArgTy); 2946 if (Arg->isMasked()) { 2947 // TODO: Should we try to emit this once in the entry block? 2948 const LLT S32 = LLT::scalar(32); 2949 const unsigned Mask = Arg->getMask(); 2950 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2951 2952 Register AndMaskSrc = LiveIn; 2953 2954 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2955 // 0. 2956 if (Shift != 0) { 2957 auto ShiftAmt = B.buildConstant(S32, Shift); 2958 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2959 } 2960 2961 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2962 } else { 2963 B.buildCopy(DstReg, LiveIn); 2964 } 2965 2966 return true; 2967 } 2968 2969 bool AMDGPULegalizerInfo::loadInputValue( 2970 Register DstReg, MachineIRBuilder &B, 2971 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2972 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2973 const ArgDescriptor *Arg; 2974 const TargetRegisterClass *ArgRC; 2975 LLT ArgTy; 2976 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2977 2978 if (!Arg) { 2979 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2980 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2981 // case the pointer argument may be missing and we use null. 2982 B.buildConstant(DstReg, 0); 2983 return true; 2984 } 2985 2986 // It's undefined behavior if a function marked with the amdgpu-no-* 2987 // attributes uses the corresponding intrinsic. 
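// For instance (attribute chosen only for illustration), a function carrying
// "amdgpu-no-workitem-id-y" that still calls llvm.amdgcn.workitem.id.y gets no
// preloaded argument here, so an implicit_def is as good a value as any.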
2988 B.buildUndef(DstReg); 2989 return true; 2990 } 2991 2992 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2993 return false; // TODO: Handle these 2994 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2995 } 2996 2997 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2998 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2999 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3000 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3001 return false; 3002 3003 MI.eraseFromParent(); 3004 return true; 3005 } 3006 3007 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3008 MachineRegisterInfo &MRI, 3009 MachineIRBuilder &B) const { 3010 Register Dst = MI.getOperand(0).getReg(); 3011 LLT DstTy = MRI.getType(Dst); 3012 LLT S16 = LLT::scalar(16); 3013 LLT S32 = LLT::scalar(32); 3014 LLT S64 = LLT::scalar(64); 3015 3016 if (DstTy == S16) 3017 return legalizeFDIV16(MI, MRI, B); 3018 if (DstTy == S32) 3019 return legalizeFDIV32(MI, MRI, B); 3020 if (DstTy == S64) 3021 return legalizeFDIV64(MI, MRI, B); 3022 3023 return false; 3024 } 3025 3026 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3027 Register DstDivReg, 3028 Register DstRemReg, 3029 Register X, 3030 Register Y) const { 3031 const LLT S1 = LLT::scalar(1); 3032 const LLT S32 = LLT::scalar(32); 3033 3034 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3035 // algorithm used here. 3036 3037 // Initial estimate of inv(y). 3038 auto FloatY = B.buildUITOFP(S32, Y); 3039 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3040 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3041 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3042 auto Z = B.buildFPTOUI(S32, ScaledY); 3043 3044 // One round of UNR. 3045 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3046 auto NegYZ = B.buildMul(S32, NegY, Z); 3047 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3048 3049 // Quotient/remainder estimate. 3050 auto Q = B.buildUMulH(S32, X, Z); 3051 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3052 3053 // First quotient/remainder refinement. 3054 auto One = B.buildConstant(S32, 1); 3055 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3056 if (DstDivReg) 3057 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3058 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3059 3060 // Second quotient/remainder refinement. 
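// Informally, the estimate can still undershoot the true quotient, so a second
// correction identical to the first is applied:
//   if (r >= y) { q += 1; r -= y; }   // first refinement, above
//   if (r >= y) { q += 1; r -= y; }   // second refinement, below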
3061 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3062 if (DstDivReg) 3063 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3064 3065 if (DstRemReg) 3066 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3067 } 3068 3069 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3070 // 3071 // Return lo, hi of result 3072 // 3073 // %cvt.lo = G_UITOFP Val.lo 3074 // %cvt.hi = G_UITOFP Val.hi 3075 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3076 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3077 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3078 // %mul2 = G_FMUL %mul1, 2**(-32) 3079 // %trunc = G_INTRINSIC_TRUNC %mul2 3080 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3081 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3082 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3083 Register Val) { 3084 const LLT S32 = LLT::scalar(32); 3085 auto Unmerge = B.buildUnmerge(S32, Val); 3086 3087 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3088 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3089 3090 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3091 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3092 3093 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3094 auto Mul1 = 3095 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3096 3097 // 2**(-32) 3098 auto Mul2 = 3099 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3100 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3101 3102 // -(2**32) 3103 auto Mad2 = B.buildFMAD(S32, Trunc, 3104 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3105 3106 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3107 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3108 3109 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3110 } 3111 3112 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3113 Register DstDivReg, 3114 Register DstRemReg, 3115 Register Numer, 3116 Register Denom) const { 3117 const LLT S32 = LLT::scalar(32); 3118 const LLT S64 = LLT::scalar(64); 3119 const LLT S1 = LLT::scalar(1); 3120 Register RcpLo, RcpHi; 3121 3122 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3123 3124 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3125 3126 auto Zero64 = B.buildConstant(S64, 0); 3127 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3128 3129 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3130 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3131 3132 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3133 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3134 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3135 3136 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3137 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3138 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3139 3140 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3141 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3142 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3143 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3144 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3145 3146 auto Zero32 = B.buildConstant(S32, 0); 3147 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3148 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3149 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3150 3151 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3152 Register NumerLo = UnmergeNumer.getReg(0); 3153 Register NumerHi = UnmergeNumer.getReg(1); 3154 3155 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3156 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3157 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3158 Register Mul3_Lo = UnmergeMul3.getReg(0); 3159 Register Mul3_Hi = UnmergeMul3.getReg(1); 3160 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3161 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3162 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3163 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3164 3165 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3166 Register DenomLo = UnmergeDenom.getReg(0); 3167 Register DenomHi = UnmergeDenom.getReg(1); 3168 3169 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3170 auto C1 = B.buildSExt(S32, CmpHi); 3171 3172 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3173 auto C2 = B.buildSExt(S32, CmpLo); 3174 3175 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3176 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3177 3178 // TODO: Here and below portions of the code can be enclosed into if/endif. 3179 // Currently control flow is unconditional and we have 4 selects after 3180 // potential endif to substitute PHIs. 3181 3182 // if C3 != 0 ... 3183 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3184 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3185 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3186 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3187 3188 auto One64 = B.buildConstant(S64, 1); 3189 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3190 3191 auto C4 = 3192 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3193 auto C5 = 3194 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3195 auto C6 = B.buildSelect( 3196 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3197 3198 // if (C6 != 0) 3199 auto Add4 = B.buildAdd(S64, Add3, One64); 3200 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3201 3202 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3203 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3204 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3205 3206 // endif C6 3207 // endif C3 3208 3209 if (DstDivReg) { 3210 auto Sel1 = B.buildSelect( 3211 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3212 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3213 Sel1, MulHi3); 3214 } 3215 3216 if (DstRemReg) { 3217 auto Sel2 = B.buildSelect( 3218 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3219 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3220 Sel2, Sub1); 3221 } 3222 } 3223 3224 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3225 MachineRegisterInfo &MRI, 3226 MachineIRBuilder &B) const { 3227 Register DstDivReg, DstRemReg; 3228 switch (MI.getOpcode()) { 3229 default: 3230 llvm_unreachable("Unexpected opcode!"); 3231 case AMDGPU::G_UDIV: { 3232 DstDivReg = MI.getOperand(0).getReg(); 3233 break; 3234 } 3235 case AMDGPU::G_UREM: { 3236 DstRemReg = MI.getOperand(0).getReg(); 3237 break; 3238 } 3239 case AMDGPU::G_UDIVREM: { 3240 DstDivReg = MI.getOperand(0).getReg(); 3241 DstRemReg = MI.getOperand(1).getReg(); 3242 break; 3243 } 3244 } 3245 3246 const LLT S64 = LLT::scalar(64); 3247 const LLT S32 = LLT::scalar(32); 3248 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3249 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3250 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3251 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3252 3253 if (Ty == S32) 3254 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3255 else if (Ty == S64) 3256 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3257 else 3258 return false; 3259 3260 MI.eraseFromParent(); 3261 return true; 3262 } 3263 3264 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3265 MachineRegisterInfo &MRI, 3266 MachineIRBuilder &B) const { 3267 const LLT S64 = LLT::scalar(64); 3268 const LLT S32 = LLT::scalar(32); 3269 3270 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3271 if (Ty != S32 && Ty != S64) 3272 return false; 3273 3274 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3275 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3276 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3277 3278 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3279 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3280 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3281 3282 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3283 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3284 3285 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3286 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3287 3288 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3289 switch (MI.getOpcode()) { 3290 default: 3291 llvm_unreachable("Unexpected opcode!"); 3292 case AMDGPU::G_SDIV: { 3293 DstDivReg = MI.getOperand(0).getReg(); 3294 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3295 break; 3296 } 3297 case AMDGPU::G_SREM: { 3298 DstRemReg = MI.getOperand(0).getReg(); 3299 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3300 break; 3301 } 3302 case AMDGPU::G_SDIVREM: { 3303 DstDivReg = MI.getOperand(0).getReg(); 3304 DstRemReg = MI.getOperand(1).getReg(); 3305 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3306 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3307 break; 3308 } 3309 } 3310 3311 if (Ty == S32) 3312 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3313 else 3314 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3315 3316 if (DstDivReg) { 3317 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3318 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3319 B.buildSub(DstDivReg, SignXor, Sign); 3320 } 3321 3322 if (DstRemReg) { 3323 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3324 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3325 B.buildSub(DstRemReg, SignXor, Sign); 3326 } 3327 3328 MI.eraseFromParent(); 3329 return true; 3330 } 3331 3332 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3333 MachineRegisterInfo &MRI, 3334 MachineIRBuilder &B) const { 3335 Register Res = MI.getOperand(0).getReg(); 3336 Register LHS = MI.getOperand(1).getReg(); 3337 Register RHS = MI.getOperand(2).getReg(); 3338 uint16_t Flags = MI.getFlags(); 3339 LLT ResTy = MRI.getType(Res); 3340 3341 const MachineFunction &MF = B.getMF(); 3342 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3343 MI.getFlag(MachineInstr::FmAfn); 3344 3345 if (!AllowInaccurateRcp) 3346 return false; 3347 3348 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3349 // 1 / x -> RCP(x) 3350 if (CLHS->isExactlyValue(1.0)) { 3351 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3352 .addUse(RHS) 3353 .setMIFlags(Flags); 3354 3355 MI.eraseFromParent(); 3356 return true; 3357 } 3358 3359 // -1 / x -> RCP( FNEG(x) ) 3360 if (CLHS->isExactlyValue(-1.0)) { 3361 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
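// With the fneg above, -1.0 / x becomes rcp(-x); the negation is expected to
// fold into the rcp as a source modifier during selection (informal note, not
// a guarantee about the final ISA).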
3362 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3363 .addUse(FNeg.getReg(0)) 3364 .setMIFlags(Flags); 3365 3366 MI.eraseFromParent(); 3367 return true; 3368 } 3369 } 3370 3371 // x / y -> x * (1.0 / y) 3372 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3373 .addUse(RHS) 3374 .setMIFlags(Flags); 3375 B.buildFMul(Res, LHS, RCP, Flags); 3376 3377 MI.eraseFromParent(); 3378 return true; 3379 } 3380 3381 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3382 MachineRegisterInfo &MRI, 3383 MachineIRBuilder &B) const { 3384 Register Res = MI.getOperand(0).getReg(); 3385 Register X = MI.getOperand(1).getReg(); 3386 Register Y = MI.getOperand(2).getReg(); 3387 uint16_t Flags = MI.getFlags(); 3388 LLT ResTy = MRI.getType(Res); 3389 3390 const MachineFunction &MF = B.getMF(); 3391 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3392 MI.getFlag(MachineInstr::FmAfn); 3393 3394 if (!AllowInaccurateRcp) 3395 return false; 3396 3397 auto NegY = B.buildFNeg(ResTy, Y); 3398 auto One = B.buildFConstant(ResTy, 1.0); 3399 3400 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3401 .addUse(Y) 3402 .setMIFlags(Flags); 3403 3404 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3405 R = B.buildFMA(ResTy, Tmp0, R, R); 3406 3407 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3408 R = B.buildFMA(ResTy, Tmp1, R, R); 3409 3410 auto Ret = B.buildFMul(ResTy, X, R); 3411 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3412 3413 B.buildFMA(Res, Tmp2, R, Ret); 3414 MI.eraseFromParent(); 3415 return true; 3416 } 3417 3418 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3419 MachineRegisterInfo &MRI, 3420 MachineIRBuilder &B) const { 3421 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3422 return true; 3423 3424 Register Res = MI.getOperand(0).getReg(); 3425 Register LHS = MI.getOperand(1).getReg(); 3426 Register RHS = MI.getOperand(2).getReg(); 3427 3428 uint16_t Flags = MI.getFlags(); 3429 3430 LLT S16 = LLT::scalar(16); 3431 LLT S32 = LLT::scalar(32); 3432 3433 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3434 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3435 3436 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3437 .addUse(RHSExt.getReg(0)) 3438 .setMIFlags(Flags); 3439 3440 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3441 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3442 3443 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3444 .addUse(RDst.getReg(0)) 3445 .addUse(RHS) 3446 .addUse(LHS) 3447 .setMIFlags(Flags); 3448 3449 MI.eraseFromParent(); 3450 return true; 3451 } 3452 3453 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3454 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3455 static void toggleSPDenormMode(bool Enable, 3456 MachineIRBuilder &B, 3457 const GCNSubtarget &ST, 3458 AMDGPU::SIModeRegisterDefaults Mode) { 3459 // Set SP denorm mode to this value. 3460 unsigned SPDenormMode = 3461 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3462 3463 if (ST.hasDenormModeInst()) { 3464 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3465 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3466 3467 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3468 B.buildInstr(AMDGPU::S_DENORM_MODE) 3469 .addImm(NewDenormModeValue); 3470 3471 } else { 3472 // Select FP32 bit field in mode register. 
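// This builds the equivalent of hwreg(HW_REG_MODE, 4, 2): the FP32 (SP)
// denorm control is the 2-bit field starting at bit 4 of the MODE register.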
3473 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3474 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3475 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3476 3477 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3478 .addImm(SPDenormMode) 3479 .addImm(SPDenormModeBitField); 3480 } 3481 } 3482 3483 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3484 MachineRegisterInfo &MRI, 3485 MachineIRBuilder &B) const { 3486 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3487 return true; 3488 3489 Register Res = MI.getOperand(0).getReg(); 3490 Register LHS = MI.getOperand(1).getReg(); 3491 Register RHS = MI.getOperand(2).getReg(); 3492 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3493 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3494 3495 uint16_t Flags = MI.getFlags(); 3496 3497 LLT S32 = LLT::scalar(32); 3498 LLT S1 = LLT::scalar(1); 3499 3500 auto One = B.buildFConstant(S32, 1.0f); 3501 3502 auto DenominatorScaled = 3503 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3504 .addUse(LHS) 3505 .addUse(RHS) 3506 .addImm(0) 3507 .setMIFlags(Flags); 3508 auto NumeratorScaled = 3509 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3510 .addUse(LHS) 3511 .addUse(RHS) 3512 .addImm(1) 3513 .setMIFlags(Flags); 3514 3515 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3516 .addUse(DenominatorScaled.getReg(0)) 3517 .setMIFlags(Flags); 3518 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3519 3520 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3521 // aren't modeled as reading it. 3522 if (!Mode.allFP32Denormals()) 3523 toggleSPDenormMode(true, B, ST, Mode); 3524 3525 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3526 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3527 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3528 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3529 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3530 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3531 3532 if (!Mode.allFP32Denormals()) 3533 toggleSPDenormMode(false, B, ST, Mode); 3534 3535 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3536 .addUse(Fma4.getReg(0)) 3537 .addUse(Fma1.getReg(0)) 3538 .addUse(Fma3.getReg(0)) 3539 .addUse(NumeratorScaled.getReg(1)) 3540 .setMIFlags(Flags); 3541 3542 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3543 .addUse(Fmas.getReg(0)) 3544 .addUse(RHS) 3545 .addUse(LHS) 3546 .setMIFlags(Flags); 3547 3548 MI.eraseFromParent(); 3549 return true; 3550 } 3551 3552 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3553 MachineRegisterInfo &MRI, 3554 MachineIRBuilder &B) const { 3555 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3556 return true; 3557 3558 Register Res = MI.getOperand(0).getReg(); 3559 Register LHS = MI.getOperand(1).getReg(); 3560 Register RHS = MI.getOperand(2).getReg(); 3561 3562 uint16_t Flags = MI.getFlags(); 3563 3564 LLT S64 = LLT::scalar(64); 3565 LLT S1 = LLT::scalar(1); 3566 3567 auto One = B.buildFConstant(S64, 1.0); 3568 3569 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3570 .addUse(LHS) 3571 .addUse(RHS) 3572 .addImm(0) 3573 .setMIFlags(Flags); 3574 3575 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3576 3577 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3578 .addUse(DivScale0.getReg(0)) 3579 .setMIFlags(Flags); 3580 3581 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3582 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3583 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3584 3585 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3586 .addUse(LHS) 3587 .addUse(RHS) 3588 .addImm(1) 3589 .setMIFlags(Flags); 3590 3591 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3592 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3593 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3594 3595 Register Scale; 3596 if (!ST.hasUsableDivScaleConditionOutput()) { 3597 // Workaround a hardware bug on SI where the condition output from div_scale 3598 // is not usable. 3599 3600 LLT S32 = LLT::scalar(32); 3601 3602 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3603 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3604 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3605 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3606 3607 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3608 Scale1Unmerge.getReg(1)); 3609 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3610 Scale0Unmerge.getReg(1)); 3611 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3612 } else { 3613 Scale = DivScale1.getReg(1); 3614 } 3615 3616 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3617 .addUse(Fma4.getReg(0)) 3618 .addUse(Fma3.getReg(0)) 3619 .addUse(Mul.getReg(0)) 3620 .addUse(Scale) 3621 .setMIFlags(Flags); 3622 3623 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3624 .addUse(Fmas.getReg(0)) 3625 .addUse(RHS) 3626 .addUse(LHS) 3627 .setMIFlags(Flags); 3628 3629 MI.eraseFromParent(); 3630 return true; 3631 } 3632 3633 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3634 MachineRegisterInfo &MRI, 3635 MachineIRBuilder &B) const { 3636 Register Res = MI.getOperand(0).getReg(); 3637 Register LHS = MI.getOperand(2).getReg(); 3638 Register RHS = MI.getOperand(3).getReg(); 3639 uint16_t Flags = MI.getFlags(); 3640 3641 LLT S32 = LLT::scalar(32); 3642 LLT S1 = LLT::scalar(1); 3643 3644 auto Abs = B.buildFAbs(S32, RHS, Flags); 3645 const APFloat C0Val(1.0f); 3646 3647 auto C0 = B.buildConstant(S32, 0x6f800000); 3648 auto C1 = B.buildConstant(S32, 0x2f800000); 3649 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3650 3651 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3652 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3653 3654 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3655 3656 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3657 .addUse(Mul0.getReg(0)) 3658 .setMIFlags(Flags); 3659 3660 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3661 3662 B.buildFMul(Res, Sel, Mul1, Flags); 3663 3664 MI.eraseFromParent(); 3665 return true; 3666 } 3667 3668 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3669 // FIXME: Why do we handle this one but not other removed instructions? 3670 // 3671 // Reciprocal square root. The clamp prevents infinite results, clamping 3672 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3673 // +-max_float. 
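// In other words, the expansion below computes
// max(min(rsq(x), +max_float), -max_float), using the IEEE or non-IEEE
// min/max forms depending on the function's FP mode.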
3674 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3675 MachineRegisterInfo &MRI, 3676 MachineIRBuilder &B) const { 3677 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3678 return true; 3679 3680 Register Dst = MI.getOperand(0).getReg(); 3681 Register Src = MI.getOperand(2).getReg(); 3682 auto Flags = MI.getFlags(); 3683 3684 LLT Ty = MRI.getType(Dst); 3685 3686 const fltSemantics *FltSemantics; 3687 if (Ty == LLT::scalar(32)) 3688 FltSemantics = &APFloat::IEEEsingle(); 3689 else if (Ty == LLT::scalar(64)) 3690 FltSemantics = &APFloat::IEEEdouble(); 3691 else 3692 return false; 3693 3694 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3695 .addUse(Src) 3696 .setMIFlags(Flags); 3697 3698 // We don't need to concern ourselves with the snan handling difference, since 3699 // the rsq quieted (or not) so use the one which will directly select. 3700 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3701 const bool UseIEEE = MFI->getMode().IEEE; 3702 3703 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3704 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3705 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3706 3707 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3708 3709 if (UseIEEE) 3710 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3711 else 3712 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3713 MI.eraseFromParent(); 3714 return true; 3715 } 3716 3717 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3718 switch (IID) { 3719 case Intrinsic::amdgcn_ds_fadd: 3720 return AMDGPU::G_ATOMICRMW_FADD; 3721 case Intrinsic::amdgcn_ds_fmin: 3722 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3723 case Intrinsic::amdgcn_ds_fmax: 3724 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3725 default: 3726 llvm_unreachable("not a DS FP intrinsic"); 3727 } 3728 } 3729 3730 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3731 MachineInstr &MI, 3732 Intrinsic::ID IID) const { 3733 GISelChangeObserver &Observer = Helper.Observer; 3734 Observer.changingInstr(MI); 3735 3736 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3737 3738 // The remaining operands were used to set fields in the MemOperand on 3739 // construction. 3740 for (int I = 6; I > 3; --I) 3741 MI.removeOperand(I); 3742 3743 MI.removeOperand(1); // Remove the intrinsic ID. 
3744 Observer.changedInstr(MI); 3745 return true; 3746 } 3747 3748 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3749 MachineRegisterInfo &MRI, 3750 MachineIRBuilder &B) const { 3751 uint64_t Offset = 3752 ST.getTargetLowering()->getImplicitParameterOffset( 3753 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3754 LLT DstTy = MRI.getType(DstReg); 3755 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3756 3757 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3758 if (!loadInputValue(KernargPtrReg, B, 3759 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3760 return false; 3761 3762 // FIXME: This should be nuw 3763 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3764 return true; 3765 } 3766 3767 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3768 MachineRegisterInfo &MRI, 3769 MachineIRBuilder &B) const { 3770 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3771 if (!MFI->isEntryFunction()) { 3772 return legalizePreloadedArgIntrin(MI, MRI, B, 3773 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3774 } 3775 3776 Register DstReg = MI.getOperand(0).getReg(); 3777 if (!getImplicitArgPtr(DstReg, MRI, B)) 3778 return false; 3779 3780 MI.eraseFromParent(); 3781 return true; 3782 } 3783 3784 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3785 MachineRegisterInfo &MRI, 3786 MachineIRBuilder &B, 3787 unsigned AddrSpace) const { 3788 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3789 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3790 Register Hi32 = Unmerge.getReg(1); 3791 3792 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3793 MI.eraseFromParent(); 3794 return true; 3795 } 3796 3797 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3798 // offset (the offset that is included in bounds checking and swizzling, to be 3799 // split between the instruction's voffset and immoffset fields) and soffset 3800 // (the offset that is excluded from bounds checking and swizzling, to go in 3801 // the instruction's soffset field). This function takes the first kind of 3802 // offset and figures out how to split it between voffset and immoffset. 3803 std::pair<Register, unsigned> 3804 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3805 Register OrigOffset) const { 3806 const unsigned MaxImm = 4095; 3807 Register BaseReg; 3808 unsigned ImmOffset; 3809 const LLT S32 = LLT::scalar(32); 3810 MachineRegisterInfo &MRI = *B.getMRI(); 3811 3812 std::tie(BaseReg, ImmOffset) = 3813 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3814 3815 // If BaseReg is a pointer, convert it to int. 3816 if (MRI.getType(BaseReg).isPointer()) 3817 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3818 3819 // If the immediate value is too big for the immoffset field, put the value 3820 // and -4096 into the immoffset field so that the value that is copied/added 3821 // for the voffset field is a multiple of 4096, and it stands more chance 3822 // of being CSEd with the copy/add for another similar load/store. 3823 // However, do not do that rounding down to a multiple of 4096 if that is a 3824 // negative number, as it appears to be illegal to have a negative offset 3825 // in the vgpr, even if adding the immediate offset makes it positive. 
3826 unsigned Overflow = ImmOffset & ~MaxImm; 3827 ImmOffset -= Overflow; 3828 if ((int32_t)Overflow < 0) { 3829 Overflow += ImmOffset; 3830 ImmOffset = 0; 3831 } 3832 3833 if (Overflow != 0) { 3834 if (!BaseReg) { 3835 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3836 } else { 3837 auto OverflowVal = B.buildConstant(S32, Overflow); 3838 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3839 } 3840 } 3841 3842 if (!BaseReg) 3843 BaseReg = B.buildConstant(S32, 0).getReg(0); 3844 3845 return std::make_pair(BaseReg, ImmOffset); 3846 } 3847 3848 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3849 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3850 Register VOffset, Register SOffset, 3851 unsigned ImmOffset, Register VIndex, 3852 MachineRegisterInfo &MRI) const { 3853 Optional<ValueAndVReg> MaybeVOffsetVal = 3854 getIConstantVRegValWithLookThrough(VOffset, MRI); 3855 Optional<ValueAndVReg> MaybeSOffsetVal = 3856 getIConstantVRegValWithLookThrough(SOffset, MRI); 3857 Optional<ValueAndVReg> MaybeVIndexVal = 3858 getIConstantVRegValWithLookThrough(VIndex, MRI); 3859 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3860 // update the MMO with that offset. The stride is unknown so we can only do 3861 // this if VIndex is constant 0. 3862 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3863 MaybeVIndexVal->Value == 0) { 3864 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3865 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3866 MMO->setOffset(TotalOffset); 3867 } else { 3868 // We don't have a constant combined offset to use in the MMO. Give up. 3869 MMO->setValue((Value *)nullptr); 3870 } 3871 } 3872 3873 /// Handle register layout difference for f16 images for some subtargets. 
3874 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3875 MachineRegisterInfo &MRI, 3876 Register Reg, 3877 bool ImageStore) const { 3878 const LLT S16 = LLT::scalar(16); 3879 const LLT S32 = LLT::scalar(32); 3880 LLT StoreVT = MRI.getType(Reg); 3881 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3882 3883 if (ST.hasUnpackedD16VMem()) { 3884 auto Unmerge = B.buildUnmerge(S16, Reg); 3885 3886 SmallVector<Register, 4> WideRegs; 3887 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3888 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3889 3890 int NumElts = StoreVT.getNumElements(); 3891 3892 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3893 .getReg(0); 3894 } 3895 3896 if (ImageStore && ST.hasImageStoreD16Bug()) { 3897 if (StoreVT.getNumElements() == 2) { 3898 SmallVector<Register, 4> PackedRegs; 3899 Reg = B.buildBitcast(S32, Reg).getReg(0); 3900 PackedRegs.push_back(Reg); 3901 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3902 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3903 .getReg(0); 3904 } 3905 3906 if (StoreVT.getNumElements() == 3) { 3907 SmallVector<Register, 4> PackedRegs; 3908 auto Unmerge = B.buildUnmerge(S16, Reg); 3909 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3910 PackedRegs.push_back(Unmerge.getReg(I)); 3911 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3912 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3913 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3914 } 3915 3916 if (StoreVT.getNumElements() == 4) { 3917 SmallVector<Register, 4> PackedRegs; 3918 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3919 auto Unmerge = B.buildUnmerge(S32, Reg); 3920 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3921 PackedRegs.push_back(Unmerge.getReg(I)); 3922 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3923 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3924 .getReg(0); 3925 } 3926 3927 llvm_unreachable("invalid data type"); 3928 } 3929 3930 if (StoreVT == LLT::fixed_vector(3, S16)) { 3931 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3932 .getReg(0); 3933 } 3934 return Reg; 3935 } 3936 3937 Register AMDGPULegalizerInfo::fixStoreSourceType( 3938 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3939 MachineRegisterInfo *MRI = B.getMRI(); 3940 LLT Ty = MRI->getType(VData); 3941 3942 const LLT S16 = LLT::scalar(16); 3943 3944 // Fixup illegal register types for i8 stores. 
3945 if (Ty == LLT::scalar(8) || Ty == S16) { 3946 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3947 return AnyExt; 3948 } 3949 3950 if (Ty.isVector()) { 3951 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3952 if (IsFormat) 3953 return handleD16VData(B, *MRI, VData); 3954 } 3955 } 3956 3957 return VData; 3958 } 3959 3960 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3961 MachineRegisterInfo &MRI, 3962 MachineIRBuilder &B, 3963 bool IsTyped, 3964 bool IsFormat) const { 3965 Register VData = MI.getOperand(1).getReg(); 3966 LLT Ty = MRI.getType(VData); 3967 LLT EltTy = Ty.getScalarType(); 3968 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3969 const LLT S32 = LLT::scalar(32); 3970 3971 VData = fixStoreSourceType(B, VData, IsFormat); 3972 Register RSrc = MI.getOperand(2).getReg(); 3973 3974 MachineMemOperand *MMO = *MI.memoperands_begin(); 3975 const int MemSize = MMO->getSize(); 3976 3977 unsigned ImmOffset; 3978 3979 // The typed intrinsics add an immediate after the registers. 3980 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3981 3982 // The struct intrinsic variants add one additional operand over raw. 3983 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3984 Register VIndex; 3985 int OpOffset = 0; 3986 if (HasVIndex) { 3987 VIndex = MI.getOperand(3).getReg(); 3988 OpOffset = 1; 3989 } else { 3990 VIndex = B.buildConstant(S32, 0).getReg(0); 3991 } 3992 3993 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3994 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3995 3996 unsigned Format = 0; 3997 if (IsTyped) { 3998 Format = MI.getOperand(5 + OpOffset).getImm(); 3999 ++OpOffset; 4000 } 4001 4002 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4003 4004 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4005 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4006 4007 unsigned Opc; 4008 if (IsTyped) { 4009 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4010 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4011 } else if (IsFormat) { 4012 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4013 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4014 } else { 4015 switch (MemSize) { 4016 case 1: 4017 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4018 break; 4019 case 2: 4020 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4021 break; 4022 default: 4023 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4024 break; 4025 } 4026 } 4027 4028 auto MIB = B.buildInstr(Opc) 4029 .addUse(VData) // vdata 4030 .addUse(RSrc) // rsrc 4031 .addUse(VIndex) // vindex 4032 .addUse(VOffset) // voffset 4033 .addUse(SOffset) // soffset 4034 .addImm(ImmOffset); // offset(imm) 4035 4036 if (IsTyped) 4037 MIB.addImm(Format); 4038 4039 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4040 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4041 .addMemOperand(MMO); 4042 4043 MI.eraseFromParent(); 4044 return true; 4045 } 4046 4047 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4048 MachineRegisterInfo &MRI, 4049 MachineIRBuilder &B, 4050 bool IsFormat, 4051 bool IsTyped) const { 4052 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4053 MachineMemOperand *MMO = *MI.memoperands_begin(); 4054 const LLT MemTy = MMO->getMemoryType(); 4055 const LLT S32 = LLT::scalar(32); 4056 4057 Register Dst = MI.getOperand(0).getReg(); 4058 Register RSrc = MI.getOperand(2).getReg(); 4059 4060 // The typed intrinsics add an immediate after the registers. 
4061 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4062 4063 // The struct intrinsic variants add one additional operand over raw. 4064 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4065 Register VIndex; 4066 int OpOffset = 0; 4067 if (HasVIndex) { 4068 VIndex = MI.getOperand(3).getReg(); 4069 OpOffset = 1; 4070 } else { 4071 VIndex = B.buildConstant(S32, 0).getReg(0); 4072 } 4073 4074 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4075 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4076 4077 unsigned Format = 0; 4078 if (IsTyped) { 4079 Format = MI.getOperand(5 + OpOffset).getImm(); 4080 ++OpOffset; 4081 } 4082 4083 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4084 unsigned ImmOffset; 4085 4086 LLT Ty = MRI.getType(Dst); 4087 LLT EltTy = Ty.getScalarType(); 4088 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4089 const bool Unpacked = ST.hasUnpackedD16VMem(); 4090 4091 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4092 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4093 4094 unsigned Opc; 4095 4096 if (IsTyped) { 4097 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4098 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4099 } else if (IsFormat) { 4100 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4101 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4102 } else { 4103 switch (MemTy.getSizeInBits()) { 4104 case 8: 4105 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4106 break; 4107 case 16: 4108 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4109 break; 4110 default: 4111 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4112 break; 4113 } 4114 } 4115 4116 Register LoadDstReg; 4117 4118 bool IsExtLoad = 4119 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4120 LLT UnpackedTy = Ty.changeElementSize(32); 4121 4122 if (IsExtLoad) 4123 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4124 else if (Unpacked && IsD16 && Ty.isVector()) 4125 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4126 else 4127 LoadDstReg = Dst; 4128 4129 auto MIB = B.buildInstr(Opc) 4130 .addDef(LoadDstReg) // vdata 4131 .addUse(RSrc) // rsrc 4132 .addUse(VIndex) // vindex 4133 .addUse(VOffset) // voffset 4134 .addUse(SOffset) // soffset 4135 .addImm(ImmOffset); // offset(imm) 4136 4137 if (IsTyped) 4138 MIB.addImm(Format); 4139 4140 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4141 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4142 .addMemOperand(MMO); 4143 4144 if (LoadDstReg != Dst) { 4145 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4146 4147 // Widen result for extending loads was widened. 4148 if (IsExtLoad) 4149 B.buildTrunc(Dst, LoadDstReg); 4150 else { 4151 // Repack to original 16-bit vector result 4152 // FIXME: G_TRUNC should work, but legalization currently fails 4153 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4154 SmallVector<Register, 4> Repack; 4155 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4156 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4157 B.buildMerge(Dst, Repack); 4158 } 4159 } 4160 4161 MI.eraseFromParent(); 4162 return true; 4163 } 4164 4165 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4166 MachineIRBuilder &B, 4167 bool IsInc) const { 4168 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4169 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4170 B.buildInstr(Opc) 4171 .addDef(MI.getOperand(0).getReg()) 4172 .addUse(MI.getOperand(2).getReg()) 4173 .addUse(MI.getOperand(3).getReg()) 4174 .cloneMemRefs(MI); 4175 MI.eraseFromParent(); 4176 return true; 4177 } 4178 4179 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4180 switch (IntrID) { 4181 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4182 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4183 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4184 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4185 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4186 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4187 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4188 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4189 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4190 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4191 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4192 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4193 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4194 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4195 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4196 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4197 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4198 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4199 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4200 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4201 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4202 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4203 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4204 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4205 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4206 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4207 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4208 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4209 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4210 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4211 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4212 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4213 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4214 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4215 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4216 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4217 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4218 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4220 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4221 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4223 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4224 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4225 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4226 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4227 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4228 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4229 default: 4230 llvm_unreachable("unhandled atomic opcode"); 4231 } 4232 } 4233 4234 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4235 MachineIRBuilder &B, 4236 Intrinsic::ID IID) const { 4237 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4238 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4239 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4240 4241 Register Dst; 4242 4243 int OpOffset = 0; 4244 if (HasReturn) { 4245 // A few FP atomics do not support return values. 
4246 Dst = MI.getOperand(0).getReg(); 4247 } else { 4248 OpOffset = -1; 4249 } 4250 4251 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4252 Register CmpVal; 4253 4254 if (IsCmpSwap) { 4255 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4256 ++OpOffset; 4257 } 4258 4259 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4260 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4261 4262 // The struct intrinsic variants add one additional operand over raw. 4263 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4264 Register VIndex; 4265 if (HasVIndex) { 4266 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4267 ++OpOffset; 4268 } else { 4269 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4270 } 4271 4272 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4273 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4274 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4275 4276 MachineMemOperand *MMO = *MI.memoperands_begin(); 4277 4278 unsigned ImmOffset; 4279 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4280 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4281 4282 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4283 4284 if (HasReturn) 4285 MIB.addDef(Dst); 4286 4287 MIB.addUse(VData); // vdata 4288 4289 if (IsCmpSwap) 4290 MIB.addReg(CmpVal); 4291 4292 MIB.addUse(RSrc) // rsrc 4293 .addUse(VIndex) // vindex 4294 .addUse(VOffset) // voffset 4295 .addUse(SOffset) // soffset 4296 .addImm(ImmOffset) // offset(imm) 4297 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4298 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4299 .addMemOperand(MMO); 4300 4301 MI.eraseFromParent(); 4302 return true; 4303 } 4304 4305 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4306 /// vector with s16 typed elements. 4307 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4308 SmallVectorImpl<Register> &PackedAddrs, 4309 unsigned ArgOffset, 4310 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4311 bool IsA16, bool IsG16) { 4312 const LLT S16 = LLT::scalar(16); 4313 const LLT V2S16 = LLT::fixed_vector(2, 16); 4314 auto EndIdx = Intr->VAddrEnd; 4315 4316 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4317 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4318 if (!SrcOp.isReg()) 4319 continue; // _L to _LZ may have eliminated this. 4320 4321 Register AddrReg = SrcOp.getReg(); 4322 4323 if ((I < Intr->GradientStart) || 4324 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4325 (I >= Intr->CoordStart && !IsA16)) { 4326 if ((I < Intr->GradientStart) && IsA16 && 4327 (B.getMRI()->getType(AddrReg) == S16)) { 4328 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4329 // Special handling of bias when A16 is on. Bias is of type half but 4330 // occupies full 32-bit. 4331 PackedAddrs.push_back( 4332 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4333 .getReg(0)); 4334 } else { 4335 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4336 "Bias needs to be converted to 16 bit in A16 mode"); 4337 // Handle any gradient or coordinate operands that should not be packed 4338 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4339 PackedAddrs.push_back(AddrReg); 4340 } 4341 } else { 4342 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4343 // derivatives dx/dh and dx/dv are packed with undef. 
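// Pack the remaining components two at a time into <2 x s16>, padding the
// second lane with undef whenever the pair is incomplete.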
4344 if (((I + 1) >= EndIdx) ||
4345 ((Intr->NumGradients / 2) % 2 == 1 &&
4346 (I == static_cast<unsigned>(Intr->GradientStart +
4347 (Intr->NumGradients / 2) - 1) ||
4348 I == static_cast<unsigned>(Intr->GradientStart +
4349 Intr->NumGradients - 1))) ||
4350 // Check for _L to _LZ optimization
4351 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4352 PackedAddrs.push_back(
4353 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4354 .getReg(0));
4355 } else {
4356 PackedAddrs.push_back(
4357 B.buildBuildVector(
4358 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4359 .getReg(0));
4360 ++I;
4361 }
4362 }
4363 }
4364 }
4365
4366 /// Convert from separate vaddr components to a single vector address register,
4367 /// and replace the remaining operands with $noreg.
4368 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4369 int DimIdx, int NumVAddrs) {
4370 const LLT S32 = LLT::scalar(32);
4371
4372 SmallVector<Register, 8> AddrRegs;
4373 for (int I = 0; I != NumVAddrs; ++I) {
4374 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4375 if (SrcOp.isReg()) {
4376 AddrRegs.push_back(SrcOp.getReg());
4377 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4378 }
4379 }
4380
4381 int NumAddrRegs = AddrRegs.size();
4382 if (NumAddrRegs != 1) {
4383 // Above 8 elements round up to next power of 2 (i.e. 16).
4384 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4385 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4386 auto Undef = B.buildUndef(S32);
4387 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4388 NumAddrRegs = RoundedNumRegs;
4389 }
4390
4391 auto VAddr =
4392 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4393 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4394 }
4395
4396 for (int I = 1; I != NumVAddrs; ++I) {
4397 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4398 if (SrcOp.isReg())
4399 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4400 }
4401 }
4402
4403 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4404 ///
4405 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4406 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4407 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4408 /// registers.
4409 ///
4410 /// We don't want to directly select image instructions just yet, but also want
4411 /// to expose all register repacking to the legalizer/combiners. We also don't
4412 /// want a selected instruction entering RegBankSelect. In order to avoid
4413 /// defining a multitude of intermediate image instructions, directly hack on
4414 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4415 /// padding now-unnecessary arguments with $noreg.
4416 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4417 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4418 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4419
4420 const unsigned NumDefs = MI.getNumExplicitDefs();
4421 const unsigned ArgOffset = NumDefs + 1;
4422 bool IsTFE = NumDefs == 2;
4423 // We are only processing the operands of d16 image operations on subtargets
4424 // that use the unpacked register layout, or need to repack the TFE result.
4425
4426 // TODO: Do we need to guard against already legalized intrinsics?
4427 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4428 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4429
4430 MachineRegisterInfo *MRI = B.getMRI();
4431 const LLT S32 = LLT::scalar(32);
4432 const LLT S16 = LLT::scalar(16);
4433 const LLT V2S16 = LLT::fixed_vector(2, 16);
4434
4435 unsigned DMask = 0;
4436 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4437 LLT Ty = MRI->getType(VData);
4438
4439 // Check for 16 bit addresses and pack if true.
4440 LLT GradTy =
4441 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4442 LLT AddrTy =
4443 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4444 const bool IsG16 = GradTy == S16;
4445 const bool IsA16 = AddrTy == S16;
4446 const bool IsD16 = Ty.getScalarType() == S16;
4447
4448 int DMaskLanes = 0;
4449 if (!BaseOpcode->Atomic) {
4450 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4451 if (BaseOpcode->Gather4) {
4452 DMaskLanes = 4;
4453 } else if (DMask != 0) {
4454 DMaskLanes = countPopulation(DMask);
4455 } else if (!IsTFE && !BaseOpcode->Store) {
4456 // If dmask is 0, this is a no-op load. This can be eliminated.
4457 B.buildUndef(MI.getOperand(0));
4458 MI.eraseFromParent();
4459 return true;
4460 }
4461 }
4462
4463 Observer.changingInstr(MI);
4464 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4465
4466 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4467 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4468 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4469 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4470 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4471
4472 // Track that we legalized this
4473 MI.setDesc(B.getTII().get(NewOpcode));
4474
4475 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4476 // dmask to be at least 1, otherwise the instruction will fail.
4477 if (IsTFE && DMask == 0) {
4478 DMask = 0x1;
4479 DMaskLanes = 1;
4480 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4481 }
4482
4483 if (BaseOpcode->Atomic) {
4484 Register VData0 = MI.getOperand(2).getReg();
4485 LLT Ty = MRI->getType(VData0);
4486
4487 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4488 if (Ty.isVector())
4489 return false;
4490
4491 if (BaseOpcode->AtomicX2) {
4492 Register VData1 = MI.getOperand(3).getReg();
4493 // The two values are packed in one register.
4494 LLT PackedTy = LLT::fixed_vector(2, Ty);
4495 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4496 MI.getOperand(2).setReg(Concat.getReg(0));
4497 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4498 }
4499 }
4500
4501 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4502
4503 // Rewrite the addressing register layout before doing anything else.
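// With 16-bit addresses the components are first packed into dwords;
// otherwise the separate 32-bit components may later be merged into a single
// vector register when the NSA encoding is not used.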
4504 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4505 // 16 bit gradients are supported, but are tied to the A16 control 4506 // so both gradients and addresses must be 16 bit 4507 return false; 4508 } 4509 4510 if (IsA16 && !ST.hasA16()) { 4511 // A16 not supported 4512 return false; 4513 } 4514 4515 if (IsA16 || IsG16) { 4516 if (Intr->NumVAddrs > 1) { 4517 SmallVector<Register, 4> PackedRegs; 4518 4519 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4520 IsG16); 4521 4522 // See also below in the non-a16 branch 4523 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4524 PackedRegs.size() <= ST.getNSAMaxSize(); 4525 4526 if (!UseNSA && PackedRegs.size() > 1) { 4527 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4528 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4529 PackedRegs[0] = Concat.getReg(0); 4530 PackedRegs.resize(1); 4531 } 4532 4533 const unsigned NumPacked = PackedRegs.size(); 4534 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4535 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4536 if (!SrcOp.isReg()) { 4537 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4538 continue; 4539 } 4540 4541 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4542 4543 if (I - Intr->VAddrStart < NumPacked) 4544 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4545 else 4546 SrcOp.setReg(AMDGPU::NoRegister); 4547 } 4548 } 4549 } else { 4550 // If the register allocator cannot place the address registers contiguously 4551 // without introducing moves, then using the non-sequential address encoding 4552 // is always preferable, since it saves VALU instructions and is usually a 4553 // wash in terms of code size or even better. 4554 // 4555 // However, we currently have no way of hinting to the register allocator 4556 // that MIMG addresses should be placed contiguously when it is possible to 4557 // do so, so force non-NSA for the common 2-address case as a heuristic. 4558 // 4559 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4560 // allocation when possible. 4561 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4562 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4563 4564 if (!UseNSA && Intr->NumVAddrs > 1) 4565 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4566 Intr->NumVAddrs); 4567 } 4568 4569 int Flags = 0; 4570 if (IsA16) 4571 Flags |= 1; 4572 if (IsG16) 4573 Flags |= 2; 4574 MI.addOperand(MachineOperand::CreateImm(Flags)); 4575 4576 if (BaseOpcode->Store) { // No TFE for stores? 4577 // TODO: Handle dmask trim 4578 if (!Ty.isVector() || !IsD16) 4579 return true; 4580 4581 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4582 if (RepackedReg != VData) { 4583 MI.getOperand(1).setReg(RepackedReg); 4584 } 4585 4586 return true; 4587 } 4588 4589 Register DstReg = MI.getOperand(0).getReg(); 4590 const LLT EltTy = Ty.getScalarType(); 4591 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4592 4593 // Confirm that the return type is large enough for the dmask specified 4594 if (NumElts < DMaskLanes) 4595 return false; 4596 4597 if (NumElts > 4 || DMaskLanes > 4) 4598 return false; 4599 4600 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4601 const LLT AdjustedTy = 4602 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4603 4604 // The raw dword aligned data component of the load. 
The only legal cases 4605 // where this matters should be when using the packed D16 format, for 4606 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4607 LLT RoundedTy; 4608 4609 // S32 vector to to cover all data, plus TFE result element. 4610 LLT TFETy; 4611 4612 // Register type to use for each loaded component. Will be S32 or V2S16. 4613 LLT RegTy; 4614 4615 if (IsD16 && ST.hasUnpackedD16VMem()) { 4616 RoundedTy = 4617 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4618 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4619 RegTy = S32; 4620 } else { 4621 unsigned EltSize = EltTy.getSizeInBits(); 4622 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4623 unsigned RoundedSize = 32 * RoundedElts; 4624 RoundedTy = LLT::scalarOrVector( 4625 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4626 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4627 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4628 } 4629 4630 // The return type does not need adjustment. 4631 // TODO: Should we change s16 case to s32 or <2 x s16>? 4632 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4633 return true; 4634 4635 Register Dst1Reg; 4636 4637 // Insert after the instruction. 4638 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4639 4640 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4641 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4642 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4643 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4644 4645 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4646 4647 MI.getOperand(0).setReg(NewResultReg); 4648 4649 // In the IR, TFE is supposed to be used with a 2 element struct return 4650 // type. The instruction really returns these two values in one contiguous 4651 // register, with one additional dword beyond the loaded data. Rewrite the 4652 // return type to use a single register result. 4653 4654 if (IsTFE) { 4655 Dst1Reg = MI.getOperand(1).getReg(); 4656 if (MRI->getType(Dst1Reg) != S32) 4657 return false; 4658 4659 // TODO: Make sure the TFE operand bit is set. 4660 MI.removeOperand(1); 4661 4662 // Handle the easy case that requires no repack instructions. 4663 if (Ty == S32) { 4664 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4665 return true; 4666 } 4667 } 4668 4669 // Now figure out how to copy the new result register back into the old 4670 // result. 4671 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4672 4673 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4674 4675 if (ResultNumRegs == 1) { 4676 assert(!IsTFE); 4677 ResultRegs[0] = NewResultReg; 4678 } else { 4679 // We have to repack into a new vector of some kind. 4680 for (int I = 0; I != NumDataRegs; ++I) 4681 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4682 B.buildUnmerge(ResultRegs, NewResultReg); 4683 4684 // Drop the final TFE element to get the data part. The TFE result is 4685 // directly written to the right place already. 4686 if (IsTFE) 4687 ResultRegs.resize(NumDataRegs); 4688 } 4689 4690 // For an s16 scalar result, we form an s32 result with a truncate regardless 4691 // of packed vs. unpacked. 4692 if (IsD16 && !Ty.isVector()) { 4693 B.buildTrunc(DstReg, ResultRegs[0]); 4694 return true; 4695 } 4696 4697 // Avoid a build/concat_vector of 1 entry. 
4698 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4699 B.buildBitcast(DstReg, ResultRegs[0]); 4700 return true; 4701 } 4702 4703 assert(Ty.isVector()); 4704 4705 if (IsD16) { 4706 // For packed D16 results with TFE enabled, all the data components are 4707 // S32. Cast back to the expected type. 4708 // 4709 // TODO: We don't really need to use load s32 elements. We would only need one 4710 // cast for the TFE result if a multiple of v2s16 was used. 4711 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4712 for (Register &Reg : ResultRegs) 4713 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4714 } else if (ST.hasUnpackedD16VMem()) { 4715 for (Register &Reg : ResultRegs) 4716 Reg = B.buildTrunc(S16, Reg).getReg(0); 4717 } 4718 } 4719 4720 auto padWithUndef = [&](LLT Ty, int NumElts) { 4721 if (NumElts == 0) 4722 return; 4723 Register Undef = B.buildUndef(Ty).getReg(0); 4724 for (int I = 0; I != NumElts; ++I) 4725 ResultRegs.push_back(Undef); 4726 }; 4727 4728 // Pad out any elements eliminated due to the dmask. 4729 LLT ResTy = MRI->getType(ResultRegs[0]); 4730 if (!ResTy.isVector()) { 4731 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4732 B.buildBuildVector(DstReg, ResultRegs); 4733 return true; 4734 } 4735 4736 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4737 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4738 4739 // Deal with the one annoying legal case. 4740 const LLT V3S16 = LLT::fixed_vector(3, 16); 4741 if (Ty == V3S16) { 4742 if (IsTFE) { 4743 if (ResultRegs.size() == 1) { 4744 NewResultReg = ResultRegs[0]; 4745 } else if (ResultRegs.size() == 2) { 4746 LLT V4S16 = LLT::fixed_vector(4, 16); 4747 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4748 } else { 4749 return false; 4750 } 4751 } 4752 4753 if (MRI->getType(DstReg).getNumElements() < 4754 MRI->getType(NewResultReg).getNumElements()) { 4755 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4756 } else { 4757 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4758 } 4759 return true; 4760 } 4761 4762 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4763 B.buildConcatVectors(DstReg, ResultRegs); 4764 return true; 4765 } 4766 4767 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4768 LegalizerHelper &Helper, MachineInstr &MI) const { 4769 MachineIRBuilder &B = Helper.MIRBuilder; 4770 GISelChangeObserver &Observer = Helper.Observer; 4771 4772 Register Dst = MI.getOperand(0).getReg(); 4773 LLT Ty = B.getMRI()->getType(Dst); 4774 unsigned Size = Ty.getSizeInBits(); 4775 MachineFunction &MF = B.getMF(); 4776 4777 Observer.changingInstr(MI); 4778 4779 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4780 Ty = getBitcastRegisterType(Ty); 4781 Helper.bitcastDst(MI, Ty, 0); 4782 Dst = MI.getOperand(0).getReg(); 4783 B.setInsertPt(B.getMBB(), MI); 4784 } 4785 4786 // FIXME: We don't really need this intermediate instruction. The intrinsic 4787 // should be fixed to have a memory operand. Since it's readnone, we're not 4788 // allowed to add one. 4789 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4790 MI.removeOperand(1); // Remove intrinsic ID 4791 4792 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4793 // TODO: Should this use datalayout alignment? 
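// For example, a <3 x s32> result gets a 12-byte, 4-byte aligned MMO here
// and is widened to a 128-bit type below, since 96 is not a power of two.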
4794 const unsigned MemSize = (Size + 7) / 8; 4795 const Align MemAlign(4); 4796 MachineMemOperand *MMO = MF.getMachineMemOperand( 4797 MachinePointerInfo(), 4798 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4799 MachineMemOperand::MOInvariant, 4800 MemSize, MemAlign); 4801 MI.addMemOperand(MF, MMO); 4802 4803 // There are no 96-bit result scalar loads, but widening to 128-bit should 4804 // always be legal. We may need to restore this to a 96-bit result if it turns 4805 // out this needs to be converted to a vector load during RegBankSelect. 4806 if (!isPowerOf2_32(Size)) { 4807 if (Ty.isVector()) 4808 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4809 else 4810 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4811 } 4812 4813 Observer.changedInstr(MI); 4814 return true; 4815 } 4816 4817 // TODO: Move to selection 4818 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4819 MachineRegisterInfo &MRI, 4820 MachineIRBuilder &B) const { 4821 if (!ST.isTrapHandlerEnabled() || 4822 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4823 return legalizeTrapEndpgm(MI, MRI, B); 4824 4825 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4826 switch (*HsaAbiVer) { 4827 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4828 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4829 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4830 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4831 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4832 return ST.supportsGetDoorbellID() ? 4833 legalizeTrapHsa(MI, MRI, B) : 4834 legalizeTrapHsaQueuePtr(MI, MRI, B); 4835 } 4836 } 4837 4838 llvm_unreachable("Unknown trap handler"); 4839 } 4840 4841 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4842 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4843 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4844 MI.eraseFromParent(); 4845 return true; 4846 } 4847 4848 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4849 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4850 MachineFunction &MF = B.getMF(); 4851 const LLT S64 = LLT::scalar(64); 4852 4853 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4854 // For code object version 5, queue_ptr is passed through implicit kernarg. 4855 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 4856 AMDGPUTargetLowering::ImplicitParameter Param = 4857 AMDGPUTargetLowering::QUEUE_PTR; 4858 uint64_t Offset = 4859 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 4860 4861 Register KernargPtrReg = MRI.createGenericVirtualRegister( 4862 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4863 4864 if (!loadInputValue(KernargPtrReg, B, 4865 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4866 return false; 4867 4868 // TODO: can we be smarter about machine pointer info? 
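// Load the 64-bit queue_ptr from the implicit kernarg segment at the
// computed offset and pass it to the trap handler in SGPR0_SGPR1.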
4869 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4870 MachineMemOperand *MMO = MF.getMachineMemOperand( 4871 PtrInfo, 4872 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4873 MachineMemOperand::MOInvariant, 4874 LLT::scalar(64), commonAlignment(Align(64), Offset)); 4875 4876 // Pointer address 4877 Register LoadAddr = MRI.createGenericVirtualRegister( 4878 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4879 B.buildPtrAdd(LoadAddr, KernargPtrReg, 4880 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 4881 // Load address 4882 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 4883 B.buildCopy(SGPR01, Temp); 4884 B.buildInstr(AMDGPU::S_TRAP) 4885 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4886 .addReg(SGPR01, RegState::Implicit); 4887 MI.eraseFromParent(); 4888 return true; 4889 } 4890 4891 // Pass queue pointer to trap handler as input, and insert trap instruction 4892 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4893 Register LiveIn = 4894 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4895 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4896 return false; 4897 4898 B.buildCopy(SGPR01, LiveIn); 4899 B.buildInstr(AMDGPU::S_TRAP) 4900 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4901 .addReg(SGPR01, RegState::Implicit); 4902 4903 MI.eraseFromParent(); 4904 return true; 4905 } 4906 4907 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4908 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4909 B.buildInstr(AMDGPU::S_TRAP) 4910 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4911 MI.eraseFromParent(); 4912 return true; 4913 } 4914 4915 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4916 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4917 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4918 // accordingly 4919 if (!ST.isTrapHandlerEnabled() || 4920 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4921 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4922 "debugtrap handler not supported", 4923 MI.getDebugLoc(), DS_Warning); 4924 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4925 Ctx.diagnose(NoTrap); 4926 } else { 4927 // Insert debug-trap instruction 4928 B.buildInstr(AMDGPU::S_TRAP) 4929 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4930 } 4931 4932 MI.eraseFromParent(); 4933 return true; 4934 } 4935 4936 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4937 MachineIRBuilder &B) const { 4938 MachineRegisterInfo &MRI = *B.getMRI(); 4939 const LLT S16 = LLT::scalar(16); 4940 const LLT S32 = LLT::scalar(32); 4941 4942 Register DstReg = MI.getOperand(0).getReg(); 4943 Register NodePtr = MI.getOperand(2).getReg(); 4944 Register RayExtent = MI.getOperand(3).getReg(); 4945 Register RayOrigin = MI.getOperand(4).getReg(); 4946 Register RayDir = MI.getOperand(5).getReg(); 4947 Register RayInvDir = MI.getOperand(6).getReg(); 4948 Register TDescr = MI.getOperand(7).getReg(); 4949 4950 if (!ST.hasGFX10_AEncoding()) { 4951 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4952 "intrinsic not supported on subtarget", 4953 MI.getDebugLoc()); 4954 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4955 return false; 4956 } 4957 4958 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4959 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4960 const unsigned NumVDataDwords = 4; 4961 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4962 const bool UseNSA = 4963 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4964 const unsigned BaseOpcodes[2][2] = { 4965 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4966 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4967 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4968 int Opcode; 4969 if (UseNSA) { 4970 Opcode = 4971 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4972 NumVDataDwords, NumVAddrDwords); 4973 } else { 4974 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4975 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4976 PowerOf2Ceil(NumVAddrDwords)); 4977 } 4978 assert(Opcode != -1); 4979 4980 SmallVector<Register, 12> Ops; 4981 if (Is64) { 4982 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4983 Ops.push_back(Unmerge.getReg(0)); 4984 Ops.push_back(Unmerge.getReg(1)); 4985 } else { 4986 Ops.push_back(NodePtr); 4987 } 4988 Ops.push_back(RayExtent); 4989 4990 auto packLanes = [&Ops, &S32, &B](Register Src) { 4991 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4992 Ops.push_back(Unmerge.getReg(0)); 4993 Ops.push_back(Unmerge.getReg(1)); 4994 Ops.push_back(Unmerge.getReg(2)); 4995 }; 4996 4997 packLanes(RayOrigin); 4998 if (IsA16) { 4999 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 5000 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5001 Register R1 = MRI.createGenericVirtualRegister(S32); 5002 Register R2 = MRI.createGenericVirtualRegister(S32); 5003 Register R3 = MRI.createGenericVirtualRegister(S32); 5004 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 5005 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 5006 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 5007 Ops.push_back(R1); 5008 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

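// Map the rounding-mode operand of llvm.fptrunc.round onto the dedicated
// round-upward / round-downward pseudos; other rounding modes are rejected.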
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
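  // Workitem IDs fold to zero when the maximum ID in that dimension is known
  // to be zero; otherwise they lower to reads of the preloaded inputs.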
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
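  // Buffer atomics (raw and struct addressing, integer and FP min/max forms)
  // all funnel into a common lowering.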
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}