1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 #include "llvm/IR/IntrinsicsR600.h" 30 31 #define DEBUG_TYPE "amdgpu-legalinfo" 32 33 using namespace llvm; 34 using namespace LegalizeActions; 35 using namespace LegalizeMutations; 36 using namespace LegalityPredicates; 37 using namespace MIPatternMatch; 38 39 // Hack until load/store selection patterns support any tuple of legal types. 40 static cl::opt<bool> EnableNewLegality( 41 "amdgpu-global-isel-new-legality", 42 cl::desc("Use GlobalISel desired legality, rather than try to use" 43 "rules compatible with selection patterns"), 44 cl::init(false), 45 cl::ReallyHidden); 46 47 static constexpr unsigned MaxRegisterSize = 1024; 48 49 // Round the number of elements to the next power of two elements 50 static LLT getPow2VectorType(LLT Ty) { 51 unsigned NElts = Ty.getNumElements(); 52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 54 } 55 56 // Round the number of bits to the next power of two bits 57 static LLT getPow2ScalarType(LLT Ty) { 58 unsigned Bits = Ty.getSizeInBits(); 59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 60 return LLT::scalar(Pow2Bits); 61 } 62 63 /// \returns true if this is an odd sized vector which should widen by adding an 64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 65 /// excludes s1 vectors, which should always be scalarized. 
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, 100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 101 }; 102 } 103 104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 105 return [=](const LegalityQuery &Query) { 106 const LLT Ty = Query.Types[TypeIdx]; 107 const LLT EltTy = Ty.getElementType(); 108 unsigned Size = Ty.getSizeInBits(); 109 unsigned Pieces = (Size + 63) / 64; 110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 111 return std::make_pair( 112 TypeIdx, 113 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 114 }; 115 } 116 117 // Increase the number of vector elements to reach the next multiple of 32-bit 118 // type. 119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 120 return [=](const LegalityQuery &Query) { 121 const LLT Ty = Query.Types[TypeIdx]; 122 123 const LLT EltTy = Ty.getElementType(); 124 const int Size = Ty.getSizeInBits(); 125 const int EltSize = EltTy.getSizeInBits(); 126 const int NextMul32 = (Size + 31) / 32; 127 128 assert(EltSize < 32); 129 130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 131 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 132 }; 133 } 134 135 static LLT getBitcastRegisterType(const LLT Ty) { 136 const unsigned Size = Ty.getSizeInBits(); 137 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT QueryTy = 
Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const LLT MemTy) { 365 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366 const unsigned Size = Ty.getSizeInBits(); 367 if (Size != MemSizeInBits) 368 return Size <= 32 && Ty.isVector(); 369 370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371 return true; 372 373 // Don't try to handle bitcasting vector ext loads for now. 
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375 (Size <= 32 || isRegisterSize(Size)) && 376 !isRegisterVectorElementType(Ty.getElementType()); 377 } 378 379 /// Return true if we should legalize a load by widening an odd sized memory 380 /// access up to the alignment. Note this case when the memory access itself 381 /// changes, not the size of the result register. 382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 383 uint64_t AlignInBits, unsigned AddrSpace, 384 unsigned Opcode) { 385 unsigned SizeInBits = MemoryTy.getSizeInBits(); 386 // We don't want to widen cases that are naturally legal. 387 if (isPowerOf2_32(SizeInBits)) 388 return false; 389 390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 391 // end up widening these for a scalar load during RegBankSelect, since there 392 // aren't 96-bit scalar loads. 393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394 return false; 395 396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397 return false; 398 399 // A load is known dereferenceable up to the alignment, so it's legal to widen 400 // to it. 401 // 402 // TODO: Could check dereferenceable for less aligned cases. 403 unsigned RoundedSize = NextPowerOf2(SizeInBits); 404 if (AlignInBits < RoundedSize) 405 return false; 406 407 // Do not widen if it would introduce a slow unaligned load. 408 const SITargetLowering *TLI = ST.getTargetLowering(); 409 bool Fast = false; 410 return TLI->allowsMisalignedMemoryAccessesImpl( 411 RoundedSize, AddrSpace, Align(AlignInBits / 8), 412 MachineMemOperand::MOLoad, &Fast) && 413 Fast; 414 } 415 416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417 unsigned Opcode) { 418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419 return false; 420 421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422 Query.MMODescrs[0].AlignInBits, 423 Query.Types[1].getAddressSpace(), Opcode); 424 } 425 426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 427 const GCNTargetMachine &TM) 428 : ST(ST_) { 429 using namespace TargetOpcode; 430 431 auto GetAddrSpacePtr = [&TM](unsigned AS) { 432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 433 }; 434 435 const LLT S1 = LLT::scalar(1); 436 const LLT S8 = LLT::scalar(8); 437 const LLT S16 = LLT::scalar(16); 438 const LLT S32 = LLT::scalar(32); 439 const LLT S64 = LLT::scalar(64); 440 const LLT S128 = LLT::scalar(128); 441 const LLT S256 = LLT::scalar(256); 442 const LLT S512 = LLT::scalar(512); 443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 444 445 const LLT V2S8 = LLT::fixed_vector(2, 8); 446 const LLT V2S16 = LLT::fixed_vector(2, 16); 447 const LLT V4S16 = LLT::fixed_vector(4, 16); 448 449 const LLT V2S32 = LLT::fixed_vector(2, 32); 450 const LLT V3S32 = LLT::fixed_vector(3, 32); 451 const LLT V4S32 = LLT::fixed_vector(4, 32); 452 const LLT V5S32 = LLT::fixed_vector(5, 32); 453 const LLT V6S32 = LLT::fixed_vector(6, 32); 454 const LLT V7S32 = LLT::fixed_vector(7, 32); 455 const LLT V8S32 = LLT::fixed_vector(8, 32); 456 const LLT V9S32 = LLT::fixed_vector(9, 32); 457 const LLT V10S32 = LLT::fixed_vector(10, 32); 458 const LLT V11S32 = LLT::fixed_vector(11, 32); 459 const LLT V12S32 = LLT::fixed_vector(12, 32); 460 const LLT V13S32 = LLT::fixed_vector(13, 32); 461 const LLT V14S32 = LLT::fixed_vector(14, 32); 462 const LLT V15S32 = LLT::fixed_vector(15, 32); 463 const LLT V16S32 = LLT::fixed_vector(16, 32); 464 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 465 466 const LLT V2S64 = LLT::fixed_vector(2, 64); 467 const LLT V3S64 = LLT::fixed_vector(3, 64); 468 const LLT V4S64 = LLT::fixed_vector(4, 64); 469 const LLT V5S64 = LLT::fixed_vector(5, 64); 470 const LLT V6S64 = LLT::fixed_vector(6, 64); 471 const LLT V7S64 = LLT::fixed_vector(7, 64); 472 const LLT V8S64 = LLT::fixed_vector(8, 64); 473 const LLT V16S64 = LLT::fixed_vector(16, 64); 474 475 std::initializer_list<LLT> AllS32Vectors = 476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 478 std::initializer_list<LLT> AllS64Vectors = 479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 480 481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 488 489 const LLT CodePtr = FlatPtr; 490 491 const std::initializer_list<LLT> AddrSpaces64 = { 492 GlobalPtr, ConstantPtr, FlatPtr 493 }; 494 495 const std::initializer_list<LLT> AddrSpaces32 = { 496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 497 }; 498 499 const std::initializer_list<LLT> FPTypesBase = { 500 S32, S64 501 }; 502 503 const std::initializer_list<LLT> FPTypes16 = { 504 S32, S64, S16 505 }; 506 507 const std::initializer_list<LLT> FPTypesPK16 = { 508 S32, S64, S16, V2S16 509 }; 510 511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 512 513 // s1 for VCC branches, s32 for SCC branches. 514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 515 516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 517 // elements for v3s16 518 getActionDefinitionsBuilder(G_PHI) 519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 520 .legalFor(AllS32Vectors) 521 .legalFor(AllS64Vectors) 522 .legalFor(AddrSpaces64) 523 .legalFor(AddrSpaces32) 524 .legalIf(isPointer(0)) 525 .clampScalar(0, S16, S256) 526 .widenScalarToNextPow2(0, 32) 527 .clampMaxNumElements(0, S32, 16) 528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529 .scalarize(0); 530 531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532 // Full set of gfx9 features. 
533 getActionDefinitionsBuilder({G_ADD, G_SUB}) 534 .legalFor({S32, S16, V2S16}) 535 .clampMaxNumElementsStrict(0, S16, 2) 536 .scalarize(0) 537 .minScalar(0, S16) 538 .widenScalarToNextMultipleOf(0, 32) 539 .maxScalar(0, S32); 540 541 getActionDefinitionsBuilder(G_MUL) 542 .legalFor({S32, S16, V2S16}) 543 .clampMaxNumElementsStrict(0, S16, 2) 544 .scalarize(0) 545 .minScalar(0, S16) 546 .widenScalarToNextMultipleOf(0, 32) 547 .custom(); 548 assert(ST.hasMad64_32()); 549 550 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 551 .legalFor({S32, S16, V2S16}) // Clamp modifier 552 .minScalarOrElt(0, S16) 553 .clampMaxNumElementsStrict(0, S16, 2) 554 .scalarize(0) 555 .widenScalarToNextPow2(0, 32) 556 .lower(); 557 } else if (ST.has16BitInsts()) { 558 getActionDefinitionsBuilder({G_ADD, G_SUB}) 559 .legalFor({S32, S16}) 560 .minScalar(0, S16) 561 .widenScalarToNextMultipleOf(0, 32) 562 .maxScalar(0, S32) 563 .scalarize(0); 564 565 getActionDefinitionsBuilder(G_MUL) 566 .legalFor({S32, S16}) 567 .scalarize(0) 568 .minScalar(0, S16) 569 .widenScalarToNextMultipleOf(0, 32) 570 .custom(); 571 assert(ST.hasMad64_32()); 572 573 // Technically the saturating operations require clamp bit support, but this 574 // was introduced at the same time as 16-bit operations. 575 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 576 .legalFor({S32, S16}) // Clamp modifier 577 .minScalar(0, S16) 578 .scalarize(0) 579 .widenScalarToNextPow2(0, 16) 580 .lower(); 581 582 // We're just lowering this, but it helps get a better result to try to 583 // coerce to the desired type first. 584 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 585 .minScalar(0, S16) 586 .scalarize(0) 587 .lower(); 588 } else { 589 getActionDefinitionsBuilder({G_ADD, G_SUB}) 590 .legalFor({S32}) 591 .widenScalarToNextMultipleOf(0, 32) 592 .clampScalar(0, S32, S32) 593 .scalarize(0); 594 595 auto &Mul = getActionDefinitionsBuilder(G_MUL) 596 .legalFor({S32}) 597 .scalarize(0) 598 .minScalar(0, S32) 599 .widenScalarToNextMultipleOf(0, 32); 600 601 if (ST.hasMad64_32()) 602 Mul.custom(); 603 else 604 Mul.maxScalar(0, S32); 605 606 if (ST.hasIntClamp()) { 607 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 608 .legalFor({S32}) // Clamp modifier. 609 .scalarize(0) 610 .minScalarOrElt(0, S32) 611 .lower(); 612 } else { 613 // Clamp bit support was added in VI, along with 16-bit operations. 614 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 615 .minScalar(0, S32) 616 .scalarize(0) 617 .lower(); 618 } 619 620 // FIXME: DAG expansion gets better results. The widening uses the smaller 621 // range values and goes for the min/max lowering directly. 622 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 623 .minScalar(0, S32) 624 .scalarize(0) 625 .lower(); 626 } 627 628 getActionDefinitionsBuilder( 629 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 630 .customFor({S32, S64}) 631 .clampScalar(0, S32, S64) 632 .widenScalarToNextPow2(0, 32) 633 .scalarize(0); 634 635 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 636 .legalFor({S32}) 637 .maxScalar(0, S32); 638 639 if (ST.hasVOP3PInsts()) { 640 Mulh 641 .clampMaxNumElements(0, S8, 2) 642 .lowerFor({V2S8}); 643 } 644 645 Mulh 646 .scalarize(0) 647 .lower(); 648 649 // Report legal for any types we can handle anywhere. For the cases only legal 650 // on the SALU, RegBankSelect will be able to re-legalize. 
651 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 652 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 653 .clampScalar(0, S32, S64) 654 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 655 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 656 .widenScalarToNextPow2(0) 657 .scalarize(0); 658 659 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 660 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 661 .legalFor({{S32, S1}, {S32, S32}}) 662 .minScalar(0, S32) 663 .scalarize(0) 664 .lower(); 665 666 getActionDefinitionsBuilder(G_BITCAST) 667 // Don't worry about the size constraint. 668 .legalIf(all(isRegisterType(0), isRegisterType(1))) 669 .lower(); 670 671 672 getActionDefinitionsBuilder(G_CONSTANT) 673 .legalFor({S1, S32, S64, S16, GlobalPtr, 674 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 675 .legalIf(isPointer(0)) 676 .clampScalar(0, S32, S64) 677 .widenScalarToNextPow2(0); 678 679 getActionDefinitionsBuilder(G_FCONSTANT) 680 .legalFor({S32, S64, S16}) 681 .clampScalar(0, S16, S64); 682 683 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 684 .legalIf(isRegisterType(0)) 685 // s1 and s16 are special cases because they have legal operations on 686 // them, but don't really occupy registers in the normal way. 687 .legalFor({S1, S16}) 688 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 689 .clampScalarOrElt(0, S32, MaxScalar) 690 .widenScalarToNextPow2(0, 32) 691 .clampMaxNumElements(0, S32, 16); 692 693 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 694 695 // If the amount is divergent, we have to do a wave reduction to get the 696 // maximum value, so this is expanded during RegBankSelect. 697 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 698 .legalFor({{PrivatePtr, S32}}); 699 700 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 701 .customIf(typeIsNot(0, PrivatePtr)); 702 703 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 704 705 auto &FPOpActions = getActionDefinitionsBuilder( 706 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 707 .legalFor({S32, S64}); 708 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 709 .customFor({S32, S64}); 710 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 711 .customFor({S32, S64}); 712 713 if (ST.has16BitInsts()) { 714 if (ST.hasVOP3PInsts()) 715 FPOpActions.legalFor({S16, V2S16}); 716 else 717 FPOpActions.legalFor({S16}); 718 719 TrigActions.customFor({S16}); 720 FDIVActions.customFor({S16}); 721 } 722 723 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 724 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 725 726 if (ST.hasVOP3PInsts()) { 727 MinNumMaxNum.customFor(FPTypesPK16) 728 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 729 .clampMaxNumElements(0, S16, 2) 730 .clampScalar(0, S16, S64) 731 .scalarize(0); 732 } else if (ST.has16BitInsts()) { 733 MinNumMaxNum.customFor(FPTypes16) 734 .clampScalar(0, S16, S64) 735 .scalarize(0); 736 } else { 737 MinNumMaxNum.customFor(FPTypesBase) 738 .clampScalar(0, S32, S64) 739 .scalarize(0); 740 } 741 742 if (ST.hasVOP3PInsts()) 743 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 744 745 FPOpActions 746 .scalarize(0) 747 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 748 749 TrigActions 750 .scalarize(0) 751 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 752 753 FDIVActions 754 .scalarize(0) 755 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 756 757 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 758 .legalFor(FPTypesPK16) 759 .clampMaxNumElementsStrict(0, S16, 2) 760 .scalarize(0) 761 .clampScalar(0, S16, S64); 762 763 if (ST.has16BitInsts()) { 764 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 765 .legalFor({S32, S64, S16}) 766 .scalarize(0) 767 .clampScalar(0, S16, S64); 768 } else { 769 getActionDefinitionsBuilder(G_FSQRT) 770 .legalFor({S32, S64}) 771 .scalarize(0) 772 .clampScalar(0, S32, S64); 773 774 if (ST.hasFractBug()) { 775 getActionDefinitionsBuilder(G_FFLOOR) 776 .customFor({S64}) 777 .legalFor({S32, S64}) 778 .scalarize(0) 779 .clampScalar(0, S32, S64); 780 } else { 781 getActionDefinitionsBuilder(G_FFLOOR) 782 .legalFor({S32, S64}) 783 .scalarize(0) 784 .clampScalar(0, S32, S64); 785 } 786 } 787 788 getActionDefinitionsBuilder(G_FPTRUNC) 789 .legalFor({{S32, S64}, {S16, S32}}) 790 .scalarize(0) 791 .lower(); 792 793 getActionDefinitionsBuilder(G_FPEXT) 794 .legalFor({{S64, S32}, {S32, S16}}) 795 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 796 .scalarize(0); 797 798 getActionDefinitionsBuilder(G_FSUB) 799 // Use actual fsub instruction 800 .legalFor({S32}) 801 // Must use fadd + fneg 802 .lowerFor({S64, S16, V2S16}) 803 .scalarize(0) 804 .clampScalar(0, S32, S64); 805 806 // Whether this is legal depends on the floating point mode for the function. 807 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 808 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 809 FMad.customFor({S32, S16}); 810 else if (ST.hasMadMacF32Insts()) 811 FMad.customFor({S32}); 812 else if (ST.hasMadF16()) 813 FMad.customFor({S16}); 814 FMad.scalarize(0) 815 .lower(); 816 817 auto &FRem = getActionDefinitionsBuilder(G_FREM); 818 if (ST.has16BitInsts()) { 819 FRem.customFor({S16, S32, S64}); 820 } else { 821 FRem.minScalar(0, S32) 822 .customFor({S32, S64}); 823 } 824 FRem.scalarize(0); 825 826 // TODO: Do we need to clamp maximum bitwidth? 827 getActionDefinitionsBuilder(G_TRUNC) 828 .legalIf(isScalar(0)) 829 .legalFor({{V2S16, V2S32}}) 830 .clampMaxNumElements(0, S16, 2) 831 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 832 // situations (like an invalid implicit use), we don't want to infinite loop 833 // in the legalizer. 834 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 835 .alwaysLegal(); 836 837 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 838 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 839 {S32, S1}, {S64, S1}, {S16, S1}}) 840 .scalarize(0) 841 .clampScalar(0, S32, S64) 842 .widenScalarToNextPow2(1, 32); 843 844 // TODO: Split s1->s64 during regbankselect for VALU. 
845 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 846 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 847 .lowerIf(typeIs(1, S1)) 848 .customFor({{S32, S64}, {S64, S64}}); 849 if (ST.has16BitInsts()) 850 IToFP.legalFor({{S16, S16}}); 851 IToFP.clampScalar(1, S32, S64) 852 .minScalar(0, S32) 853 .scalarize(0) 854 .widenScalarToNextPow2(1); 855 856 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 857 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 858 .customFor({{S64, S32}, {S64, S64}}) 859 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 860 if (ST.has16BitInsts()) 861 FPToI.legalFor({{S16, S16}}); 862 else 863 FPToI.minScalar(1, S32); 864 865 FPToI.minScalar(0, S32) 866 .widenScalarToNextPow2(0, 32) 867 .scalarize(0) 868 .lower(); 869 870 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 871 .customFor({S16, S32}) 872 .scalarize(0) 873 .lower(); 874 875 // Lower roundeven into G_FRINT 876 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 877 .scalarize(0) 878 .lower(); 879 880 if (ST.has16BitInsts()) { 881 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 882 .legalFor({S16, S32, S64}) 883 .clampScalar(0, S16, S64) 884 .scalarize(0); 885 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 886 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 887 .legalFor({S32, S64}) 888 .clampScalar(0, S32, S64) 889 .scalarize(0); 890 } else { 891 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 892 .legalFor({S32}) 893 .customFor({S64}) 894 .clampScalar(0, S32, S64) 895 .scalarize(0); 896 } 897 898 getActionDefinitionsBuilder(G_PTR_ADD) 899 .legalIf(all(isPointer(0), sameSize(0, 1))) 900 .scalarize(0) 901 .scalarSameSizeAs(1, 0); 902 903 getActionDefinitionsBuilder(G_PTRMASK) 904 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 905 .scalarSameSizeAs(1, 0) 906 .scalarize(0); 907 908 auto &CmpBuilder = 909 getActionDefinitionsBuilder(G_ICMP) 910 // The compare output type differs based on the register bank of the output, 911 // so make both s1 and s32 legal. 912 // 913 // Scalar compares producing output in scc will be promoted to s32, as that 914 // is the allocatable register type that will be needed for the copy from 915 // scc. This will be promoted during RegBankSelect, and we assume something 916 // before that won't try to use s32 result types. 917 // 918 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 919 // bank. 920 .legalForCartesianProduct( 921 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 922 .legalForCartesianProduct( 923 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 924 if (ST.has16BitInsts()) { 925 CmpBuilder.legalFor({{S1, S16}}); 926 } 927 928 CmpBuilder 929 .widenScalarToNextPow2(1) 930 .clampScalar(1, S32, S64) 931 .scalarize(0) 932 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 933 934 getActionDefinitionsBuilder(G_FCMP) 935 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 936 .widenScalarToNextPow2(1) 937 .clampScalar(1, S32, S64) 938 .scalarize(0); 939 940 // FIXME: fpow has a selection pattern that should move to custom lowering. 
941 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 942 if (ST.has16BitInsts()) 943 Exp2Ops.legalFor({S32, S16}); 944 else 945 Exp2Ops.legalFor({S32}); 946 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 947 Exp2Ops.scalarize(0); 948 949 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 950 if (ST.has16BitInsts()) 951 ExpOps.customFor({{S32}, {S16}}); 952 else 953 ExpOps.customFor({S32}); 954 ExpOps.clampScalar(0, MinScalarFPTy, S32) 955 .scalarize(0); 956 957 getActionDefinitionsBuilder(G_FPOWI) 958 .clampScalar(0, MinScalarFPTy, S32) 959 .lower(); 960 961 // The 64-bit versions produce 32-bit results, but only on the SALU. 962 getActionDefinitionsBuilder(G_CTPOP) 963 .legalFor({{S32, S32}, {S32, S64}}) 964 .clampScalar(0, S32, S32) 965 .widenScalarToNextPow2(1, 32) 966 .clampScalar(1, S32, S64) 967 .scalarize(0) 968 .widenScalarToNextPow2(0, 32); 969 970 971 // The hardware instructions return a different result on 0 than the generic 972 // instructions expect. The hardware produces -1, but these produce the 973 // bitwidth. 974 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 975 .scalarize(0) 976 .clampScalar(0, S32, S32) 977 .clampScalar(1, S32, S64) 978 .widenScalarToNextPow2(0, 32) 979 .widenScalarToNextPow2(1, 32) 980 .custom(); 981 982 // The 64-bit versions produce 32-bit results, but only on the SALU. 983 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 984 .legalFor({{S32, S32}, {S32, S64}}) 985 .clampScalar(0, S32, S32) 986 .clampScalar(1, S32, S64) 987 .scalarize(0) 988 .widenScalarToNextPow2(0, 32) 989 .widenScalarToNextPow2(1, 32); 990 991 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 992 // RegBankSelect. 993 getActionDefinitionsBuilder(G_BITREVERSE) 994 .legalFor({S32, S64}) 995 .clampScalar(0, S32, S64) 996 .scalarize(0) 997 .widenScalarToNextPow2(0); 998 999 if (ST.has16BitInsts()) { 1000 getActionDefinitionsBuilder(G_BSWAP) 1001 .legalFor({S16, S32, V2S16}) 1002 .clampMaxNumElementsStrict(0, S16, 2) 1003 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1004 // narrowScalar limitation. 1005 .widenScalarToNextPow2(0) 1006 .clampScalar(0, S16, S32) 1007 .scalarize(0); 1008 1009 if (ST.hasVOP3PInsts()) { 1010 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1011 .legalFor({S32, S16, V2S16}) 1012 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1013 .clampMaxNumElements(0, S16, 2) 1014 .minScalar(0, S16) 1015 .widenScalarToNextPow2(0) 1016 .scalarize(0) 1017 .lower(); 1018 } else { 1019 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1020 .legalFor({S32, S16}) 1021 .widenScalarToNextPow2(0) 1022 .minScalar(0, S16) 1023 .scalarize(0) 1024 .lower(); 1025 } 1026 } else { 1027 // TODO: Should have same legality without v_perm_b32 1028 getActionDefinitionsBuilder(G_BSWAP) 1029 .legalFor({S32}) 1030 .lowerIf(scalarNarrowerThan(0, 32)) 1031 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1032 // narrowScalar limitation. 
1033 .widenScalarToNextPow2(0) 1034 .maxScalar(0, S32) 1035 .scalarize(0) 1036 .lower(); 1037 1038 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1039 .legalFor({S32}) 1040 .minScalar(0, S32) 1041 .widenScalarToNextPow2(0) 1042 .scalarize(0) 1043 .lower(); 1044 } 1045 1046 getActionDefinitionsBuilder(G_INTTOPTR) 1047 // List the common cases 1048 .legalForCartesianProduct(AddrSpaces64, {S64}) 1049 .legalForCartesianProduct(AddrSpaces32, {S32}) 1050 .scalarize(0) 1051 // Accept any address space as long as the size matches 1052 .legalIf(sameSize(0, 1)) 1053 .widenScalarIf(smallerThan(1, 0), 1054 [](const LegalityQuery &Query) { 1055 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1056 }) 1057 .narrowScalarIf(largerThan(1, 0), 1058 [](const LegalityQuery &Query) { 1059 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1060 }); 1061 1062 getActionDefinitionsBuilder(G_PTRTOINT) 1063 // List the common cases 1064 .legalForCartesianProduct(AddrSpaces64, {S64}) 1065 .legalForCartesianProduct(AddrSpaces32, {S32}) 1066 .scalarize(0) 1067 // Accept any address space as long as the size matches 1068 .legalIf(sameSize(0, 1)) 1069 .widenScalarIf(smallerThan(0, 1), 1070 [](const LegalityQuery &Query) { 1071 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1072 }) 1073 .narrowScalarIf( 1074 largerThan(0, 1), 1075 [](const LegalityQuery &Query) { 1076 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1077 }); 1078 1079 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1080 .scalarize(0) 1081 .custom(); 1082 1083 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1084 bool IsLoad) -> bool { 1085 const LLT DstTy = Query.Types[0]; 1086 1087 // Split vector extloads. 1088 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1089 1090 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1091 return true; 1092 1093 const LLT PtrTy = Query.Types[1]; 1094 unsigned AS = PtrTy.getAddressSpace(); 1095 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1096 return true; 1097 1098 // Catch weird sized loads that don't evenly divide into the access sizes 1099 // TODO: May be able to widen depending on alignment etc. 1100 unsigned NumRegs = (MemSize + 31) / 32; 1101 if (NumRegs == 3) { 1102 if (!ST.hasDwordx3LoadStores()) 1103 return true; 1104 } else { 1105 // If the alignment allows, these should have been widened. 1106 if (!isPowerOf2_32(NumRegs)) 1107 return true; 1108 } 1109 1110 return false; 1111 }; 1112 1113 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1114 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1115 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1116 1117 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1118 // LDS 1119 // TODO: Unsupported flat for SI. 1120 1121 for (unsigned Op : {G_LOAD, G_STORE}) { 1122 const bool IsStore = Op == G_STORE; 1123 1124 auto &Actions = getActionDefinitionsBuilder(Op); 1125 // Explicitly list some common cases. 1126 // TODO: Does this help compile time at all? 
1127 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1128 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1129 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1130 {S64, GlobalPtr, S64, GlobalAlign32}, 1131 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1132 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1133 {S32, GlobalPtr, S8, GlobalAlign8}, 1134 {S32, GlobalPtr, S16, GlobalAlign16}, 1135 1136 {S32, LocalPtr, S32, 32}, 1137 {S64, LocalPtr, S64, 32}, 1138 {V2S32, LocalPtr, V2S32, 32}, 1139 {S32, LocalPtr, S8, 8}, 1140 {S32, LocalPtr, S16, 16}, 1141 {V2S16, LocalPtr, S32, 32}, 1142 1143 {S32, PrivatePtr, S32, 32}, 1144 {S32, PrivatePtr, S8, 8}, 1145 {S32, PrivatePtr, S16, 16}, 1146 {V2S16, PrivatePtr, S32, 32}, 1147 1148 {S32, ConstantPtr, S32, GlobalAlign32}, 1149 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1150 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1151 {S64, ConstantPtr, S64, GlobalAlign32}, 1152 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1153 Actions.legalIf( 1154 [=](const LegalityQuery &Query) -> bool { 1155 return isLoadStoreLegal(ST, Query); 1156 }); 1157 1158 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1159 // 64-bits. 1160 // 1161 // TODO: Should generalize bitcast action into coerce, which will also cover 1162 // inserting addrspacecasts. 1163 Actions.customIf(typeIs(1, Constant32Ptr)); 1164 1165 // Turn any illegal element vectors into something easier to deal 1166 // with. These will ultimately produce 32-bit scalar shifts to extract the 1167 // parts anyway. 1168 // 1169 // For odd 16-bit element vectors, prefer to split those into pieces with 1170 // 16-bit vector parts. 1171 Actions.bitcastIf( 1172 [=](const LegalityQuery &Query) -> bool { 1173 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1174 Query.MMODescrs[0].MemoryTy); 1175 }, bitcastToRegisterType(0)); 1176 1177 if (!IsStore) { 1178 // Widen suitably aligned loads by loading extra bytes. The standard 1179 // legalization actions can't properly express widening memory operands. 1180 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1181 return shouldWidenLoad(ST, Query, G_LOAD); 1182 }); 1183 } 1184 1185 // FIXME: load/store narrowing should be moved to lower action 1186 Actions 1187 .narrowScalarIf( 1188 [=](const LegalityQuery &Query) -> bool { 1189 return !Query.Types[0].isVector() && 1190 needToSplitMemOp(Query, Op == G_LOAD); 1191 }, 1192 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1193 const LLT DstTy = Query.Types[0]; 1194 const LLT PtrTy = Query.Types[1]; 1195 1196 const unsigned DstSize = DstTy.getSizeInBits(); 1197 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1198 1199 // Split extloads. 
1200 if (DstSize > MemSize) 1201 return std::make_pair(0, LLT::scalar(MemSize)); 1202 1203 unsigned MaxSize = maxSizeForAddrSpace(ST, 1204 PtrTy.getAddressSpace(), 1205 Op == G_LOAD); 1206 if (MemSize > MaxSize) 1207 return std::make_pair(0, LLT::scalar(MaxSize)); 1208 1209 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1210 return std::make_pair(0, LLT::scalar(Align)); 1211 }) 1212 .fewerElementsIf( 1213 [=](const LegalityQuery &Query) -> bool { 1214 return Query.Types[0].isVector() && 1215 needToSplitMemOp(Query, Op == G_LOAD); 1216 }, 1217 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1218 const LLT DstTy = Query.Types[0]; 1219 const LLT PtrTy = Query.Types[1]; 1220 1221 LLT EltTy = DstTy.getElementType(); 1222 unsigned MaxSize = maxSizeForAddrSpace(ST, 1223 PtrTy.getAddressSpace(), 1224 Op == G_LOAD); 1225 1226 // FIXME: Handle widened to power of 2 results better. This ends 1227 // up scalarizing. 1228 // FIXME: 3 element stores scalarized on SI 1229 1230 // Split if it's too large for the address space. 1231 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1232 if (MemSize > MaxSize) { 1233 unsigned NumElts = DstTy.getNumElements(); 1234 unsigned EltSize = EltTy.getSizeInBits(); 1235 1236 if (MaxSize % EltSize == 0) { 1237 return std::make_pair( 1238 0, LLT::scalarOrVector( 1239 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1240 } 1241 1242 unsigned NumPieces = MemSize / MaxSize; 1243 1244 // FIXME: Refine when odd breakdowns handled 1245 // The scalars will need to be re-legalized. 1246 if (NumPieces == 1 || NumPieces >= NumElts || 1247 NumElts % NumPieces != 0) 1248 return std::make_pair(0, EltTy); 1249 1250 return std::make_pair( 1251 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1252 } 1253 1254 // FIXME: We could probably handle weird extending loads better. 1255 if (DstTy.getSizeInBits() > MemSize) 1256 return std::make_pair(0, EltTy); 1257 1258 unsigned EltSize = EltTy.getSizeInBits(); 1259 unsigned DstSize = DstTy.getSizeInBits(); 1260 if (!isPowerOf2_32(DstSize)) { 1261 // We're probably decomposing an odd sized store. Try to split 1262 // to the widest type. TODO: Account for alignment. As-is it 1263 // should be OK, since the new parts will be further legalized. 1264 unsigned FloorSize = PowerOf2Floor(DstSize); 1265 return std::make_pair( 1266 0, LLT::scalarOrVector( 1267 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1268 } 1269 1270 // May need relegalization for the scalars. 1271 return std::make_pair(0, EltTy); 1272 }) 1273 .minScalar(0, S32) 1274 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1275 .widenScalarToNextPow2(0) 1276 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1277 .lower(); 1278 } 1279 1280 // FIXME: Unaligned accesses not lowered. 1281 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1282 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1283 {S32, GlobalPtr, S16, 2 * 8}, 1284 {S32, LocalPtr, S8, 8}, 1285 {S32, LocalPtr, S16, 16}, 1286 {S32, PrivatePtr, S8, 8}, 1287 {S32, PrivatePtr, S16, 16}, 1288 {S32, ConstantPtr, S8, 8}, 1289 {S32, ConstantPtr, S16, 2 * 8}}) 1290 .legalIf( 1291 [=](const LegalityQuery &Query) -> bool { 1292 return isLoadStoreLegal(ST, Query); 1293 }); 1294 1295 if (ST.hasFlatAddressSpace()) { 1296 ExtLoads.legalForTypesWithMemDesc( 1297 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1298 } 1299 1300 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1301 // 64-bits. 
1302 // 1303 // TODO: Should generalize bitcast action into coerce, which will also cover 1304 // inserting addrspacecasts. 1305 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1306 1307 ExtLoads.clampScalar(0, S32, S32) 1308 .widenScalarToNextPow2(0) 1309 .lower(); 1310 1311 auto &Atomics = getActionDefinitionsBuilder( 1312 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1313 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1314 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1315 G_ATOMICRMW_UMIN}) 1316 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1317 {S64, GlobalPtr}, {S64, LocalPtr}, 1318 {S32, RegionPtr}, {S64, RegionPtr}}); 1319 if (ST.hasFlatAddressSpace()) { 1320 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1321 } 1322 1323 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1324 if (ST.hasLDSFPAtomicAdd()) { 1325 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1326 if (ST.hasGFX90AInsts()) 1327 Atomic.legalFor({{S64, LocalPtr}}); 1328 if (ST.hasGFX940Insts()) 1329 Atomic.legalFor({{V2S16, LocalPtr}}); 1330 } 1331 if (ST.hasAtomicFaddInsts()) 1332 Atomic.legalFor({{S32, GlobalPtr}}); 1333 1334 if (ST.hasGFX90AInsts()) { 1335 // These are legal with some caveats, and should have undergone expansion in 1336 // the IR in most situations 1337 // TODO: Move atomic expansion into legalizer 1338 // TODO: Also supports <2 x f16> 1339 Atomic.legalFor({ 1340 {S32, GlobalPtr}, 1341 {S64, GlobalPtr}, 1342 {S64, FlatPtr} 1343 }); 1344 } 1345 1346 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1347 // demarshalling 1348 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1349 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1350 {S32, FlatPtr}, {S64, FlatPtr}}) 1351 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1352 {S32, RegionPtr}, {S64, RegionPtr}}); 1353 // TODO: Pointer types, any 32-bit or 64-bit vector 1354 1355 // Condition should be s32 for scalar, s1 for vector. 1356 getActionDefinitionsBuilder(G_SELECT) 1357 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1358 LocalPtr, FlatPtr, PrivatePtr, 1359 LLT::fixed_vector(2, LocalPtr), 1360 LLT::fixed_vector(2, PrivatePtr)}, 1361 {S1, S32}) 1362 .clampScalar(0, S16, S64) 1363 .scalarize(1) 1364 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1365 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1366 .clampMaxNumElements(0, S32, 2) 1367 .clampMaxNumElements(0, LocalPtr, 2) 1368 .clampMaxNumElements(0, PrivatePtr, 2) 1369 .scalarize(0) 1370 .widenScalarToNextPow2(0) 1371 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1372 1373 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1374 // be more flexible with the shift amount type. 1375 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1376 .legalFor({{S32, S32}, {S64, S32}}); 1377 if (ST.has16BitInsts()) { 1378 if (ST.hasVOP3PInsts()) { 1379 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1380 .clampMaxNumElements(0, S16, 2); 1381 } else 1382 Shifts.legalFor({{S16, S16}}); 1383 1384 // TODO: Support 16-bit shift amounts for all types 1385 Shifts.widenScalarIf( 1386 [=](const LegalityQuery &Query) { 1387 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1388 // 32-bit amount. 
1389 const LLT ValTy = Query.Types[0]; 1390 const LLT AmountTy = Query.Types[1]; 1391 return ValTy.getSizeInBits() <= 16 && 1392 AmountTy.getSizeInBits() < 16; 1393 }, changeTo(1, S16)); 1394 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1395 Shifts.clampScalar(1, S32, S32); 1396 Shifts.widenScalarToNextPow2(0, 16); 1397 Shifts.clampScalar(0, S16, S64); 1398 1399 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1400 .minScalar(0, S16) 1401 .scalarize(0) 1402 .lower(); 1403 } else { 1404 // Make sure we legalize the shift amount type first, as the general 1405 // expansion for the shifted type will produce much worse code if it hasn't 1406 // been truncated already. 1407 Shifts.clampScalar(1, S32, S32); 1408 Shifts.widenScalarToNextPow2(0, 32); 1409 Shifts.clampScalar(0, S32, S64); 1410 1411 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1412 .minScalar(0, S32) 1413 .scalarize(0) 1414 .lower(); 1415 } 1416 Shifts.scalarize(0); 1417 1418 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1419 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1420 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1421 unsigned IdxTypeIdx = 2; 1422 1423 getActionDefinitionsBuilder(Op) 1424 .customIf([=](const LegalityQuery &Query) { 1425 const LLT EltTy = Query.Types[EltTypeIdx]; 1426 const LLT VecTy = Query.Types[VecTypeIdx]; 1427 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1428 const unsigned EltSize = EltTy.getSizeInBits(); 1429 return (EltSize == 32 || EltSize == 64) && 1430 VecTy.getSizeInBits() % 32 == 0 && 1431 VecTy.getSizeInBits() <= MaxRegisterSize && 1432 IdxTy.getSizeInBits() == 32; 1433 }) 1434 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1435 bitcastToVectorElement32(VecTypeIdx)) 1436 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1437 .bitcastIf( 1438 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1439 [=](const LegalityQuery &Query) { 1440 // For > 64-bit element types, try to turn this into a 64-bit 1441 // element vector since we may be able to do better indexing 1442 // if this is scalar. If not, fall back to 32. 1443 const LLT EltTy = Query.Types[EltTypeIdx]; 1444 const LLT VecTy = Query.Types[VecTypeIdx]; 1445 const unsigned DstEltSize = EltTy.getSizeInBits(); 1446 const unsigned VecSize = VecTy.getSizeInBits(); 1447 1448 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1449 return std::make_pair( 1450 VecTypeIdx, 1451 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1452 }) 1453 .clampScalar(EltTypeIdx, S32, S64) 1454 .clampScalar(VecTypeIdx, S32, S64) 1455 .clampScalar(IdxTypeIdx, S32, S32) 1456 .clampMaxNumElements(VecTypeIdx, S32, 32) 1457 // TODO: Clamp elements for 64-bit vectors? 1458 // It should only be necessary with variable indexes. 1459 // As a last resort, lower to the stack 1460 .lower(); 1461 } 1462 1463 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1464 .unsupportedIf([=](const LegalityQuery &Query) { 1465 const LLT &EltTy = Query.Types[1].getElementType(); 1466 return Query.Types[0] != EltTy; 1467 }); 1468 1469 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1470 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1471 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1472 1473 // FIXME: Doesn't handle extract of illegal sizes. 1474 getActionDefinitionsBuilder(Op) 1475 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1476 .lowerIf([=](const LegalityQuery &Query) { 1477 // Sub-vector(or single element) insert and extract. 
1478 // TODO: verify immediate offset here since lower only works with 1479 // whole elements. 1480 const LLT BigTy = Query.Types[BigTyIdx]; 1481 return BigTy.isVector(); 1482 }) 1483 // FIXME: Multiples of 16 should not be legal. 1484 .legalIf([=](const LegalityQuery &Query) { 1485 const LLT BigTy = Query.Types[BigTyIdx]; 1486 const LLT LitTy = Query.Types[LitTyIdx]; 1487 return (BigTy.getSizeInBits() % 32 == 0) && 1488 (LitTy.getSizeInBits() % 16 == 0); 1489 }) 1490 .widenScalarIf( 1491 [=](const LegalityQuery &Query) { 1492 const LLT BigTy = Query.Types[BigTyIdx]; 1493 return (BigTy.getScalarSizeInBits() < 16); 1494 }, 1495 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1496 .widenScalarIf( 1497 [=](const LegalityQuery &Query) { 1498 const LLT LitTy = Query.Types[LitTyIdx]; 1499 return (LitTy.getScalarSizeInBits() < 16); 1500 }, 1501 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1502 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1503 .widenScalarToNextPow2(BigTyIdx, 32); 1504 1505 } 1506 1507 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1508 .legalForCartesianProduct(AllS32Vectors, {S32}) 1509 .legalForCartesianProduct(AllS64Vectors, {S64}) 1510 .clampNumElements(0, V16S32, V32S32) 1511 .clampNumElements(0, V2S64, V16S64) 1512 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1513 1514 if (ST.hasScalarPackInsts()) { 1515 BuildVector 1516 // FIXME: Should probably widen s1 vectors straight to s32 1517 .minScalarOrElt(0, S16) 1518 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1519 .minScalar(1, S32); 1520 1521 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1522 .legalFor({V2S16, S32}) 1523 .lower(); 1524 BuildVector.minScalarOrElt(0, S32); 1525 } else { 1526 BuildVector.customFor({V2S16, S16}); 1527 BuildVector.minScalarOrElt(0, S32); 1528 1529 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1530 .customFor({V2S16, S32}) 1531 .lower(); 1532 } 1533 1534 BuildVector.legalIf(isRegisterType(0)); 1535 1536 // FIXME: Clamp maximum size 1537 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1538 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1539 .clampMaxNumElements(0, S32, 32) 1540 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1541 .clampMaxNumElements(0, S16, 64); 1542 1543 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 1544 // pre-legalize. 1545 if (ST.hasVOP3PInsts()) { 1546 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1547 .customFor({V2S16, V2S16}) 1548 .lower(); 1549 } else 1550 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1551 1552 // Merge/Unmerge 1553 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1554 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1555 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1556 1557 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1558 const LLT Ty = Query.Types[TypeIdx]; 1559 if (Ty.isVector()) { 1560 const LLT &EltTy = Ty.getElementType(); 1561 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1562 return true; 1563 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1564 return true; 1565 } 1566 return false; 1567 }; 1568 1569 auto &Builder = getActionDefinitionsBuilder(Op) 1570 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1571 .lowerFor({{S16, V2S16}}) 1572 .lowerIf([=](const LegalityQuery &Query) { 1573 const LLT BigTy = Query.Types[BigTyIdx]; 1574 return BigTy.getSizeInBits() == 32; 1575 }) 1576 // Try to widen to s16 first for small types. 
1577 // TODO: Only do this on targets with legal s16 shifts 1578 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1579 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1580 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1581 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1582 elementTypeIs(1, S16)), 1583 changeTo(1, V2S16)) 1584 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1585 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1586 // valid. 1587 .clampScalar(LitTyIdx, S32, S512) 1588 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1589 // Break up vectors with weird elements into scalars 1590 .fewerElementsIf( 1591 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1592 scalarize(0)) 1593 .fewerElementsIf( 1594 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1595 scalarize(1)) 1596 .clampScalar(BigTyIdx, S32, MaxScalar); 1597 1598 if (Op == G_MERGE_VALUES) { 1599 Builder.widenScalarIf( 1600 // TODO: Use 16-bit shifts if legal for 8-bit values? 1601 [=](const LegalityQuery &Query) { 1602 const LLT Ty = Query.Types[LitTyIdx]; 1603 return Ty.getSizeInBits() < 32; 1604 }, 1605 changeTo(LitTyIdx, S32)); 1606 } 1607 1608 Builder.widenScalarIf( 1609 [=](const LegalityQuery &Query) { 1610 const LLT Ty = Query.Types[BigTyIdx]; 1611 return !isPowerOf2_32(Ty.getSizeInBits()) && 1612 Ty.getSizeInBits() % 16 != 0; 1613 }, 1614 [=](const LegalityQuery &Query) { 1615 // Pick the next power of 2, or a multiple of 64 over 128. 1616 // Whichever is smaller. 1617 const LLT &Ty = Query.Types[BigTyIdx]; 1618 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1619 if (NewSizeInBits >= 256) { 1620 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1621 if (RoundedTo < NewSizeInBits) 1622 NewSizeInBits = RoundedTo; 1623 } 1624 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1625 }) 1626 // Any vectors left are the wrong size. Scalarize them. 1627 .scalarize(0) 1628 .scalarize(1); 1629 } 1630 1631 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1632 // RegBankSelect. 1633 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1634 .legalFor({{S32}, {S64}}); 1635 1636 if (ST.hasVOP3PInsts()) { 1637 SextInReg.lowerFor({{V2S16}}) 1638 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1639 // get more vector shift opportunities, since we'll get those when 1640 // expanded. 1641 .clampMaxNumElementsStrict(0, S16, 2); 1642 } else if (ST.has16BitInsts()) { 1643 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1644 } else { 1645 // Prefer to promote to s32 before lowering if we don't have 16-bit 1646 // shifts. This avoid a lot of intermediate truncate and extend operations. 1647 SextInReg.lowerFor({{S32}, {S64}}); 1648 } 1649 1650 SextInReg 1651 .scalarize(0) 1652 .clampScalar(0, S32, S64) 1653 .lower(); 1654 1655 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1656 .scalarize(0) 1657 .lower(); 1658 1659 // TODO: Only Try to form v2s16 with legal packed instructions. 
1660 getActionDefinitionsBuilder(G_FSHR) 1661 .legalFor({{S32, S32}}) 1662 .lowerFor({{V2S16, V2S16}}) 1663 .clampMaxNumElementsStrict(0, S16, 2) 1664 .scalarize(0) 1665 .lower(); 1666 1667 if (ST.hasVOP3PInsts()) { 1668 getActionDefinitionsBuilder(G_FSHL) 1669 .lowerFor({{V2S16, V2S16}}) 1670 .clampMaxNumElementsStrict(0, S16, 2) 1671 .scalarize(0) 1672 .lower(); 1673 } else { 1674 getActionDefinitionsBuilder(G_FSHL) 1675 .scalarize(0) 1676 .lower(); 1677 } 1678 1679 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1680 .legalFor({S64}); 1681 1682 getActionDefinitionsBuilder(G_FENCE) 1683 .alwaysLegal(); 1684 1685 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1686 .scalarize(0) 1687 .minScalar(0, S32) 1688 .lower(); 1689 1690 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1691 .legalFor({{S32, S32}, {S64, S32}}) 1692 .clampScalar(1, S32, S32) 1693 .clampScalar(0, S32, S64) 1694 .widenScalarToNextPow2(0) 1695 .scalarize(0); 1696 1697 getActionDefinitionsBuilder({ 1698 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1699 G_FCOPYSIGN, 1700 1701 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1702 G_ATOMICRMW_NAND, 1703 G_ATOMICRMW_FSUB, 1704 G_READ_REGISTER, 1705 G_WRITE_REGISTER, 1706 1707 G_SADDO, G_SSUBO, 1708 1709 // TODO: Implement 1710 G_FMINIMUM, G_FMAXIMUM}).lower(); 1711 1712 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1713 .lower(); 1714 1715 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1716 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1717 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1718 .unsupported(); 1719 1720 getLegacyLegalizerInfo().computeTables(); 1721 verify(*ST.getInstrInfo()); 1722 } 1723 1724 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1725 MachineInstr &MI) const { 1726 MachineIRBuilder &B = Helper.MIRBuilder; 1727 MachineRegisterInfo &MRI = *B.getMRI(); 1728 1729 switch (MI.getOpcode()) { 1730 case TargetOpcode::G_ADDRSPACE_CAST: 1731 return legalizeAddrSpaceCast(MI, MRI, B); 1732 case TargetOpcode::G_FRINT: 1733 return legalizeFrint(MI, MRI, B); 1734 case TargetOpcode::G_FCEIL: 1735 return legalizeFceil(MI, MRI, B); 1736 case TargetOpcode::G_FREM: 1737 return legalizeFrem(MI, MRI, B); 1738 case TargetOpcode::G_INTRINSIC_TRUNC: 1739 return legalizeIntrinsicTrunc(MI, MRI, B); 1740 case TargetOpcode::G_SITOFP: 1741 return legalizeITOFP(MI, MRI, B, true); 1742 case TargetOpcode::G_UITOFP: 1743 return legalizeITOFP(MI, MRI, B, false); 1744 case TargetOpcode::G_FPTOSI: 1745 return legalizeFPTOI(MI, MRI, B, true); 1746 case TargetOpcode::G_FPTOUI: 1747 return legalizeFPTOI(MI, MRI, B, false); 1748 case TargetOpcode::G_FMINNUM: 1749 case TargetOpcode::G_FMAXNUM: 1750 case TargetOpcode::G_FMINNUM_IEEE: 1751 case TargetOpcode::G_FMAXNUM_IEEE: 1752 return legalizeMinNumMaxNum(Helper, MI); 1753 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1754 return legalizeExtractVectorElt(MI, MRI, B); 1755 case TargetOpcode::G_INSERT_VECTOR_ELT: 1756 return legalizeInsertVectorElt(MI, MRI, B); 1757 case TargetOpcode::G_SHUFFLE_VECTOR: 1758 return legalizeShuffleVector(MI, MRI, B); 1759 case TargetOpcode::G_FSIN: 1760 case TargetOpcode::G_FCOS: 1761 return legalizeSinCos(MI, MRI, B); 1762 case TargetOpcode::G_GLOBAL_VALUE: 1763 return legalizeGlobalValue(MI, MRI, B); 1764 case TargetOpcode::G_LOAD: 1765 case TargetOpcode::G_SEXTLOAD: 1766 case TargetOpcode::G_ZEXTLOAD: 1767 return legalizeLoad(Helper, MI); 1768 case TargetOpcode::G_FMAD: 1769 return legalizeFMad(MI, MRI, B); 1770 case TargetOpcode::G_FDIV: 1771 return legalizeFDIV(MI, 
MRI, B); 1772 case TargetOpcode::G_UDIV: 1773 case TargetOpcode::G_UREM: 1774 case TargetOpcode::G_UDIVREM: 1775 return legalizeUnsignedDIV_REM(MI, MRI, B); 1776 case TargetOpcode::G_SDIV: 1777 case TargetOpcode::G_SREM: 1778 case TargetOpcode::G_SDIVREM: 1779 return legalizeSignedDIV_REM(MI, MRI, B); 1780 case TargetOpcode::G_ATOMIC_CMPXCHG: 1781 return legalizeAtomicCmpXChg(MI, MRI, B); 1782 case TargetOpcode::G_FLOG: 1783 return legalizeFlog(MI, B, numbers::ln2f); 1784 case TargetOpcode::G_FLOG10: 1785 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1786 case TargetOpcode::G_FEXP: 1787 return legalizeFExp(MI, B); 1788 case TargetOpcode::G_FPOW: 1789 return legalizeFPow(MI, B); 1790 case TargetOpcode::G_FFLOOR: 1791 return legalizeFFloor(MI, MRI, B); 1792 case TargetOpcode::G_BUILD_VECTOR: 1793 return legalizeBuildVector(MI, MRI, B); 1794 case TargetOpcode::G_MUL: 1795 return legalizeMul(Helper, MI); 1796 case TargetOpcode::G_CTLZ: 1797 case TargetOpcode::G_CTTZ: 1798 return legalizeCTLZ_CTTZ(MI, MRI, B); 1799 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1800 return legalizeFPTruncRound(MI, B); 1801 default: 1802 return false; 1803 } 1804 1805 llvm_unreachable("expected switch to return"); 1806 } 1807 1808 Register AMDGPULegalizerInfo::getSegmentAperture( 1809 unsigned AS, 1810 MachineRegisterInfo &MRI, 1811 MachineIRBuilder &B) const { 1812 MachineFunction &MF = B.getMF(); 1813 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1814 const LLT S32 = LLT::scalar(32); 1815 1816 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1817 1818 if (ST.hasApertureRegs()) { 1819 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1820 // getreg. 1821 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1822 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1823 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1824 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1825 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1826 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1827 unsigned Encoding = 1828 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1829 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1830 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1831 1832 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1833 1834 B.buildInstr(AMDGPU::S_GETREG_B32) 1835 .addDef(GetReg) 1836 .addImm(Encoding); 1837 MRI.setType(GetReg, S32); 1838 1839 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1840 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1841 } 1842 1843 // TODO: can we be smarter about machine pointer info? 1844 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1845 Register LoadAddr = MRI.createGenericVirtualRegister( 1846 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1847 // For code object version 5, private_base and shared_base are passed through 1848 // implicit kernargs. 1849 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1850 AMDGPUTargetLowering::ImplicitParameter Param = 1851 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1852 : AMDGPUTargetLowering::PRIVATE_BASE; 1853 uint64_t Offset = 1854 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1855 1856 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1857 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1858 1859 if (!loadInputValue(KernargPtrReg, B, 1860 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1861 return Register(); 1862 1863 MachineMemOperand *MMO = MF.getMachineMemOperand( 1864 PtrInfo, 1865 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1866 MachineMemOperand::MOInvariant, 1867 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1868 1869 // Pointer address 1870 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1871 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1872 // Load address 1873 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1874 } 1875 1876 Register QueuePtr = MRI.createGenericVirtualRegister( 1877 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1878 1879 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1880 return Register(); 1881 1882 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1883 // private_segment_aperture_base_hi. 1884 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1885 1886 MachineMemOperand *MMO = MF.getMachineMemOperand( 1887 PtrInfo, 1888 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1889 MachineMemOperand::MOInvariant, 1890 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1891 1892 B.buildPtrAdd(LoadAddr, QueuePtr, 1893 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1894 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1895 } 1896 1897 /// Return true if the value is a known valid address, such that a null check is 1898 /// not necessary. 1899 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1900 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1901 MachineInstr *Def = MRI.getVRegDef(Val); 1902 switch (Def->getOpcode()) { 1903 case AMDGPU::G_FRAME_INDEX: 1904 case AMDGPU::G_GLOBAL_VALUE: 1905 case AMDGPU::G_BLOCK_ADDR: 1906 return true; 1907 case AMDGPU::G_CONSTANT: { 1908 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1909 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1910 } 1911 default: 1912 return false; 1913 } 1914 1915 return false; 1916 } 1917 1918 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1919 MachineInstr &MI, MachineRegisterInfo &MRI, 1920 MachineIRBuilder &B) const { 1921 MachineFunction &MF = B.getMF(); 1922 1923 const LLT S32 = LLT::scalar(32); 1924 Register Dst = MI.getOperand(0).getReg(); 1925 Register Src = MI.getOperand(1).getReg(); 1926 1927 LLT DstTy = MRI.getType(Dst); 1928 LLT SrcTy = MRI.getType(Src); 1929 unsigned DestAS = DstTy.getAddressSpace(); 1930 unsigned SrcAS = SrcTy.getAddressSpace(); 1931 1932 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1933 // vector element. 1934 assert(!DstTy.isVector()); 1935 1936 const AMDGPUTargetMachine &TM 1937 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1938 1939 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1940 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1941 return true; 1942 } 1943 1944 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1945 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1946 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1947 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1948 // Extract low 32-bits of the pointer. 
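      // A flat pointer that points into LDS or scratch carries the aperture
      // base in its high 32 bits and the segment offset in its low 32 bits, so
      // when the pointer is known non-null the cast is just a truncation.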
1949 B.buildExtract(Dst, Src, 0); 1950 MI.eraseFromParent(); 1951 return true; 1952 } 1953 1954 unsigned NullVal = TM.getNullPointerValue(DestAS); 1955 1956 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1957 auto FlatNull = B.buildConstant(SrcTy, 0); 1958 1959 // Extract low 32-bits of the pointer. 1960 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1961 1962 auto CmpRes = 1963 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1964 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1965 1966 MI.eraseFromParent(); 1967 return true; 1968 } 1969 1970 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1971 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1972 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1973 if (!ST.hasFlatAddressSpace()) 1974 return false; 1975 1976 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1977 if (!ApertureReg.isValid()) 1978 return false; 1979 1980 // Coerce the type of the low half of the result so we can use merge_values. 1981 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1982 1983 // TODO: Should we allow mismatched types but matching sizes in merges to 1984 // avoid the ptrtoint? 1985 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1986 1987 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1988 B.buildCopy(Dst, BuildPtr); 1989 MI.eraseFromParent(); 1990 return true; 1991 } 1992 1993 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1994 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1995 1996 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 1997 SegmentNull.getReg(0)); 1998 1999 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2000 2001 MI.eraseFromParent(); 2002 return true; 2003 } 2004 2005 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2006 SrcTy.getSizeInBits() == 64) { 2007 // Truncate. 2008 B.buildExtract(Dst, Src, 0); 2009 MI.eraseFromParent(); 2010 return true; 2011 } 2012 2013 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2014 DstTy.getSizeInBits() == 64) { 2015 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2016 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2017 2018 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 2019 // another. Merge operands are required to be the same type, but creating an 2020 // extra ptrtoint would be kind of pointless. 2021 auto HighAddr = B.buildConstant( 2022 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 2023 B.buildMerge(Dst, {Src, HighAddr}); 2024 MI.eraseFromParent(); 2025 return true; 2026 } 2027 2028 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2029 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2030 2031 LLVMContext &Ctx = MF.getFunction().getContext(); 2032 Ctx.diagnose(InvalidAddrSpaceCast); 2033 B.buildUndef(Dst); 2034 MI.eraseFromParent(); 2035 return true; 2036 } 2037 2038 bool AMDGPULegalizerInfo::legalizeFrint( 2039 MachineInstr &MI, MachineRegisterInfo &MRI, 2040 MachineIRBuilder &B) const { 2041 Register Src = MI.getOperand(1).getReg(); 2042 LLT Ty = MRI.getType(Src); 2043 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2044 2045 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2046 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2047 2048 auto C1 = B.buildFConstant(Ty, C1Val); 2049 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2050 2051 // TODO: Should this propagate fast-math-flags? 
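  // The trick: adding and then subtracting 2^52 (taking the sign of the
  // source) forces a round-to-nearest-integer in double precision. For
  // example, 3.7 + 2^52 rounds to 2^52 + 4, and subtracting 2^52 gives 4.0.
  // Anything with |x| >= 2^52 is already integral and is passed through by the
  // select below.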
2052 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2053 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2054 2055 auto C2 = B.buildFConstant(Ty, C2Val); 2056 auto Fabs = B.buildFAbs(Ty, Src); 2057 2058 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2059 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2060 MI.eraseFromParent(); 2061 return true; 2062 } 2063 2064 bool AMDGPULegalizerInfo::legalizeFceil( 2065 MachineInstr &MI, MachineRegisterInfo &MRI, 2066 MachineIRBuilder &B) const { 2067 2068 const LLT S1 = LLT::scalar(1); 2069 const LLT S64 = LLT::scalar(64); 2070 2071 Register Src = MI.getOperand(1).getReg(); 2072 assert(MRI.getType(Src) == S64); 2073 2074 // result = trunc(src) 2075 // if (src > 0.0 && src != result) 2076 // result += 1.0 2077 2078 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2079 2080 const auto Zero = B.buildFConstant(S64, 0.0); 2081 const auto One = B.buildFConstant(S64, 1.0); 2082 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2083 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2084 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2085 auto Add = B.buildSelect(S64, And, One, Zero); 2086 2087 // TODO: Should this propagate fast-math-flags? 2088 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2089 MI.eraseFromParent(); 2090 return true; 2091 } 2092 2093 bool AMDGPULegalizerInfo::legalizeFrem( 2094 MachineInstr &MI, MachineRegisterInfo &MRI, 2095 MachineIRBuilder &B) const { 2096 Register DstReg = MI.getOperand(0).getReg(); 2097 Register Src0Reg = MI.getOperand(1).getReg(); 2098 Register Src1Reg = MI.getOperand(2).getReg(); 2099 auto Flags = MI.getFlags(); 2100 LLT Ty = MRI.getType(DstReg); 2101 2102 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2103 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2104 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2105 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 static MachineInstrBuilder extractF64Exponent(Register Hi, 2111 MachineIRBuilder &B) { 2112 const unsigned FractBits = 52; 2113 const unsigned ExpBits = 11; 2114 LLT S32 = LLT::scalar(32); 2115 2116 auto Const0 = B.buildConstant(S32, FractBits - 32); 2117 auto Const1 = B.buildConstant(S32, ExpBits); 2118 2119 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2120 .addUse(Hi) 2121 .addUse(Const0.getReg(0)) 2122 .addUse(Const1.getReg(0)); 2123 2124 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2125 } 2126 2127 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2128 MachineInstr &MI, MachineRegisterInfo &MRI, 2129 MachineIRBuilder &B) const { 2130 const LLT S1 = LLT::scalar(1); 2131 const LLT S32 = LLT::scalar(32); 2132 const LLT S64 = LLT::scalar(64); 2133 2134 Register Src = MI.getOperand(1).getReg(); 2135 assert(MRI.getType(Src) == S64); 2136 2137 // TODO: Should this use extract since the low half is unused? 2138 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2139 Register Hi = Unmerge.getReg(1); 2140 2141 // Extract the upper half, since this is where we will find the sign and 2142 // exponent. 2143 auto Exp = extractF64Exponent(Hi, B); 2144 2145 const unsigned FractBits = 52; 2146 2147 // Extract the sign bit. 2148 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2149 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2150 2151 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2152 2153 const auto Zero32 = B.buildConstant(S32, 0); 2154 2155 // Extend back to 64-bits. 
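  // If the exponent is negative (|x| < 1.0), the truncated value is a signed
  // zero, which is exactly this sign bit widened to 64 bits.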
2156 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2157 2158 auto Shr = B.buildAShr(S64, FractMask, Exp); 2159 auto Not = B.buildNot(S64, Shr); 2160 auto Tmp0 = B.buildAnd(S64, Src, Not); 2161 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2162 2163 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2164 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2165 2166 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2167 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2168 MI.eraseFromParent(); 2169 return true; 2170 } 2171 2172 bool AMDGPULegalizerInfo::legalizeITOFP( 2173 MachineInstr &MI, MachineRegisterInfo &MRI, 2174 MachineIRBuilder &B, bool Signed) const { 2175 2176 Register Dst = MI.getOperand(0).getReg(); 2177 Register Src = MI.getOperand(1).getReg(); 2178 2179 const LLT S64 = LLT::scalar(64); 2180 const LLT S32 = LLT::scalar(32); 2181 2182 assert(MRI.getType(Src) == S64); 2183 2184 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2185 auto ThirtyTwo = B.buildConstant(S32, 32); 2186 2187 if (MRI.getType(Dst) == S64) { 2188 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2189 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2190 2191 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2192 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2193 .addUse(CvtHi.getReg(0)) 2194 .addUse(ThirtyTwo.getReg(0)); 2195 2196 // TODO: Should this propagate fast-math-flags? 2197 B.buildFAdd(Dst, LdExp, CvtLo); 2198 MI.eraseFromParent(); 2199 return true; 2200 } 2201 2202 assert(MRI.getType(Dst) == S32); 2203 2204 auto One = B.buildConstant(S32, 1); 2205 2206 MachineInstrBuilder ShAmt; 2207 if (Signed) { 2208 auto ThirtyOne = B.buildConstant(S32, 31); 2209 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2210 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2211 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2212 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2213 /*HasSideEffects=*/false) 2214 .addUse(Unmerge.getReg(1)); 2215 auto LS2 = B.buildSub(S32, LS, One); 2216 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2217 } else 2218 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2219 auto Norm = B.buildShl(S64, Src, ShAmt); 2220 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2221 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2222 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2223 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2224 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2225 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2226 /*HasSideEffects=*/false) 2227 .addUse(FVal.getReg(0)) 2228 .addUse(Scale.getReg(0)); 2229 MI.eraseFromParent(); 2230 return true; 2231 } 2232 2233 // TODO: Copied from DAG implementation. Verify logic and document how this 2234 // actually works. 
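// A worked example of the split described below: for x = 2^40 + 5.5,
// trunc(x) = 2^40 + 5, hif = floor(trunc(x) * 2^-32) = 256 and
// lof = trunc(x) - 256 * 2^32 = 5, so the result is hi:lo = 0x100:0x00000005.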
2235 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2236 MachineRegisterInfo &MRI, 2237 MachineIRBuilder &B, 2238 bool Signed) const { 2239 2240 Register Dst = MI.getOperand(0).getReg(); 2241 Register Src = MI.getOperand(1).getReg(); 2242 2243 const LLT S64 = LLT::scalar(64); 2244 const LLT S32 = LLT::scalar(32); 2245 2246 const LLT SrcLT = MRI.getType(Src); 2247 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2248 2249 unsigned Flags = MI.getFlags(); 2250 2251 // The basic idea of converting a floating point number into a pair of 32-bit 2252 // integers is illustrated as follows: 2253 // 2254 // tf := trunc(val); 2255 // hif := floor(tf * 2^-32); 2256 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2257 // hi := fptoi(hif); 2258 // lo := fptoi(lof); 2259 // 2260 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2261 MachineInstrBuilder Sign; 2262 if (Signed && SrcLT == S32) { 2263 // However, a 32-bit floating point number has only 23 bits mantissa and 2264 // it's not enough to hold all the significant bits of `lof` if val is 2265 // negative. To avoid the loss of precision, We need to take the absolute 2266 // value after truncating and flip the result back based on the original 2267 // signedness. 2268 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2269 Trunc = B.buildFAbs(S32, Trunc, Flags); 2270 } 2271 MachineInstrBuilder K0, K1; 2272 if (SrcLT == S64) { 2273 K0 = B.buildFConstant(S64, 2274 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2275 K1 = B.buildFConstant(S64, 2276 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2277 } else { 2278 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2279 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2280 } 2281 2282 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2283 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2284 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2285 2286 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2287 : B.buildFPTOUI(S32, FloorMul); 2288 auto Lo = B.buildFPTOUI(S32, Fma); 2289 2290 if (Signed && SrcLT == S32) { 2291 // Flip the result based on the signedness, which is either all 0s or 1s. 2292 Sign = B.buildMerge(S64, {Sign, Sign}); 2293 // r := xor({lo, hi}, sign) - sign; 2294 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2295 } else 2296 B.buildMerge(Dst, {Lo, Hi}); 2297 MI.eraseFromParent(); 2298 2299 return true; 2300 } 2301 2302 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2303 MachineInstr &MI) const { 2304 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2305 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2306 2307 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2308 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2309 2310 // With ieee_mode disabled, the instructions have the correct behavior 2311 // already for G_FMINNUM/G_FMAXNUM 2312 if (!MFI->getMode().IEEE) 2313 return !IsIEEEOp; 2314 2315 if (IsIEEEOp) 2316 return true; 2317 2318 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2319 } 2320 2321 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2322 MachineInstr &MI, MachineRegisterInfo &MRI, 2323 MachineIRBuilder &B) const { 2324 // TODO: Should move some of this into LegalizerHelper. 
2325 2326 // TODO: Promote dynamic indexing of s16 to s32 2327 2328 // FIXME: Artifact combiner probably should have replaced the truncated 2329 // constant before this, so we shouldn't need 2330 // getIConstantVRegValWithLookThrough. 2331 Optional<ValueAndVReg> MaybeIdxVal = 2332 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2333 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2334 return true; 2335 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2336 2337 Register Dst = MI.getOperand(0).getReg(); 2338 Register Vec = MI.getOperand(1).getReg(); 2339 2340 LLT VecTy = MRI.getType(Vec); 2341 LLT EltTy = VecTy.getElementType(); 2342 assert(EltTy == MRI.getType(Dst)); 2343 2344 if (IdxVal < VecTy.getNumElements()) { 2345 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2346 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2347 } else { 2348 B.buildUndef(Dst); 2349 } 2350 2351 MI.eraseFromParent(); 2352 return true; 2353 } 2354 2355 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2356 MachineInstr &MI, MachineRegisterInfo &MRI, 2357 MachineIRBuilder &B) const { 2358 // TODO: Should move some of this into LegalizerHelper. 2359 2360 // TODO: Promote dynamic indexing of s16 to s32 2361 2362 // FIXME: Artifact combiner probably should have replaced the truncated 2363 // constant before this, so we shouldn't need 2364 // getIConstantVRegValWithLookThrough. 2365 Optional<ValueAndVReg> MaybeIdxVal = 2366 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2367 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2368 return true; 2369 2370 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2371 Register Dst = MI.getOperand(0).getReg(); 2372 Register Vec = MI.getOperand(1).getReg(); 2373 Register Ins = MI.getOperand(2).getReg(); 2374 2375 LLT VecTy = MRI.getType(Vec); 2376 LLT EltTy = VecTy.getElementType(); 2377 assert(EltTy == MRI.getType(Ins)); 2378 (void)Ins; 2379 2380 unsigned NumElts = VecTy.getNumElements(); 2381 if (IdxVal < NumElts) { 2382 SmallVector<Register, 8> SrcRegs; 2383 for (unsigned i = 0; i < NumElts; ++i) 2384 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2385 B.buildUnmerge(SrcRegs, Vec); 2386 2387 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2388 B.buildMerge(Dst, SrcRegs); 2389 } else { 2390 B.buildUndef(Dst); 2391 } 2392 2393 MI.eraseFromParent(); 2394 return true; 2395 } 2396 2397 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2398 MachineInstr &MI, MachineRegisterInfo &MRI, 2399 MachineIRBuilder &B) const { 2400 const LLT V2S16 = LLT::fixed_vector(2, 16); 2401 2402 Register Dst = MI.getOperand(0).getReg(); 2403 Register Src0 = MI.getOperand(1).getReg(); 2404 LLT DstTy = MRI.getType(Dst); 2405 LLT SrcTy = MRI.getType(Src0); 2406 2407 if (SrcTy == V2S16 && DstTy == V2S16 && 2408 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2409 return true; 2410 2411 MachineIRBuilder HelperBuilder(MI); 2412 GISelObserverWrapper DummyObserver; 2413 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2414 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2415 } 2416 2417 bool AMDGPULegalizerInfo::legalizeSinCos( 2418 MachineInstr &MI, MachineRegisterInfo &MRI, 2419 MachineIRBuilder &B) const { 2420 2421 Register DstReg = MI.getOperand(0).getReg(); 2422 Register SrcReg = MI.getOperand(1).getReg(); 2423 LLT Ty = MRI.getType(DstReg); 2424 unsigned Flags = MI.getFlags(); 2425 2426 Register TrigVal; 2427 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2428 if (ST.hasTrigReducedRange()) { 2429 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2430 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2431 .addUse(MulVal.getReg(0)) 2432 .setMIFlags(Flags).getReg(0); 2433 } else 2434 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2435 2436 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2437 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2438 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2439 .addUse(TrigVal) 2440 .setMIFlags(Flags); 2441 MI.eraseFromParent(); 2442 return true; 2443 } 2444 2445 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2446 MachineIRBuilder &B, 2447 const GlobalValue *GV, 2448 int64_t Offset, 2449 unsigned GAFlags) const { 2450 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2451 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2452 // to the following code sequence: 2453 // 2454 // For constant address space: 2455 // s_getpc_b64 s[0:1] 2456 // s_add_u32 s0, s0, $symbol 2457 // s_addc_u32 s1, s1, 0 2458 // 2459 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2460 // a fixup or relocation is emitted to replace $symbol with a literal 2461 // constant, which is a pc-relative offset from the encoding of the $symbol 2462 // operand to the global variable. 2463 // 2464 // For global address space: 2465 // s_getpc_b64 s[0:1] 2466 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2467 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2468 // 2469 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2470 // fixups or relocations are emitted to replace $symbol@*@lo and 2471 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2472 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2473 // operand to the global variable. 2474 // 2475 // What we want here is an offset from the value returned by s_getpc 2476 // (which is the address of the s_add_u32 instruction) to the global 2477 // variable, but since the encoding of $symbol starts 4 bytes after the start 2478 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2479 // small. This requires us to add 4 to the global variable offset in order to 2480 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2481 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2482 // instruction. 2483 2484 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2485 2486 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2487 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2488 2489 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2490 .addDef(PCReg); 2491 2492 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2493 if (GAFlags == SIInstrInfo::MO_NONE) 2494 MIB.addImm(0); 2495 else 2496 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2497 2498 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2499 2500 if (PtrTy.getSizeInBits() == 32) 2501 B.buildExtract(DstReg, PCReg, 0); 2502 return true; 2503 } 2504 2505 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2506 MachineInstr &MI, MachineRegisterInfo &MRI, 2507 MachineIRBuilder &B) const { 2508 Register DstReg = MI.getOperand(0).getReg(); 2509 LLT Ty = MRI.getType(DstReg); 2510 unsigned AS = Ty.getAddressSpace(); 2511 2512 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2513 MachineFunction &MF = B.getMF(); 2514 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2515 2516 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2517 if (!MFI->isModuleEntryFunction() && 2518 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2519 const Function &Fn = MF.getFunction(); 2520 DiagnosticInfoUnsupported BadLDSDecl( 2521 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2522 DS_Warning); 2523 Fn.getContext().diagnose(BadLDSDecl); 2524 2525 // We currently don't have a way to correctly allocate LDS objects that 2526 // aren't directly associated with a kernel. We do force inlining of 2527 // functions that use local objects. However, if these dead functions are 2528 // not eliminated, we don't want a compile-time error. Just emit a warning 2529 // and a trap, since there should be no callable path here. 2530 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2531 B.buildUndef(DstReg); 2532 MI.eraseFromParent(); 2533 return true; 2534 } 2535 2536 // TODO: We could emit code to handle the initialization somewhere. 2537 // We ignore the initializer for now and legalize it to allow selection. 2538 // The initializer will be rejected during assembly emission anyway. 2539 const SITargetLowering *TLI = ST.getTargetLowering(); 2540 if (!TLI->shouldUseLDSConstAddress(GV)) { 2541 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2542 return true; // Leave in place. 2543 } 2544 2545 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2546 Type *Ty = GV->getValueType(); 2547 // HIP uses an unsized array `extern __shared__ T s[]` or a similar 2548 // zero-sized type in other languages to declare dynamic shared memory 2549 // whose size is not known at compile time. It is allocated by the runtime 2550 // and placed directly after the statically allocated objects, and all such 2551 // declarations share the same offset. 2552 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2553 // Adjust alignment for that dynamic shared memory array.
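        // The dynamic array lives directly after all of the statically
        // allocated LDS, so its address is just the total static LDS size,
        // which amdgcn.groupstaticsize materializes below.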
2554 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2555 LLT S32 = LLT::scalar(32); 2556 auto Sz = 2557 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2558 B.buildIntToPtr(DstReg, Sz); 2559 MI.eraseFromParent(); 2560 return true; 2561 } 2562 } 2563 2564 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2565 *cast<GlobalVariable>(GV))); 2566 MI.eraseFromParent(); 2567 return true; 2568 } 2569 2570 const SITargetLowering *TLI = ST.getTargetLowering(); 2571 2572 if (TLI->shouldEmitFixup(GV)) { 2573 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 if (TLI->shouldEmitPCReloc(GV)) { 2579 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2580 MI.eraseFromParent(); 2581 return true; 2582 } 2583 2584 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2585 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2586 2587 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2588 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2589 MachinePointerInfo::getGOT(MF), 2590 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2591 MachineMemOperand::MOInvariant, 2592 LoadTy, Align(8)); 2593 2594 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2595 2596 if (Ty.getSizeInBits() == 32) { 2597 // Truncate if this is a 32-bit constant address. 2598 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2599 B.buildExtract(DstReg, Load, 0); 2600 } else 2601 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2602 2603 MI.eraseFromParent(); 2604 return true; 2605 } 2606 2607 static LLT widenToNextPowerOf2(LLT Ty) { 2608 if (Ty.isVector()) 2609 return Ty.changeElementCount( 2610 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2611 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2612 } 2613 2614 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2615 MachineInstr &MI) const { 2616 MachineIRBuilder &B = Helper.MIRBuilder; 2617 MachineRegisterInfo &MRI = *B.getMRI(); 2618 GISelChangeObserver &Observer = Helper.Observer; 2619 2620 Register PtrReg = MI.getOperand(1).getReg(); 2621 LLT PtrTy = MRI.getType(PtrReg); 2622 unsigned AddrSpace = PtrTy.getAddressSpace(); 2623 2624 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2625 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2626 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2627 Observer.changingInstr(MI); 2628 MI.getOperand(1).setReg(Cast.getReg(0)); 2629 Observer.changedInstr(MI); 2630 return true; 2631 } 2632 2633 if (MI.getOpcode() != AMDGPU::G_LOAD) 2634 return false; 2635 2636 Register ValReg = MI.getOperand(0).getReg(); 2637 LLT ValTy = MRI.getType(ValReg); 2638 2639 MachineMemOperand *MMO = *MI.memoperands_begin(); 2640 const unsigned ValSize = ValTy.getSizeInBits(); 2641 const LLT MemTy = MMO->getMemoryType(); 2642 const Align MemAlign = MMO->getAlign(); 2643 const unsigned MemSize = MemTy.getSizeInBits(); 2644 const uint64_t AlignInBits = 8 * MemAlign.value(); 2645 2646 // Widen non-power-of-2 loads to the alignment if needed 2647 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2648 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2649 2650 // This was already the correct extending load result type, so just adjust 2651 // the memory type. 
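    // For example, an s32 result backed by a 24-bit memory type keeps its
    // result type and simply gets the MMO widened to a full 4-byte load.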
2652 if (WideMemSize == ValSize) { 2653 MachineFunction &MF = B.getMF(); 2654 2655 MachineMemOperand *WideMMO = 2656 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2657 Observer.changingInstr(MI); 2658 MI.setMemRefs(MF, {WideMMO}); 2659 Observer.changedInstr(MI); 2660 return true; 2661 } 2662 2663 // Don't bother handling edge case that should probably never be produced. 2664 if (ValSize > WideMemSize) 2665 return false; 2666 2667 LLT WideTy = widenToNextPowerOf2(ValTy); 2668 2669 Register WideLoad; 2670 if (!WideTy.isVector()) { 2671 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2672 B.buildTrunc(ValReg, WideLoad).getReg(0); 2673 } else { 2674 // Extract the subvector. 2675 2676 if (isRegisterType(ValTy)) { 2677 // If this a case where G_EXTRACT is legal, use it. 2678 // (e.g. <3 x s32> -> <4 x s32>) 2679 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2680 B.buildExtract(ValReg, WideLoad, 0); 2681 } else { 2682 // For cases where the widened type isn't a nice register value, unmerge 2683 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2684 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2685 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2686 } 2687 } 2688 2689 MI.eraseFromParent(); 2690 return true; 2691 } 2692 2693 return false; 2694 } 2695 2696 bool AMDGPULegalizerInfo::legalizeFMad( 2697 MachineInstr &MI, MachineRegisterInfo &MRI, 2698 MachineIRBuilder &B) const { 2699 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2700 assert(Ty.isScalar()); 2701 2702 MachineFunction &MF = B.getMF(); 2703 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2704 2705 // TODO: Always legal with future ftz flag. 2706 // FIXME: Do we need just output? 2707 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2708 return true; 2709 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2710 return true; 2711 2712 MachineIRBuilder HelperBuilder(MI); 2713 GISelObserverWrapper DummyObserver; 2714 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2715 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2716 } 2717 2718 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2719 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2720 Register DstReg = MI.getOperand(0).getReg(); 2721 Register PtrReg = MI.getOperand(1).getReg(); 2722 Register CmpVal = MI.getOperand(2).getReg(); 2723 Register NewVal = MI.getOperand(3).getReg(); 2724 2725 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2726 "this should not have been custom lowered"); 2727 2728 LLT ValTy = MRI.getType(CmpVal); 2729 LLT VecTy = LLT::fixed_vector(2, ValTy); 2730 2731 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2732 2733 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2734 .addDef(DstReg) 2735 .addUse(PtrReg) 2736 .addUse(PackedVal) 2737 .setMemRefs(MI.memoperands()); 2738 2739 MI.eraseFromParent(); 2740 return true; 2741 } 2742 2743 bool AMDGPULegalizerInfo::legalizeFlog( 2744 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2745 Register Dst = MI.getOperand(0).getReg(); 2746 Register Src = MI.getOperand(1).getReg(); 2747 LLT Ty = B.getMRI()->getType(Dst); 2748 unsigned Flags = MI.getFlags(); 2749 2750 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2751 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2752 2753 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2754 
MI.eraseFromParent(); 2755 return true; 2756 } 2757 2758 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2759 MachineIRBuilder &B) const { 2760 Register Dst = MI.getOperand(0).getReg(); 2761 Register Src = MI.getOperand(1).getReg(); 2762 unsigned Flags = MI.getFlags(); 2763 LLT Ty = B.getMRI()->getType(Dst); 2764 2765 auto K = B.buildFConstant(Ty, numbers::log2e); 2766 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2767 B.buildFExp2(Dst, Mul, Flags); 2768 MI.eraseFromParent(); 2769 return true; 2770 } 2771 2772 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2773 MachineIRBuilder &B) const { 2774 Register Dst = MI.getOperand(0).getReg(); 2775 Register Src0 = MI.getOperand(1).getReg(); 2776 Register Src1 = MI.getOperand(2).getReg(); 2777 unsigned Flags = MI.getFlags(); 2778 LLT Ty = B.getMRI()->getType(Dst); 2779 const LLT S16 = LLT::scalar(16); 2780 const LLT S32 = LLT::scalar(32); 2781 2782 if (Ty == S32) { 2783 auto Log = B.buildFLog2(S32, Src0, Flags); 2784 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2785 .addUse(Log.getReg(0)) 2786 .addUse(Src1) 2787 .setMIFlags(Flags); 2788 B.buildFExp2(Dst, Mul, Flags); 2789 } else if (Ty == S16) { 2790 // There's no f16 fmul_legacy, so we need to convert for it. 2791 auto Log = B.buildFLog2(S16, Src0, Flags); 2792 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2793 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2794 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2795 .addUse(Ext0.getReg(0)) 2796 .addUse(Ext1.getReg(0)) 2797 .setMIFlags(Flags); 2798 2799 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2800 } else 2801 return false; 2802 2803 MI.eraseFromParent(); 2804 return true; 2805 } 2806 2807 // Find a source register, ignoring any possible source modifiers. 2808 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2809 Register ModSrc = OrigSrc; 2810 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2811 ModSrc = SrcFNeg->getOperand(1).getReg(); 2812 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2813 ModSrc = SrcFAbs->getOperand(1).getReg(); 2814 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2815 ModSrc = SrcFAbs->getOperand(1).getReg(); 2816 return ModSrc; 2817 } 2818 2819 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2820 MachineRegisterInfo &MRI, 2821 MachineIRBuilder &B) const { 2822 2823 const LLT S1 = LLT::scalar(1); 2824 const LLT S64 = LLT::scalar(64); 2825 Register Dst = MI.getOperand(0).getReg(); 2826 Register OrigSrc = MI.getOperand(1).getReg(); 2827 unsigned Flags = MI.getFlags(); 2828 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2829 "this should not have been custom lowered"); 2830 2831 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2832 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2833 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2834 // V_FRACT bug is: 2835 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2836 // 2837 // Convert floor(x) to (x - fract(x)) 2838 2839 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2840 .addUse(OrigSrc) 2841 .setMIFlags(Flags); 2842 2843 // Give source modifier matching some assistance before obscuring a foldable 2844 // pattern. 2845 2846 // TODO: We can avoid the neg on the fract? The input sign to fract 2847 // shouldn't matter? 
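  // 0x3fefffffffffffff is 1.0 - 2^-53, the largest double strictly below 1.0,
  // i.e. the 0.99999999999999999 clamp from the formula above.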
2848 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2849 2850 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2851 2852 Register Min = MRI.createGenericVirtualRegister(S64); 2853 2854 // We don't need to concern ourselves with the snan handling difference, so 2855 // use the one which will directly select. 2856 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2857 if (MFI->getMode().IEEE) 2858 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2859 else 2860 B.buildFMinNum(Min, Fract, Const, Flags); 2861 2862 Register CorrectedFract = Min; 2863 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2864 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2865 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2866 } 2867 2868 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2869 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2870 2871 MI.eraseFromParent(); 2872 return true; 2873 } 2874 2875 // Turn an illegal packed v2s16 build vector into bit operations. 2876 // TODO: This should probably be a bitcast action in LegalizerHelper. 2877 bool AMDGPULegalizerInfo::legalizeBuildVector( 2878 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2879 Register Dst = MI.getOperand(0).getReg(); 2880 const LLT S32 = LLT::scalar(32); 2881 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2882 2883 Register Src0 = MI.getOperand(1).getReg(); 2884 Register Src1 = MI.getOperand(2).getReg(); 2885 assert(MRI.getType(Src0) == LLT::scalar(16)); 2886 2887 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2888 B.buildBitcast(Dst, Merge); 2889 2890 MI.eraseFromParent(); 2891 return true; 2892 } 2893 2894 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 2895 // 2896 // Source and accumulation registers must all be 32-bits. 2897 // 2898 // TODO: When the multiply is uniform, we should produce a code sequence 2899 // that is better suited to instruction selection on the SALU. Instead of 2900 // the outer loop going over parts of the result, the outer loop should go 2901 // over parts of one of the factors. This should result in instruction 2902 // selection that makes full use of S_ADDC_U32 instructions. 2903 void AMDGPULegalizerInfo::buildMultiply( 2904 LegalizerHelper &Helper, MutableArrayRef<Register> Accum, 2905 ArrayRef<Register> Src0, ArrayRef<Register> Src1, 2906 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { 2907 // Use (possibly empty) vectors of S1 registers to represent the set of 2908 // carries from one pair of positions to the next. 2909 using Carry = SmallVector<Register, 2>; 2910 2911 MachineIRBuilder &B = Helper.MIRBuilder; 2912 2913 const LLT S1 = LLT::scalar(1); 2914 const LLT S32 = LLT::scalar(32); 2915 const LLT S64 = LLT::scalar(64); 2916 2917 Register Zero32; 2918 Register Zero64; 2919 2920 auto getZero32 = [&]() -> Register { 2921 if (!Zero32) 2922 Zero32 = B.buildConstant(S32, 0).getReg(0); 2923 return Zero32; 2924 }; 2925 auto getZero64 = [&]() -> Register { 2926 if (!Zero64) 2927 Zero64 = B.buildConstant(S64, 0).getReg(0); 2928 return Zero64; 2929 }; 2930 2931 // Merge the given carries into the 32-bit LocalAccum, which is modified 2932 // in-place. 2933 // 2934 // Returns the carry-out, which is a single S1 register or null. 
2935 auto mergeCarry = 2936 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 2937 if (CarryIn.empty()) 2938 return Register(); 2939 2940 bool HaveCarryOut = true; 2941 Register CarryAccum; 2942 if (CarryIn.size() == 1) { 2943 if (!LocalAccum) { 2944 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2945 return Register(); 2946 } 2947 2948 CarryAccum = getZero32(); 2949 } else { 2950 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2951 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 2952 CarryAccum = 2953 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 2954 .getReg(0); 2955 } 2956 2957 if (!LocalAccum) { 2958 LocalAccum = getZero32(); 2959 HaveCarryOut = false; 2960 } 2961 } 2962 2963 auto Add = 2964 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 2965 LocalAccum = Add.getReg(0); 2966 return HaveCarryOut ? Add.getReg(1) : Register(); 2967 }; 2968 2969 // Build a multiply-add chain to compute 2970 // 2971 // LocalAccum + (partial products at DstIndex) 2972 // + (opportunistic subset of CarryIn) 2973 // 2974 // LocalAccum is an array of one or two 32-bit registers that are updated 2975 // in-place. The incoming registers may be null. 2976 // 2977 // In some edge cases, carry-ins can be consumed "for free". In that case, 2978 // the consumed carry bits are removed from CarryIn in-place. 2979 auto buildMadChain = 2980 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 2981 -> Carry { 2982 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 2983 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 2984 2985 Carry CarryOut; 2986 unsigned j0 = 0; 2987 2988 // Use plain 32-bit multiplication for the most significant part of the 2989 // result by default. 2990 if (LocalAccum.size() == 1 && 2991 (!UsePartialMad64_32 || !CarryIn.empty())) { 2992 do { 2993 unsigned j1 = DstIndex - j0; 2994 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 2995 if (!LocalAccum[0]) { 2996 LocalAccum[0] = Mul.getReg(0); 2997 } else { 2998 if (CarryIn.empty()) { 2999 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3000 } else { 3001 LocalAccum[0] = 3002 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3003 .getReg(0); 3004 CarryIn.pop_back(); 3005 } 3006 } 3007 ++j0; 3008 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3009 } 3010 3011 // Build full 64-bit multiplies. 
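      // Each G_AMDGPU_MAD_U64_U32 below multiplies two 32-bit parts and adds
      // the 64-bit accumulator in a single operation; with j1 = DstIndex - j0
      // the chain covers the remaining partial products of weight
      // 2^(32 * DstIndex).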
3012 if (j0 <= DstIndex) { 3013 bool HaveSmallAccum = false; 3014 Register Tmp; 3015 3016 if (LocalAccum[0]) { 3017 if (LocalAccum.size() == 1) { 3018 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3019 HaveSmallAccum = true; 3020 } else if (LocalAccum[1]) { 3021 Tmp = B.buildMerge(S64, LocalAccum).getReg(0); 3022 HaveSmallAccum = false; 3023 } else { 3024 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3025 HaveSmallAccum = true; 3026 } 3027 } else { 3028 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3029 Tmp = getZero64(); 3030 HaveSmallAccum = true; 3031 } 3032 3033 do { 3034 unsigned j1 = DstIndex - j0; 3035 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3036 {Src0[j0], Src1[j1], Tmp}); 3037 Tmp = Mad.getReg(0); 3038 if (!HaveSmallAccum) 3039 CarryOut.push_back(Mad.getReg(1)); 3040 HaveSmallAccum = false; 3041 ++j0; 3042 } while (j0 <= DstIndex); 3043 3044 auto Unmerge = B.buildUnmerge(S32, Tmp); 3045 LocalAccum[0] = Unmerge.getReg(0); 3046 if (LocalAccum.size() > 1) 3047 LocalAccum[1] = Unmerge.getReg(1); 3048 } 3049 3050 return CarryOut; 3051 }; 3052 3053 // Outer multiply loop, iterating over destination parts from least 3054 // significant to most significant parts. 3055 // 3056 // The columns of the following diagram correspond to the destination parts 3057 // affected by one iteration of the outer loop (ignoring boundary 3058 // conditions). 3059 // 3060 // Dest index relative to 2 * i: 1 0 -1 3061 // ------ 3062 // Carries from previous iteration: e o 3063 // Even-aligned partial product sum: E E . 3064 // Odd-aligned partial product sum: O O 3065 // 3066 // 'o' is OddCarry, 'e' is EvenCarry. 3067 // EE and OO are computed from partial products via buildMadChain and use 3068 // accumulation where possible and appropriate. 3069 // 3070 Register SeparateOddCarry; 3071 Carry EvenCarry; 3072 Carry OddCarry; 3073 3074 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3075 Carry OddCarryIn = std::move(OddCarry); 3076 Carry EvenCarryIn = std::move(EvenCarry); 3077 OddCarry.clear(); 3078 EvenCarry.clear(); 3079 3080 // Partial products at offset 2 * i. 3081 if (2 * i < Accum.size()) { 3082 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3083 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3084 } 3085 3086 // Partial products at offset 2 * i - 1. 3087 if (i > 0) { 3088 if (!SeparateOddAlignedProducts) { 3089 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3090 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3091 } else { 3092 bool IsHighest = 2 * i >= Accum.size(); 3093 Register SeparateOddOut[2]; 3094 auto LocalAccum = makeMutableArrayRef(SeparateOddOut) 3095 .take_front(IsHighest ? 
1 : 2); 3096 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3097 3098 MachineInstr *Lo; 3099 3100 if (i == 1) { 3101 if (!IsHighest) 3102 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3103 else 3104 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3105 } else { 3106 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3107 SeparateOddCarry); 3108 } 3109 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3110 3111 if (!IsHighest) { 3112 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 3113 Lo->getOperand(1).getReg()); 3114 Accum[2 * i] = Hi.getReg(0); 3115 SeparateOddCarry = Hi.getReg(1); 3116 } 3117 } 3118 } 3119 3120 // Add in the carries from the previous iteration 3121 if (i > 0) { 3122 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 3123 EvenCarryIn.push_back(CarryOut); 3124 3125 if (2 * i < Accum.size()) { 3126 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 3127 OddCarry.push_back(CarryOut); 3128 } 3129 } 3130 } 3131 } 3132 3133 // Custom narrowing of wide multiplies using wide multiply-add instructions. 3134 // 3135 // TODO: If the multiply is followed by an addition, we should attempt to 3136 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 3137 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 3138 MachineInstr &MI) const { 3139 assert(ST.hasMad64_32()); 3140 assert(MI.getOpcode() == TargetOpcode::G_MUL); 3141 3142 MachineIRBuilder &B = Helper.MIRBuilder; 3143 MachineRegisterInfo &MRI = *B.getMRI(); 3144 3145 Register DstReg = MI.getOperand(0).getReg(); 3146 Register Src0 = MI.getOperand(1).getReg(); 3147 Register Src1 = MI.getOperand(2).getReg(); 3148 3149 LLT Ty = MRI.getType(DstReg); 3150 assert(Ty.isScalar()); 3151 3152 unsigned Size = Ty.getSizeInBits(); 3153 unsigned NumParts = Size / 32; 3154 assert((Size % 32) == 0); 3155 assert(NumParts >= 2); 3156 3157 // Whether to use MAD_64_32 for partial products whose high half is 3158 // discarded. This avoids some ADD instructions but risks false dependency 3159 // stalls on some subtargets in some cases. 3160 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 3161 3162 // Whether to compute odd-aligned partial products separately. This is 3163 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 3164 // in an even-aligned VGPR. 3165 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 3166 3167 LLT S32 = LLT::scalar(32); 3168 SmallVector<Register, 2> Src0Parts, Src1Parts; 3169 for (unsigned i = 0; i < NumParts; ++i) { 3170 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3171 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3172 } 3173 B.buildUnmerge(Src0Parts, Src0); 3174 B.buildUnmerge(Src1Parts, Src1); 3175 3176 SmallVector<Register, 2> AccumRegs(NumParts); 3177 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 3178 SeparateOddAlignedProducts); 3179 3180 B.buildMerge(DstReg, AccumRegs); 3181 MI.eraseFromParent(); 3182 return true; 3183 3184 } 3185 3186 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3187 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3188 // case with a single min instruction instead of a compare+select. 
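// The ffbh/ffbl operations return -1 (0xffffffff) for a zero input, so
// umin(ffbh(x), BitWidth) yields BitWidth for zero while leaving every other
// result (all of which are < BitWidth) unchanged.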
3189 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3190 MachineRegisterInfo &MRI, 3191 MachineIRBuilder &B) const { 3192 Register Dst = MI.getOperand(0).getReg(); 3193 Register Src = MI.getOperand(1).getReg(); 3194 LLT DstTy = MRI.getType(Dst); 3195 LLT SrcTy = MRI.getType(Src); 3196 3197 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3198 ? AMDGPU::G_AMDGPU_FFBH_U32 3199 : AMDGPU::G_AMDGPU_FFBL_B32; 3200 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3201 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3202 3203 MI.eraseFromParent(); 3204 return true; 3205 } 3206 3207 // Check that this is a G_XOR x, -1 3208 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3209 if (MI.getOpcode() != TargetOpcode::G_XOR) 3210 return false; 3211 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3212 return ConstVal && *ConstVal == -1; 3213 } 3214 3215 // Return the use branch instruction, otherwise null if the usage is invalid. 3216 static MachineInstr * 3217 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 3218 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 3219 Register CondDef = MI.getOperand(0).getReg(); 3220 if (!MRI.hasOneNonDBGUse(CondDef)) 3221 return nullptr; 3222 3223 MachineBasicBlock *Parent = MI.getParent(); 3224 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 3225 3226 if (isNot(MRI, *UseMI)) { 3227 Register NegatedCond = UseMI->getOperand(0).getReg(); 3228 if (!MRI.hasOneNonDBGUse(NegatedCond)) 3229 return nullptr; 3230 3231 // We're deleting the def of this value, so we need to remove it. 3232 eraseInstr(*UseMI, MRI); 3233 3234 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 3235 Negated = true; 3236 } 3237 3238 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 3239 return nullptr; 3240 3241 // Make sure the cond br is followed by a G_BR, or is the last instruction. 3242 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 3243 if (Next == Parent->end()) { 3244 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 3245 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 3246 return nullptr; 3247 UncondBrTarget = &*NextMBB; 3248 } else { 3249 if (Next->getOpcode() != AMDGPU::G_BR) 3250 return nullptr; 3251 Br = &*Next; 3252 UncondBrTarget = Br->getOperand(0).getMBB(); 3253 } 3254 3255 return UseMI; 3256 } 3257 3258 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 3259 const ArgDescriptor *Arg, 3260 const TargetRegisterClass *ArgRC, 3261 LLT ArgTy) const { 3262 MCRegister SrcReg = Arg->getRegister(); 3263 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 3264 assert(DstReg.isVirtual() && "Virtual register expected"); 3265 3266 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 3267 *ArgRC, B.getDebugLoc(), ArgTy); 3268 if (Arg->isMasked()) { 3269 // TODO: Should we try to emit this once in the entry block? 3270 const LLT S32 = LLT::scalar(32); 3271 const unsigned Mask = Arg->getMask(); 3272 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 3273 3274 Register AndMaskSrc = LiveIn; 3275 3276 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 3277 // 0. 
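    // On targets that pack the workitem IDs into a single register, the
    // descriptor mask selects one 10-bit field; e.g. the Y id typically has
    // Mask = 0x3ff << 10, so Shift is 10 and the AND below uses 0x3ff.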
3278 if (Shift != 0) { 3279 auto ShiftAmt = B.buildConstant(S32, Shift); 3280 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 3281 } 3282 3283 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 3284 } else { 3285 B.buildCopy(DstReg, LiveIn); 3286 } 3287 3288 return true; 3289 } 3290 3291 bool AMDGPULegalizerInfo::loadInputValue( 3292 Register DstReg, MachineIRBuilder &B, 3293 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3294 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3295 const ArgDescriptor *Arg; 3296 const TargetRegisterClass *ArgRC; 3297 LLT ArgTy; 3298 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3299 3300 if (!Arg) { 3301 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 3302 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 3303 // case the pointer argument may be missing and we use null. 3304 B.buildConstant(DstReg, 0); 3305 return true; 3306 } 3307 3308 // It's undefined behavior if a function marked with the amdgpu-no-* 3309 // attributes uses the corresponding intrinsic. 3310 B.buildUndef(DstReg); 3311 return true; 3312 } 3313 3314 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 3315 return false; // TODO: Handle these 3316 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 3317 } 3318 3319 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 3320 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3321 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3322 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3323 return false; 3324 3325 MI.eraseFromParent(); 3326 return true; 3327 } 3328 3329 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 3330 int64_t C) { 3331 B.buildConstant(MI.getOperand(0).getReg(), C); 3332 MI.eraseFromParent(); 3333 return true; 3334 } 3335 3336 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 3337 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3338 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3339 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 3340 if (MaxID == 0) 3341 return replaceWithConstant(B, MI, 0); 3342 3343 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3344 const ArgDescriptor *Arg; 3345 const TargetRegisterClass *ArgRC; 3346 LLT ArgTy; 3347 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3348 3349 Register DstReg = MI.getOperand(0).getReg(); 3350 if (!Arg) { 3351 // It's undefined behavior if a function marked with the amdgpu-no-* 3352 // attributes uses the corresponding intrinsic. 3353 B.buildUndef(DstReg); 3354 MI.eraseFromParent(); 3355 return true; 3356 } 3357 3358 if (Arg->isMasked()) { 3359 // Don't bother inserting AssertZext for packed IDs since we're emitting the 3360 // masking operations anyway. 3361 // 3362 // TODO: We could assert the top bit is 0 for the source copy. 
3363 if (!loadInputValue(DstReg, B, ArgType)) 3364 return false; 3365 } else { 3366 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 3367 if (!loadInputValue(TmpReg, B, ArgType)) 3368 return false; 3369 B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); 3370 } 3371 3372 MI.eraseFromParent(); 3373 return true; 3374 } 3375 3376 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 3377 int64_t Offset) const { 3378 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3379 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 3380 3381 // TODO: If we passed in the base kernel offset we could have a better 3382 // alignment than 4, but we don't really need it. 3383 if (!loadInputValue(KernArgReg, B, 3384 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3385 llvm_unreachable("failed to find kernarg segment ptr"); 3386 3387 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 3388 // TODO: Should get nuw 3389 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 3390 } 3391 3392 /// Legalize a value that's loaded from kernel arguments. This is only used by 3393 /// legacy intrinsics. 3394 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 3395 MachineIRBuilder &B, 3396 uint64_t Offset, 3397 Align Alignment) const { 3398 Register DstReg = MI.getOperand(0).getReg(); 3399 3400 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 3401 "unexpected kernarg parameter type"); 3402 3403 Register Ptr = getKernargParameterPtr(B, Offset); 3404 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 3405 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 3406 MachineMemOperand::MODereferenceable | 3407 MachineMemOperand::MOInvariant); 3408 MI.eraseFromParent(); 3409 return true; 3410 } 3411 3412 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3413 MachineRegisterInfo &MRI, 3414 MachineIRBuilder &B) const { 3415 Register Dst = MI.getOperand(0).getReg(); 3416 LLT DstTy = MRI.getType(Dst); 3417 LLT S16 = LLT::scalar(16); 3418 LLT S32 = LLT::scalar(32); 3419 LLT S64 = LLT::scalar(64); 3420 3421 if (DstTy == S16) 3422 return legalizeFDIV16(MI, MRI, B); 3423 if (DstTy == S32) 3424 return legalizeFDIV32(MI, MRI, B); 3425 if (DstTy == S64) 3426 return legalizeFDIV64(MI, MRI, B); 3427 3428 return false; 3429 } 3430 3431 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3432 Register DstDivReg, 3433 Register DstRemReg, 3434 Register X, 3435 Register Y) const { 3436 const LLT S1 = LLT::scalar(1); 3437 const LLT S32 = LLT::scalar(32); 3438 3439 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3440 // algorithm used here. 3441 3442 // Initial estimate of inv(y). 3443 auto FloatY = B.buildUITOFP(S32, Y); 3444 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3445 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3446 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3447 auto Z = B.buildFPTOUI(S32, ScaledY); 3448 3449 // One round of UNR. 3450 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3451 auto NegYZ = B.buildMul(S32, NegY, Z); 3452 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3453 3454 // Quotient/remainder estimate. 3455 auto Q = B.buildUMulH(S32, X, Z); 3456 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3457 3458 // First quotient/remainder refinement. 
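// Each refinement step conditionally bumps the quotient by one and subtracts
// Y from the remainder while R is still >= Y; the estimate from the single
// Newton-Raphson round above is close enough that the two steps below suffice.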
3459 auto One = B.buildConstant(S32, 1); 3460 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3461 if (DstDivReg) 3462 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3463 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3464 3465 // Second quotient/remainder refinement. 3466 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3467 if (DstDivReg) 3468 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3469 3470 if (DstRemReg) 3471 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3472 } 3473 3474 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3475 // 3476 // Return lo, hi of result 3477 // 3478 // %cvt.lo = G_UITOFP Val.lo 3479 // %cvt.hi = G_UITOFP Val.hi 3480 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3481 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3482 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3483 // %mul2 = G_FMUL %mul1, 2**(-32) 3484 // %trunc = G_INTRINSIC_TRUNC %mul2 3485 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3486 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3487 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3488 Register Val) { 3489 const LLT S32 = LLT::scalar(32); 3490 auto Unmerge = B.buildUnmerge(S32, Val); 3491 3492 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3493 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3494 3495 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3496 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3497 3498 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3499 auto Mul1 = 3500 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3501 3502 // 2**(-32) 3503 auto Mul2 = 3504 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3505 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3506 3507 // -(2**32) 3508 auto Mad2 = B.buildFMAD(S32, Trunc, 3509 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3510 3511 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3512 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3513 3514 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3515 } 3516 3517 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3518 Register DstDivReg, 3519 Register DstRemReg, 3520 Register Numer, 3521 Register Denom) const { 3522 const LLT S32 = LLT::scalar(32); 3523 const LLT S64 = LLT::scalar(64); 3524 const LLT S1 = LLT::scalar(1); 3525 Register RcpLo, RcpHi; 3526 3527 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3528 3529 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3530 3531 auto Zero64 = B.buildConstant(S64, 0); 3532 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3533 3534 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3535 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3536 3537 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3538 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3539 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3540 3541 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3542 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3543 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3544 3545 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3546 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3547 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3548 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3549 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3550 3551 auto Zero32 = B.buildConstant(S32, 0); 3552 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3553 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3554 auto 
Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3555 3556 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3557 Register NumerLo = UnmergeNumer.getReg(0); 3558 Register NumerHi = UnmergeNumer.getReg(1); 3559 3560 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3561 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3562 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3563 Register Mul3_Lo = UnmergeMul3.getReg(0); 3564 Register Mul3_Hi = UnmergeMul3.getReg(1); 3565 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3566 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3567 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3568 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3569 3570 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3571 Register DenomLo = UnmergeDenom.getReg(0); 3572 Register DenomHi = UnmergeDenom.getReg(1); 3573 3574 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3575 auto C1 = B.buildSExt(S32, CmpHi); 3576 3577 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3578 auto C2 = B.buildSExt(S32, CmpLo); 3579 3580 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3581 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3582 3583 // TODO: Here and below portions of the code can be enclosed into if/endif. 3584 // Currently control flow is unconditional and we have 4 selects after 3585 // potential endif to substitute PHIs. 3586 3587 // if C3 != 0 ... 3588 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3589 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3590 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3591 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3592 3593 auto One64 = B.buildConstant(S64, 1); 3594 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3595 3596 auto C4 = 3597 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3598 auto C5 = 3599 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3600 auto C6 = B.buildSelect( 3601 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3602 3603 // if (C6 != 0) 3604 auto Add4 = B.buildAdd(S64, Add3, One64); 3605 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3606 3607 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3608 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3609 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3610 3611 // endif C6 3612 // endif C3 3613 3614 if (DstDivReg) { 3615 auto Sel1 = B.buildSelect( 3616 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3617 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3618 Sel1, MulHi3); 3619 } 3620 3621 if (DstRemReg) { 3622 auto Sel2 = B.buildSelect( 3623 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3624 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3625 Sel2, Sub1); 3626 } 3627 } 3628 3629 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3630 MachineRegisterInfo &MRI, 3631 MachineIRBuilder &B) const { 3632 Register DstDivReg, DstRemReg; 3633 switch (MI.getOpcode()) { 3634 default: 3635 llvm_unreachable("Unexpected opcode!"); 3636 case AMDGPU::G_UDIV: { 3637 DstDivReg = MI.getOperand(0).getReg(); 3638 break; 3639 } 3640 case AMDGPU::G_UREM: { 3641 DstRemReg = MI.getOperand(0).getReg(); 3642 break; 3643 } 3644 case AMDGPU::G_UDIVREM: { 3645 DstDivReg = MI.getOperand(0).getReg(); 3646 DstRemReg = MI.getOperand(1).getReg(); 3647 
break; 3648 } 3649 } 3650 3651 const LLT S64 = LLT::scalar(64); 3652 const LLT S32 = LLT::scalar(32); 3653 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3654 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3655 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3656 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3657 3658 if (Ty == S32) 3659 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3660 else if (Ty == S64) 3661 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3662 else 3663 return false; 3664 3665 MI.eraseFromParent(); 3666 return true; 3667 } 3668 3669 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3670 MachineRegisterInfo &MRI, 3671 MachineIRBuilder &B) const { 3672 const LLT S64 = LLT::scalar(64); 3673 const LLT S32 = LLT::scalar(32); 3674 3675 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3676 if (Ty != S32 && Ty != S64) 3677 return false; 3678 3679 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3680 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3681 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3682 3683 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3684 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3685 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3686 3687 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3688 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3689 3690 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3691 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3692 3693 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3694 switch (MI.getOpcode()) { 3695 default: 3696 llvm_unreachable("Unexpected opcode!"); 3697 case AMDGPU::G_SDIV: { 3698 DstDivReg = MI.getOperand(0).getReg(); 3699 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3700 break; 3701 } 3702 case AMDGPU::G_SREM: { 3703 DstRemReg = MI.getOperand(0).getReg(); 3704 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3705 break; 3706 } 3707 case AMDGPU::G_SDIVREM: { 3708 DstDivReg = MI.getOperand(0).getReg(); 3709 DstRemReg = MI.getOperand(1).getReg(); 3710 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3711 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3712 break; 3713 } 3714 } 3715 3716 if (Ty == S32) 3717 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3718 else 3719 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3720 3721 if (DstDivReg) { 3722 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3723 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3724 B.buildSub(DstDivReg, SignXor, Sign); 3725 } 3726 3727 if (DstRemReg) { 3728 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3729 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3730 B.buildSub(DstRemReg, SignXor, Sign); 3731 } 3732 3733 MI.eraseFromParent(); 3734 return true; 3735 } 3736 3737 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3738 MachineRegisterInfo &MRI, 3739 MachineIRBuilder &B) const { 3740 Register Res = MI.getOperand(0).getReg(); 3741 Register LHS = MI.getOperand(1).getReg(); 3742 Register RHS = MI.getOperand(2).getReg(); 3743 uint16_t Flags = MI.getFlags(); 3744 LLT ResTy = MRI.getType(Res); 3745 3746 const MachineFunction &MF = B.getMF(); 3747 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3748 MI.getFlag(MachineInstr::FmAfn); 3749 3750 if (!AllowInaccurateRcp) 3751 return false; 3752 3753 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3754 // 1 / x -> RCP(x) 3755 if 
(CLHS->isExactlyValue(1.0)) { 3756 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3757 .addUse(RHS) 3758 .setMIFlags(Flags); 3759 3760 MI.eraseFromParent(); 3761 return true; 3762 } 3763 3764 // -1 / x -> RCP( FNEG(x) ) 3765 if (CLHS->isExactlyValue(-1.0)) { 3766 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3767 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3768 .addUse(FNeg.getReg(0)) 3769 .setMIFlags(Flags); 3770 3771 MI.eraseFromParent(); 3772 return true; 3773 } 3774 } 3775 3776 // x / y -> x * (1.0 / y) 3777 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3778 .addUse(RHS) 3779 .setMIFlags(Flags); 3780 B.buildFMul(Res, LHS, RCP, Flags); 3781 3782 MI.eraseFromParent(); 3783 return true; 3784 } 3785 3786 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3787 MachineRegisterInfo &MRI, 3788 MachineIRBuilder &B) const { 3789 Register Res = MI.getOperand(0).getReg(); 3790 Register X = MI.getOperand(1).getReg(); 3791 Register Y = MI.getOperand(2).getReg(); 3792 uint16_t Flags = MI.getFlags(); 3793 LLT ResTy = MRI.getType(Res); 3794 3795 const MachineFunction &MF = B.getMF(); 3796 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3797 MI.getFlag(MachineInstr::FmAfn); 3798 3799 if (!AllowInaccurateRcp) 3800 return false; 3801 3802 auto NegY = B.buildFNeg(ResTy, Y); 3803 auto One = B.buildFConstant(ResTy, 1.0); 3804 3805 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3806 .addUse(Y) 3807 .setMIFlags(Flags); 3808 3809 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3810 R = B.buildFMA(ResTy, Tmp0, R, R); 3811 3812 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3813 R = B.buildFMA(ResTy, Tmp1, R, R); 3814 3815 auto Ret = B.buildFMul(ResTy, X, R); 3816 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3817 3818 B.buildFMA(Res, Tmp2, R, Ret); 3819 MI.eraseFromParent(); 3820 return true; 3821 } 3822 3823 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3824 MachineRegisterInfo &MRI, 3825 MachineIRBuilder &B) const { 3826 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3827 return true; 3828 3829 Register Res = MI.getOperand(0).getReg(); 3830 Register LHS = MI.getOperand(1).getReg(); 3831 Register RHS = MI.getOperand(2).getReg(); 3832 3833 uint16_t Flags = MI.getFlags(); 3834 3835 LLT S16 = LLT::scalar(16); 3836 LLT S32 = LLT::scalar(32); 3837 3838 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3839 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3840 3841 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3842 .addUse(RHSExt.getReg(0)) 3843 .setMIFlags(Flags); 3844 3845 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3846 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3847 3848 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3849 .addUse(RDst.getReg(0)) 3850 .addUse(RHS) 3851 .addUse(LHS) 3852 .setMIFlags(Flags); 3853 3854 MI.eraseFromParent(); 3855 return true; 3856 } 3857 3858 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3859 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3860 static void toggleSPDenormMode(bool Enable, 3861 MachineIRBuilder &B, 3862 const GCNSubtarget &ST, 3863 AMDGPU::SIModeRegisterDefaults Mode) { 3864 // Set SP denorm mode to this value. 3865 unsigned SPDenormMode = 3866 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3867 3868 if (ST.hasDenormModeInst()) { 3869 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
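// S_DENORM_MODE takes a 4-bit immediate: bits [1:0] hold the FP32 denorm mode
// and bits [3:2] the FP64/FP16 mode, hence the shift by 2 when merging below.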
3870 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3871 3872 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3873 B.buildInstr(AMDGPU::S_DENORM_MODE) 3874 .addImm(NewDenormModeValue); 3875 3876 } else { 3877 // Select FP32 bit field in mode register. 3878 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3879 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3880 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3881 3882 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3883 .addImm(SPDenormMode) 3884 .addImm(SPDenormModeBitField); 3885 } 3886 } 3887 3888 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3889 MachineRegisterInfo &MRI, 3890 MachineIRBuilder &B) const { 3891 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3892 return true; 3893 3894 Register Res = MI.getOperand(0).getReg(); 3895 Register LHS = MI.getOperand(1).getReg(); 3896 Register RHS = MI.getOperand(2).getReg(); 3897 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3898 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3899 3900 uint16_t Flags = MI.getFlags(); 3901 3902 LLT S32 = LLT::scalar(32); 3903 LLT S1 = LLT::scalar(1); 3904 3905 auto One = B.buildFConstant(S32, 1.0f); 3906 3907 auto DenominatorScaled = 3908 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3909 .addUse(LHS) 3910 .addUse(RHS) 3911 .addImm(0) 3912 .setMIFlags(Flags); 3913 auto NumeratorScaled = 3914 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3915 .addUse(LHS) 3916 .addUse(RHS) 3917 .addImm(1) 3918 .setMIFlags(Flags); 3919 3920 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3921 .addUse(DenominatorScaled.getReg(0)) 3922 .setMIFlags(Flags); 3923 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3924 3925 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3926 // aren't modeled as reading it. 
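// The div_scale/FMA refinement sequence below expects FP32 denormals to be
// available (the scaled intermediates may be denormal), so temporarily enable
// them when the function's default mode flushes them, and restore it after.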
3927 if (!Mode.allFP32Denormals()) 3928 toggleSPDenormMode(true, B, ST, Mode); 3929 3930 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3931 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3932 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3933 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3934 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3935 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3936 3937 if (!Mode.allFP32Denormals()) 3938 toggleSPDenormMode(false, B, ST, Mode); 3939 3940 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3941 .addUse(Fma4.getReg(0)) 3942 .addUse(Fma1.getReg(0)) 3943 .addUse(Fma3.getReg(0)) 3944 .addUse(NumeratorScaled.getReg(1)) 3945 .setMIFlags(Flags); 3946 3947 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3948 .addUse(Fmas.getReg(0)) 3949 .addUse(RHS) 3950 .addUse(LHS) 3951 .setMIFlags(Flags); 3952 3953 MI.eraseFromParent(); 3954 return true; 3955 } 3956 3957 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3958 MachineRegisterInfo &MRI, 3959 MachineIRBuilder &B) const { 3960 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3961 return true; 3962 3963 Register Res = MI.getOperand(0).getReg(); 3964 Register LHS = MI.getOperand(1).getReg(); 3965 Register RHS = MI.getOperand(2).getReg(); 3966 3967 uint16_t Flags = MI.getFlags(); 3968 3969 LLT S64 = LLT::scalar(64); 3970 LLT S1 = LLT::scalar(1); 3971 3972 auto One = B.buildFConstant(S64, 1.0); 3973 3974 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3975 .addUse(LHS) 3976 .addUse(RHS) 3977 .addImm(0) 3978 .setMIFlags(Flags); 3979 3980 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3981 3982 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3983 .addUse(DivScale0.getReg(0)) 3984 .setMIFlags(Flags); 3985 3986 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3987 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3988 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3989 3990 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3991 .addUse(LHS) 3992 .addUse(RHS) 3993 .addImm(1) 3994 .setMIFlags(Flags); 3995 3996 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3997 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3998 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3999 4000 Register Scale; 4001 if (!ST.hasUsableDivScaleConditionOutput()) { 4002 // Workaround a hardware bug on SI where the condition output from div_scale 4003 // is not usable. 
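// Instead, reconstruct the scale condition below by comparing the high dwords
// of the numerator and denominator against the corresponding div_scale results.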
4004 4005 LLT S32 = LLT::scalar(32); 4006 4007 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4008 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4009 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4010 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4011 4012 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4013 Scale1Unmerge.getReg(1)); 4014 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4015 Scale0Unmerge.getReg(1)); 4016 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4017 } else { 4018 Scale = DivScale1.getReg(1); 4019 } 4020 4021 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4022 .addUse(Fma4.getReg(0)) 4023 .addUse(Fma3.getReg(0)) 4024 .addUse(Mul.getReg(0)) 4025 .addUse(Scale) 4026 .setMIFlags(Flags); 4027 4028 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 4029 .addUse(Fmas.getReg(0)) 4030 .addUse(RHS) 4031 .addUse(LHS) 4032 .setMIFlags(Flags); 4033 4034 MI.eraseFromParent(); 4035 return true; 4036 } 4037 4038 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 4039 MachineRegisterInfo &MRI, 4040 MachineIRBuilder &B) const { 4041 Register Res = MI.getOperand(0).getReg(); 4042 Register LHS = MI.getOperand(2).getReg(); 4043 Register RHS = MI.getOperand(3).getReg(); 4044 uint16_t Flags = MI.getFlags(); 4045 4046 LLT S32 = LLT::scalar(32); 4047 LLT S1 = LLT::scalar(1); 4048 4049 auto Abs = B.buildFAbs(S32, RHS, Flags); 4050 const APFloat C0Val(1.0f); 4051 4052 auto C0 = B.buildConstant(S32, 0x6f800000); 4053 auto C1 = B.buildConstant(S32, 0x2f800000); 4054 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 4055 4056 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 4057 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 4058 4059 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 4060 4061 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4062 .addUse(Mul0.getReg(0)) 4063 .setMIFlags(Flags); 4064 4065 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 4066 4067 B.buildFMul(Res, Sel, Mul1, Flags); 4068 4069 MI.eraseFromParent(); 4070 return true; 4071 } 4072 4073 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4074 // FIXME: Why do we handle this one but not other removed instructions? 4075 // 4076 // Reciprocal square root. The clamp prevents infinite results, clamping 4077 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4078 // +-max_float. 4079 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4080 MachineRegisterInfo &MRI, 4081 MachineIRBuilder &B) const { 4082 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4083 return true; 4084 4085 Register Dst = MI.getOperand(0).getReg(); 4086 Register Src = MI.getOperand(2).getReg(); 4087 auto Flags = MI.getFlags(); 4088 4089 LLT Ty = MRI.getType(Dst); 4090 4091 const fltSemantics *FltSemantics; 4092 if (Ty == LLT::scalar(32)) 4093 FltSemantics = &APFloat::IEEEsingle(); 4094 else if (Ty == LLT::scalar(64)) 4095 FltSemantics = &APFloat::IEEEdouble(); 4096 else 4097 return false; 4098 4099 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4100 .addUse(Src) 4101 .setMIFlags(Flags); 4102 4103 // We don't need to concern ourselves with the snan handling difference, since 4104 // the rsq quieted (or not) so use the one which will directly select. 
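// The IEEE bit of the default FP mode decides whether the *_IEEE min/max
// variants are used for the +/-max_float clamp built below.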
4105 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4106 const bool UseIEEE = MFI->getMode().IEEE; 4107 4108 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4109 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4110 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4111 4112 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4113 4114 if (UseIEEE) 4115 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4116 else 4117 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4118 MI.eraseFromParent(); 4119 return true; 4120 } 4121 4122 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4123 switch (IID) { 4124 case Intrinsic::amdgcn_ds_fadd: 4125 return AMDGPU::G_ATOMICRMW_FADD; 4126 case Intrinsic::amdgcn_ds_fmin: 4127 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 4128 case Intrinsic::amdgcn_ds_fmax: 4129 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 4130 default: 4131 llvm_unreachable("not a DS FP intrinsic"); 4132 } 4133 } 4134 4135 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 4136 MachineInstr &MI, 4137 Intrinsic::ID IID) const { 4138 GISelChangeObserver &Observer = Helper.Observer; 4139 Observer.changingInstr(MI); 4140 4141 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 4142 4143 // The remaining operands were used to set fields in the MemOperand on 4144 // construction. 4145 for (int I = 6; I > 3; --I) 4146 MI.removeOperand(I); 4147 4148 MI.removeOperand(1); // Remove the intrinsic ID. 4149 Observer.changedInstr(MI); 4150 return true; 4151 } 4152 4153 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 4154 MachineRegisterInfo &MRI, 4155 MachineIRBuilder &B) const { 4156 uint64_t Offset = 4157 ST.getTargetLowering()->getImplicitParameterOffset( 4158 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 4159 LLT DstTy = MRI.getType(DstReg); 4160 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 4161 4162 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 4163 if (!loadInputValue(KernargPtrReg, B, 4164 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4165 return false; 4166 4167 // FIXME: This should be nuw 4168 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 4169 return true; 4170 } 4171 4172 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 4173 MachineRegisterInfo &MRI, 4174 MachineIRBuilder &B) const { 4175 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4176 if (!MFI->isEntryFunction()) { 4177 return legalizePreloadedArgIntrin(MI, MRI, B, 4178 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 4179 } 4180 4181 Register DstReg = MI.getOperand(0).getReg(); 4182 if (!getImplicitArgPtr(DstReg, MRI, B)) 4183 return false; 4184 4185 MI.eraseFromParent(); 4186 return true; 4187 } 4188 4189 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 4190 MachineRegisterInfo &MRI, 4191 MachineIRBuilder &B, 4192 unsigned AddrSpace) const { 4193 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 4194 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 4195 Register Hi32 = Unmerge.getReg(1); 4196 4197 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 4198 MI.eraseFromParent(); 4199 return true; 4200 } 4201 4202 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 4203 // offset (the offset that is included in bounds checking and swizzling, to be 4204 // split between the instruction's voffset and immoffset fields) 
and soffset 4205 // (the offset that is excluded from bounds checking and swizzling, to go in 4206 // the instruction's soffset field). This function takes the first kind of 4207 // offset and figures out how to split it between voffset and immoffset. 4208 std::pair<Register, unsigned> 4209 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 4210 Register OrigOffset) const { 4211 const unsigned MaxImm = 4095; 4212 Register BaseReg; 4213 unsigned ImmOffset; 4214 const LLT S32 = LLT::scalar(32); 4215 MachineRegisterInfo &MRI = *B.getMRI(); 4216 4217 std::tie(BaseReg, ImmOffset) = 4218 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 4219 4220 // If BaseReg is a pointer, convert it to int. 4221 if (MRI.getType(BaseReg).isPointer()) 4222 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 4223 4224 // If the immediate value is too big for the immoffset field, put the value 4225 // and -4096 into the immoffset field so that the value that is copied/added 4226 // for the voffset field is a multiple of 4096, and it stands more chance 4227 // of being CSEd with the copy/add for another similar load/store. 4228 // However, do not do that rounding down to a multiple of 4096 if that is a 4229 // negative number, as it appears to be illegal to have a negative offset 4230 // in the vgpr, even if adding the immediate offset makes it positive. 4231 unsigned Overflow = ImmOffset & ~MaxImm; 4232 ImmOffset -= Overflow; 4233 if ((int32_t)Overflow < 0) { 4234 Overflow += ImmOffset; 4235 ImmOffset = 0; 4236 } 4237 4238 if (Overflow != 0) { 4239 if (!BaseReg) { 4240 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 4241 } else { 4242 auto OverflowVal = B.buildConstant(S32, Overflow); 4243 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 4244 } 4245 } 4246 4247 if (!BaseReg) 4248 BaseReg = B.buildConstant(S32, 0).getReg(0); 4249 4250 return std::make_pair(BaseReg, ImmOffset); 4251 } 4252 4253 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 4254 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 4255 Register VOffset, Register SOffset, 4256 unsigned ImmOffset, Register VIndex, 4257 MachineRegisterInfo &MRI) const { 4258 Optional<ValueAndVReg> MaybeVOffsetVal = 4259 getIConstantVRegValWithLookThrough(VOffset, MRI); 4260 Optional<ValueAndVReg> MaybeSOffsetVal = 4261 getIConstantVRegValWithLookThrough(SOffset, MRI); 4262 Optional<ValueAndVReg> MaybeVIndexVal = 4263 getIConstantVRegValWithLookThrough(VIndex, MRI); 4264 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 4265 // update the MMO with that offset. The stride is unknown so we can only do 4266 // this if VIndex is constant 0. 4267 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 4268 MaybeVIndexVal->Value == 0) { 4269 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 4270 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 4271 MMO->setOffset(TotalOffset); 4272 } else { 4273 // We don't have a constant combined offset to use in the MMO. Give up. 4274 MMO->setValue((Value *)nullptr); 4275 } 4276 } 4277 4278 /// Handle register layout difference for f16 images for some subtargets. 
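/// Subtargets with unpacked D16 VMEM want each 16-bit element widened into its
/// own 32-bit register, while packed subtargets keep two elements per dword
/// (with extra undef padding on those affected by the image store D16 bug).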
4279 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 4280 MachineRegisterInfo &MRI, 4281 Register Reg, 4282 bool ImageStore) const { 4283 const LLT S16 = LLT::scalar(16); 4284 const LLT S32 = LLT::scalar(32); 4285 LLT StoreVT = MRI.getType(Reg); 4286 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 4287 4288 if (ST.hasUnpackedD16VMem()) { 4289 auto Unmerge = B.buildUnmerge(S16, Reg); 4290 4291 SmallVector<Register, 4> WideRegs; 4292 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4293 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 4294 4295 int NumElts = StoreVT.getNumElements(); 4296 4297 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 4298 .getReg(0); 4299 } 4300 4301 if (ImageStore && ST.hasImageStoreD16Bug()) { 4302 if (StoreVT.getNumElements() == 2) { 4303 SmallVector<Register, 4> PackedRegs; 4304 Reg = B.buildBitcast(S32, Reg).getReg(0); 4305 PackedRegs.push_back(Reg); 4306 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 4307 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 4308 .getReg(0); 4309 } 4310 4311 if (StoreVT.getNumElements() == 3) { 4312 SmallVector<Register, 4> PackedRegs; 4313 auto Unmerge = B.buildUnmerge(S16, Reg); 4314 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4315 PackedRegs.push_back(Unmerge.getReg(I)); 4316 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 4317 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 4318 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 4319 } 4320 4321 if (StoreVT.getNumElements() == 4) { 4322 SmallVector<Register, 4> PackedRegs; 4323 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 4324 auto Unmerge = B.buildUnmerge(S32, Reg); 4325 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4326 PackedRegs.push_back(Unmerge.getReg(I)); 4327 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 4328 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 4329 .getReg(0); 4330 } 4331 4332 llvm_unreachable("invalid data type"); 4333 } 4334 4335 if (StoreVT == LLT::fixed_vector(3, S16)) { 4336 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 4337 .getReg(0); 4338 } 4339 return Reg; 4340 } 4341 4342 Register AMDGPULegalizerInfo::fixStoreSourceType( 4343 MachineIRBuilder &B, Register VData, bool IsFormat) const { 4344 MachineRegisterInfo *MRI = B.getMRI(); 4345 LLT Ty = MRI->getType(VData); 4346 4347 const LLT S16 = LLT::scalar(16); 4348 4349 // Fixup illegal register types for i8 stores. 
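// 8- and 16-bit scalars are any-extended to 32 bits here; the byte/short
// buffer store opcodes selected later only write the low bits, so the value of
// the extended high bits does not matter.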
4350 if (Ty == LLT::scalar(8) || Ty == S16) { 4351 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 4352 return AnyExt; 4353 } 4354 4355 if (Ty.isVector()) { 4356 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 4357 if (IsFormat) 4358 return handleD16VData(B, *MRI, VData); 4359 } 4360 } 4361 4362 return VData; 4363 } 4364 4365 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 4366 MachineRegisterInfo &MRI, 4367 MachineIRBuilder &B, 4368 bool IsTyped, 4369 bool IsFormat) const { 4370 Register VData = MI.getOperand(1).getReg(); 4371 LLT Ty = MRI.getType(VData); 4372 LLT EltTy = Ty.getScalarType(); 4373 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4374 const LLT S32 = LLT::scalar(32); 4375 4376 VData = fixStoreSourceType(B, VData, IsFormat); 4377 Register RSrc = MI.getOperand(2).getReg(); 4378 4379 MachineMemOperand *MMO = *MI.memoperands_begin(); 4380 const int MemSize = MMO->getSize(); 4381 4382 unsigned ImmOffset; 4383 4384 // The typed intrinsics add an immediate after the registers. 4385 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4386 4387 // The struct intrinsic variants add one additional operand over raw. 4388 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4389 Register VIndex; 4390 int OpOffset = 0; 4391 if (HasVIndex) { 4392 VIndex = MI.getOperand(3).getReg(); 4393 OpOffset = 1; 4394 } else { 4395 VIndex = B.buildConstant(S32, 0).getReg(0); 4396 } 4397 4398 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4399 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4400 4401 unsigned Format = 0; 4402 if (IsTyped) { 4403 Format = MI.getOperand(5 + OpOffset).getImm(); 4404 ++OpOffset; 4405 } 4406 4407 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4408 4409 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4410 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4411 4412 unsigned Opc; 4413 if (IsTyped) { 4414 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4415 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4416 } else if (IsFormat) { 4417 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4418 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4419 } else { 4420 switch (MemSize) { 4421 case 1: 4422 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4423 break; 4424 case 2: 4425 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4426 break; 4427 default: 4428 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4429 break; 4430 } 4431 } 4432 4433 auto MIB = B.buildInstr(Opc) 4434 .addUse(VData) // vdata 4435 .addUse(RSrc) // rsrc 4436 .addUse(VIndex) // vindex 4437 .addUse(VOffset) // voffset 4438 .addUse(SOffset) // soffset 4439 .addImm(ImmOffset); // offset(imm) 4440 4441 if (IsTyped) 4442 MIB.addImm(Format); 4443 4444 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4445 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4446 .addMemOperand(MMO); 4447 4448 MI.eraseFromParent(); 4449 return true; 4450 } 4451 4452 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4453 MachineRegisterInfo &MRI, 4454 MachineIRBuilder &B, 4455 bool IsFormat, 4456 bool IsTyped) const { 4457 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4458 MachineMemOperand *MMO = *MI.memoperands_begin(); 4459 const LLT MemTy = MMO->getMemoryType(); 4460 const LLT S32 = LLT::scalar(32); 4461 4462 Register Dst = MI.getOperand(0).getReg(); 4463 Register RSrc = MI.getOperand(2).getReg(); 4464 4465 // The typed intrinsics add an immediate after the registers. 
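// Operand layout at this point: dst, intrinsic ID, rsrc, [vindex], voffset,
// soffset, [format], aux; the struct variants carry vindex and the typed
// (tbuffer) variants carry the extra format immediate.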
4466 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4467 4468 // The struct intrinsic variants add one additional operand over raw. 4469 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4470 Register VIndex; 4471 int OpOffset = 0; 4472 if (HasVIndex) { 4473 VIndex = MI.getOperand(3).getReg(); 4474 OpOffset = 1; 4475 } else { 4476 VIndex = B.buildConstant(S32, 0).getReg(0); 4477 } 4478 4479 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4480 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4481 4482 unsigned Format = 0; 4483 if (IsTyped) { 4484 Format = MI.getOperand(5 + OpOffset).getImm(); 4485 ++OpOffset; 4486 } 4487 4488 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4489 unsigned ImmOffset; 4490 4491 LLT Ty = MRI.getType(Dst); 4492 LLT EltTy = Ty.getScalarType(); 4493 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4494 const bool Unpacked = ST.hasUnpackedD16VMem(); 4495 4496 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4497 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4498 4499 unsigned Opc; 4500 4501 if (IsTyped) { 4502 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4503 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4504 } else if (IsFormat) { 4505 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4506 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4507 } else { 4508 switch (MemTy.getSizeInBits()) { 4509 case 8: 4510 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4511 break; 4512 case 16: 4513 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4514 break; 4515 default: 4516 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4517 break; 4518 } 4519 } 4520 4521 Register LoadDstReg; 4522 4523 bool IsExtLoad = 4524 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4525 LLT UnpackedTy = Ty.changeElementSize(32); 4526 4527 if (IsExtLoad) 4528 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4529 else if (Unpacked && IsD16 && Ty.isVector()) 4530 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4531 else 4532 LoadDstReg = Dst; 4533 4534 auto MIB = B.buildInstr(Opc) 4535 .addDef(LoadDstReg) // vdata 4536 .addUse(RSrc) // rsrc 4537 .addUse(VIndex) // vindex 4538 .addUse(VOffset) // voffset 4539 .addUse(SOffset) // soffset 4540 .addImm(ImmOffset); // offset(imm) 4541 4542 if (IsTyped) 4543 MIB.addImm(Format); 4544 4545 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4546 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4547 .addMemOperand(MMO); 4548 4549 if (LoadDstReg != Dst) { 4550 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4551 4552 // The result was widened for the extending load; truncate it back down. 4553 if (IsExtLoad) 4554 B.buildTrunc(Dst, LoadDstReg); 4555 else { 4556 // Repack to original 16-bit vector result 4557 // FIXME: G_TRUNC should work, but legalization currently fails 4558 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4559 SmallVector<Register, 4> Repack; 4560 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4561 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4562 B.buildMerge(Dst, Repack); 4563 } 4564 } 4565 4566 MI.eraseFromParent(); 4567 return true; 4568 } 4569 4570 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4571 MachineIRBuilder &B, 4572 bool IsInc) const { 4573 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4574 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4575 B.buildInstr(Opc) 4576 .addDef(MI.getOperand(0).getReg()) 4577 .addUse(MI.getOperand(2).getReg()) 4578 .addUse(MI.getOperand(3).getReg()) 4579 .cloneMemRefs(MI); 4580 MI.eraseFromParent(); 4581 return true; 4582 } 4583 4584 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4585 switch (IntrID) { 4586 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4587 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4588 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4589 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4590 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4591 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4592 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4593 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4594 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4595 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4596 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4597 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4598 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4599 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4600 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4601 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4602 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4603 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4604 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4605 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4606 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4607 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4608 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4609 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4610 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4611 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4612 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4613 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4614 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4615 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4616 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4617 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4618 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4619 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4620 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4621 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4622 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4623 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4624 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4625 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4626 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4627 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4628 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4629 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4630 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4631 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4632 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4633 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4634 default: 4635 llvm_unreachable("unhandled atomic opcode"); 4636 } 4637 } 4638 4639 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4640 MachineIRBuilder &B, 4641 Intrinsic::ID IID) const { 4642 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4643 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4644 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4645 4646 Register Dst; 4647 4648 int OpOffset = 0; 4649 if (HasReturn) { 4650 // A few FP atomics do not support return values. 
4651 Dst = MI.getOperand(0).getReg(); 4652 } else { 4653 OpOffset = -1; 4654 } 4655 4656 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4657 Register CmpVal; 4658 4659 if (IsCmpSwap) { 4660 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4661 ++OpOffset; 4662 } 4663 4664 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4665 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4666 4667 // The struct intrinsic variants add one additional operand over raw. 4668 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4669 Register VIndex; 4670 if (HasVIndex) { 4671 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4672 ++OpOffset; 4673 } else { 4674 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4675 } 4676 4677 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4678 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4679 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4680 4681 MachineMemOperand *MMO = *MI.memoperands_begin(); 4682 4683 unsigned ImmOffset; 4684 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4685 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4686 4687 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4688 4689 if (HasReturn) 4690 MIB.addDef(Dst); 4691 4692 MIB.addUse(VData); // vdata 4693 4694 if (IsCmpSwap) 4695 MIB.addReg(CmpVal); 4696 4697 MIB.addUse(RSrc) // rsrc 4698 .addUse(VIndex) // vindex 4699 .addUse(VOffset) // voffset 4700 .addUse(SOffset) // soffset 4701 .addImm(ImmOffset) // offset(imm) 4702 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4703 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4704 .addMemOperand(MMO); 4705 4706 MI.eraseFromParent(); 4707 return true; 4708 } 4709 4710 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4711 /// vector with s16 typed elements. 4712 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4713 SmallVectorImpl<Register> &PackedAddrs, 4714 unsigned ArgOffset, 4715 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4716 bool IsA16, bool IsG16) { 4717 const LLT S16 = LLT::scalar(16); 4718 const LLT V2S16 = LLT::fixed_vector(2, 16); 4719 auto EndIdx = Intr->VAddrEnd; 4720 4721 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4722 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4723 if (!SrcOp.isReg()) 4724 continue; // _L to _LZ may have eliminated this. 4725 4726 Register AddrReg = SrcOp.getReg(); 4727 4728 if ((I < Intr->GradientStart) || 4729 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4730 (I >= Intr->CoordStart && !IsA16)) { 4731 if ((I < Intr->GradientStart) && IsA16 && 4732 (B.getMRI()->getType(AddrReg) == S16)) { 4733 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4734 // Special handling of bias when A16 is on. Bias is of type half but 4735 // occupies full 32-bit. 4736 PackedAddrs.push_back( 4737 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4738 .getReg(0)); 4739 } else { 4740 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4741 "Bias needs to be converted to 16 bit in A16 mode"); 4742 // Handle any gradient or coordinate operands that should not be packed 4743 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4744 PackedAddrs.push_back(AddrReg); 4745 } 4746 } else { 4747 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4748 // derivatives dx/dh and dx/dv are packed with undef. 
4749 if (((I + 1) >= EndIdx) || 4750 ((Intr->NumGradients / 2) % 2 == 1 && 4751 (I == static_cast<unsigned>(Intr->GradientStart + 4752 (Intr->NumGradients / 2) - 1) || 4753 I == static_cast<unsigned>(Intr->GradientStart + 4754 Intr->NumGradients - 1))) || 4755 // Check for _L to _LZ optimization 4756 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4757 PackedAddrs.push_back( 4758 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4759 .getReg(0)); 4760 } else { 4761 PackedAddrs.push_back( 4762 B.buildBuildVector( 4763 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4764 .getReg(0)); 4765 ++I; 4766 } 4767 } 4768 } 4769 } 4770 4771 /// Convert from separate vaddr components to a single vector address register, 4772 /// and replace the remaining operands with $noreg. 4773 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4774 int DimIdx, int NumVAddrs) { 4775 const LLT S32 = LLT::scalar(32); 4776 4777 SmallVector<Register, 8> AddrRegs; 4778 for (int I = 0; I != NumVAddrs; ++I) { 4779 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4780 if (SrcOp.isReg()) { 4781 AddrRegs.push_back(SrcOp.getReg()); 4782 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4783 } 4784 } 4785 4786 int NumAddrRegs = AddrRegs.size(); 4787 if (NumAddrRegs != 1) { 4788 // Above 8 elements round up to next power of 2 (i.e. 16). 4789 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4790 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4791 auto Undef = B.buildUndef(S32); 4792 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4793 NumAddrRegs = RoundedNumRegs; 4794 } 4795 4796 auto VAddr = 4797 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4798 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4799 } 4800 4801 for (int I = 1; I != NumVAddrs; ++I) { 4802 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4803 if (SrcOp.isReg()) 4804 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4805 } 4806 } 4807 4808 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4809 /// 4810 /// Depending on the subtarget, load/store with 16-bit element data need to be 4811 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4812 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4813 /// registers. 4814 /// 4815 /// We don't want to directly select image instructions just yet, but also want 4816 /// to expose all register repacking to the legalizer/combiners. We also don't 4817 /// want a selected instruction entering RegBankSelect. In order to avoid 4818 /// defining a multitude of intermediate image instructions, directly hack on 4819 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4820 /// padding now-unnecessary arguments with $noreg. 4821 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4822 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4823 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4824 4825 const unsigned NumDefs = MI.getNumExplicitDefs(); 4826 const unsigned ArgOffset = NumDefs + 1; 4827 bool IsTFE = NumDefs == 2; 4828 // We are only processing the operands of d16 image operations on subtargets 4829 // that use the unpacked register layout, or need to repack the TFE result. 4830 4831 // TODO: Do we need to guard against already legalized intrinsics?
4832 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4833 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4834 4835 MachineRegisterInfo *MRI = B.getMRI(); 4836 const LLT S32 = LLT::scalar(32); 4837 const LLT S16 = LLT::scalar(16); 4838 const LLT V2S16 = LLT::fixed_vector(2, 16); 4839 4840 unsigned DMask = 0; 4841 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4842 LLT Ty = MRI->getType(VData); 4843 4844 // Check for 16 bit addresses and pack if true. 4845 LLT GradTy = 4846 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4847 LLT AddrTy = 4848 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4849 const bool IsG16 = GradTy == S16; 4850 const bool IsA16 = AddrTy == S16; 4851 const bool IsD16 = Ty.getScalarType() == S16; 4852 4853 int DMaskLanes = 0; 4854 if (!BaseOpcode->Atomic) { 4855 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4856 if (BaseOpcode->Gather4) { 4857 DMaskLanes = 4; 4858 } else if (DMask != 0) { 4859 DMaskLanes = countPopulation(DMask); 4860 } else if (!IsTFE && !BaseOpcode->Store) { 4861 // If dmask is 0, this is a no-op load. This can be eliminated. 4862 B.buildUndef(MI.getOperand(0)); 4863 MI.eraseFromParent(); 4864 return true; 4865 } 4866 } 4867 4868 Observer.changingInstr(MI); 4869 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4870 4871 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4872 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4873 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4874 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4875 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4876 4877 // Track that we legalized this 4878 MI.setDesc(B.getTII().get(NewOpcode)); 4879 4880 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4881 // dmask to be at least 1 otherwise the instruction will fail 4882 if (IsTFE && DMask == 0) { 4883 DMask = 0x1; 4884 DMaskLanes = 1; 4885 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4886 } 4887 4888 if (BaseOpcode->Atomic) { 4889 Register VData0 = MI.getOperand(2).getReg(); 4890 LLT Ty = MRI->getType(VData0); 4891 4892 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4893 if (Ty.isVector()) 4894 return false; 4895 4896 if (BaseOpcode->AtomicX2) { 4897 Register VData1 = MI.getOperand(3).getReg(); 4898 // The two values are packed in one register. 4899 LLT PackedTy = LLT::fixed_vector(2, Ty); 4900 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4901 MI.getOperand(2).setReg(Concat.getReg(0)); 4902 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4903 } 4904 } 4905 4906 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4907 4908 // Rewrite the addressing register layout before doing anything else. 
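// When A16/G16 apply, 16-bit coordinates and gradients are packed two per
// dword below; otherwise each address component keeps its own 32-bit register.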
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit.
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // TODO: we can actually allow partial NSA where the final register is a
    // contiguous set of the remaining addresses.
    // This could help where there are more addresses than supported.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

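  // Only the components enabled in the dmask are actually loaded, e.g. a
  // <4 x s32> result with dmask = 0b0101 has DMaskLanes = 2; the data type of
  // the memory operation is shrunk accordingly and the missing elements are
  // padded with undef when rebuilding the full result below.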
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

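  // At this point ResultRegs holds the raw data registers (of type RegTy, or
  // the whole load result in the single-register case); what remains is
  // reassembling them into the originally requested result type Ty.
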
  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.removeOperand(1); // Remove intrinsic ID

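  // Attach a memory operand describing the constant fetch: the size is the
  // result width rounded up to whole bytes (e.g. s96 -> 12 bytes), with a
  // fixed 4-byte alignment.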
  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
    switch (*HsaAbiVer) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      return legalizeTrapHsaQueuePtr(MI, MRI, B);
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
      return ST.supportsGetDoorbellID() ?
          legalizeTrapHsa(MI, MRI, B) :
          legalizeTrapHsaQueuePtr(MI, MRI, B);
    }
  }

  llvm_unreachable("Unknown trap handler");
}

bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

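    // Build the address of queue_ptr within the implicit kernarg area and load
    // it; the trap handler expects the queue pointer in SGPR0_SGPR1.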
    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

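// Trap lowering for subtargets that can retrieve the doorbell ID themselves
// (see legalizeTrapIntrinsic above): no queue pointer is passed in
// SGPR0_SGPR1, a bare S_TRAP with the HSA trap ID suffices.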
bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
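  // Address dword budget: node_ptr (1 dword, 2 if 64-bit), ray_extent (1),
  // ray_origin (3), and ray_dir + ray_inv_dir (6 dwords, or 3 when A16 packs
  // the halves pairwise), giving the 8/9/11/12 totals below.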
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(
        BaseOpcodes[Is64][IsA16],
        IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMerge(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMerge(
          V3S32,
          {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
                                                    UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
                                                    UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
                                                    UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMerge(R3,
                   {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

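  // Emit the target pseudo: dst, the resolved MIMG opcode, the prepared vaddr
  // registers (or the single merged vector in the non-NSA case), then the
  // texture descriptor and an A16 flag; memory refs are cloned from the
  // original intrinsic.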
  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

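      // The pseudo above takes over the branch to UncondBrTarget; the original
      // conditional target must still be reachable, so retarget the existing
      // G_BR if there is one, or insert a new unconditional branch.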
      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}