//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
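// For example, getPow2VectorType turns <3 x s16> into <4 x s16> and
// getPow2ScalarType turns s48 into s64; both round up with Log2_32_Ceil, so
// types that are already a power of two are returned unchanged.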
/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}
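// Worked examples for the two mutations above: moreEltsToNext32Bit pads a
// <3 x s16> (48 bits, so NextMul32 == 2) out to <4 x s16>, and a <3 x s8> out
// to <4 x s8>. fewerEltsToSize64Vector splits by total size, e.g. <4 x s32>
// (128 bits, two 64-bit pieces) becomes <2 x s32>.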
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
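// Example of how these limits play out: a 256-bit global or constant load
// (e.g. v8s32) fits under the 512-bit load limit and is accepted here, while
// a 256-bit global store exceeds the 128-bit store limit and gets split by
// the load/store rules below.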
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
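// For example, with the default legality rules a <6 x s16> (96-bit) load hits
// loadStoreBitcastWorkaround (vector wider than 64 bits with 16-bit elements),
// so shouldBitcastLoadStoreType returns true and the access is rewritten as a
// <3 x s32> load via getBitcastRegisterType.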
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);
  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();
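    // In both of the branches above, sub-16-bit integer ops are handled by
    // clamping: e.g. a G_ADD on s8 is widened to s16. On targets without
    // 16-bit instructions (the final else below), everything narrower than
    // 32 bits is widened to s32 instead.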
    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);
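  // For example, an s64 (or s16/v2s16) G_FSUB is rewritten here as
  // G_FADD(x, G_FNEG(y)); only the 32-bit fsub is selected directly.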
  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16
                                                       : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
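  // When unaligned buffer access is enabled these are 0, i.e. the global and
  // constant entries listed below effectively carry no minimum alignment
  // requirement; otherwise the listed 32-, 16-, or 8-bit minimum applies.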
  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(
                    0, LLT::fixed_vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}})
    .legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
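  // The big-type widening above rounds odd sizes up, e.g. an s88 widens to
  // s128 (next power of two) while an s300 widens to s320 (the next multiple
  // of 64, which beats the next power of two, 512).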
  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
TargetOpcode::G_FMAXNUM_IEEE: 1716 return legalizeMinNumMaxNum(Helper, MI); 1717 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1718 return legalizeExtractVectorElt(MI, MRI, B); 1719 case TargetOpcode::G_INSERT_VECTOR_ELT: 1720 return legalizeInsertVectorElt(MI, MRI, B); 1721 case TargetOpcode::G_SHUFFLE_VECTOR: 1722 return legalizeShuffleVector(MI, MRI, B); 1723 case TargetOpcode::G_FSIN: 1724 case TargetOpcode::G_FCOS: 1725 return legalizeSinCos(MI, MRI, B); 1726 case TargetOpcode::G_GLOBAL_VALUE: 1727 return legalizeGlobalValue(MI, MRI, B); 1728 case TargetOpcode::G_LOAD: 1729 case TargetOpcode::G_SEXTLOAD: 1730 case TargetOpcode::G_ZEXTLOAD: 1731 return legalizeLoad(Helper, MI); 1732 case TargetOpcode::G_FMAD: 1733 return legalizeFMad(MI, MRI, B); 1734 case TargetOpcode::G_FDIV: 1735 return legalizeFDIV(MI, MRI, B); 1736 case TargetOpcode::G_UDIV: 1737 case TargetOpcode::G_UREM: 1738 case TargetOpcode::G_UDIVREM: 1739 return legalizeUnsignedDIV_REM(MI, MRI, B); 1740 case TargetOpcode::G_SDIV: 1741 case TargetOpcode::G_SREM: 1742 case TargetOpcode::G_SDIVREM: 1743 return legalizeSignedDIV_REM(MI, MRI, B); 1744 case TargetOpcode::G_ATOMIC_CMPXCHG: 1745 return legalizeAtomicCmpXChg(MI, MRI, B); 1746 case TargetOpcode::G_FLOG: 1747 return legalizeFlog(MI, B, numbers::ln2f); 1748 case TargetOpcode::G_FLOG10: 1749 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1750 case TargetOpcode::G_FEXP: 1751 return legalizeFExp(MI, B); 1752 case TargetOpcode::G_FPOW: 1753 return legalizeFPow(MI, B); 1754 case TargetOpcode::G_FFLOOR: 1755 return legalizeFFloor(MI, MRI, B); 1756 case TargetOpcode::G_BUILD_VECTOR: 1757 return legalizeBuildVector(MI, MRI, B); 1758 default: 1759 return false; 1760 } 1761 1762 llvm_unreachable("expected switch to return"); 1763 } 1764 1765 Register AMDGPULegalizerInfo::getSegmentAperture( 1766 unsigned AS, 1767 MachineRegisterInfo &MRI, 1768 MachineIRBuilder &B) const { 1769 MachineFunction &MF = B.getMF(); 1770 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1771 const LLT S32 = LLT::scalar(32); 1772 1773 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1774 1775 if (ST.hasApertureRegs()) { 1776 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1777 // getreg. 1778 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1779 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1780 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1781 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1782 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1783 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1784 unsigned Encoding = 1785 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1786 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1787 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1788 1789 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1790 1791 B.buildInstr(AMDGPU::S_GETREG_B32) 1792 .addDef(GetReg) 1793 .addImm(Encoding); 1794 MRI.setType(GetReg, S32); 1795 1796 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1797 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1798 } 1799 1800 Register QueuePtr = MRI.createGenericVirtualRegister( 1801 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1802 1803 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1804 return Register(); 1805 1806 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1807 // private_segment_aperture_base_hi. 1808 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 
0x40 : 0x44; 1809 1810 // TODO: can we be smarter about machine pointer info? 1811 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1812 MachineMemOperand *MMO = MF.getMachineMemOperand( 1813 PtrInfo, 1814 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1815 MachineMemOperand::MOInvariant, 1816 4, commonAlignment(Align(64), StructOffset)); 1817 1818 Register LoadAddr; 1819 1820 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1821 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1822 } 1823 1824 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1825 MachineInstr &MI, MachineRegisterInfo &MRI, 1826 MachineIRBuilder &B) const { 1827 MachineFunction &MF = B.getMF(); 1828 1829 const LLT S32 = LLT::scalar(32); 1830 Register Dst = MI.getOperand(0).getReg(); 1831 Register Src = MI.getOperand(1).getReg(); 1832 1833 LLT DstTy = MRI.getType(Dst); 1834 LLT SrcTy = MRI.getType(Src); 1835 unsigned DestAS = DstTy.getAddressSpace(); 1836 unsigned SrcAS = SrcTy.getAddressSpace(); 1837 1838 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1839 // vector element. 1840 assert(!DstTy.isVector()); 1841 1842 const AMDGPUTargetMachine &TM 1843 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1844 1845 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1846 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1847 return true; 1848 } 1849 1850 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1851 // Truncate. 1852 B.buildExtract(Dst, Src, 0); 1853 MI.eraseFromParent(); 1854 return true; 1855 } 1856 1857 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1858 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1859 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1860 1861 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1862 // another. Merge operands are required to be the same type, but creating an 1863 // extra ptrtoint would be kind of pointless. 1864 auto HighAddr = B.buildConstant( 1865 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1866 B.buildMerge(Dst, {Src, HighAddr}); 1867 MI.eraseFromParent(); 1868 return true; 1869 } 1870 1871 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1872 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1873 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1874 unsigned NullVal = TM.getNullPointerValue(DestAS); 1875 1876 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1877 auto FlatNull = B.buildConstant(SrcTy, 0); 1878 1879 // Extract low 32-bits of the pointer. 1880 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1881 1882 auto CmpRes = 1883 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1884 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1885 1886 MI.eraseFromParent(); 1887 return true; 1888 } 1889 1890 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1891 return false; 1892 1893 if (!ST.hasFlatAddressSpace()) 1894 return false; 1895 1896 auto SegmentNull = 1897 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1898 auto FlatNull = 1899 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1900 1901 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1902 if (!ApertureReg.isValid()) 1903 return false; 1904 1905 auto CmpRes = 1906 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1907 1908 // Coerce the type of the low half of the result so we can use merge_values. 
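  // Putting it together, the expansion built below is roughly (illustrative
  // pseudo-MIR, not the exact output):
  //   %lo:_(s32) = G_PTRTOINT %src
  //   %ptr:_(p0) = G_MERGE_VALUES %lo(s32), %aperture(s32)
  //   %dst:_(p0) = G_SELECT %src != segment_null, %ptr, %flat_null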
1909 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1910 1911 // TODO: Should we allow mismatched types but matching sizes in merges to 1912 // avoid the ptrtoint? 1913 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1914 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1915 1916 MI.eraseFromParent(); 1917 return true; 1918 } 1919 1920 bool AMDGPULegalizerInfo::legalizeFrint( 1921 MachineInstr &MI, MachineRegisterInfo &MRI, 1922 MachineIRBuilder &B) const { 1923 Register Src = MI.getOperand(1).getReg(); 1924 LLT Ty = MRI.getType(Src); 1925 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1926 1927 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1928 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1929 1930 auto C1 = B.buildFConstant(Ty, C1Val); 1931 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1932 1933 // TODO: Should this propagate fast-math-flags? 1934 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1935 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1936 1937 auto C2 = B.buildFConstant(Ty, C2Val); 1938 auto Fabs = B.buildFAbs(Ty, Src); 1939 1940 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1941 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1942 MI.eraseFromParent(); 1943 return true; 1944 } 1945 1946 bool AMDGPULegalizerInfo::legalizeFceil( 1947 MachineInstr &MI, MachineRegisterInfo &MRI, 1948 MachineIRBuilder &B) const { 1949 1950 const LLT S1 = LLT::scalar(1); 1951 const LLT S64 = LLT::scalar(64); 1952 1953 Register Src = MI.getOperand(1).getReg(); 1954 assert(MRI.getType(Src) == S64); 1955 1956 // result = trunc(src) 1957 // if (src > 0.0 && src != result) 1958 // result += 1.0 1959 1960 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1961 1962 const auto Zero = B.buildFConstant(S64, 0.0); 1963 const auto One = B.buildFConstant(S64, 1.0); 1964 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1965 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1966 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1967 auto Add = B.buildSelect(S64, And, One, Zero); 1968 1969 // TODO: Should this propagate fast-math-flags? 
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2068 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2069 B.buildUITOFP(S64, Unmerge.getReg(1)); 2070 2071 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2072 2073 auto ThirtyTwo = B.buildConstant(S32, 32); 2074 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2075 .addUse(CvtHi.getReg(0)) 2076 .addUse(ThirtyTwo.getReg(0)); 2077 2078 // TODO: Should this propagate fast-math-flags? 2079 B.buildFAdd(Dst, LdExp, CvtLo); 2080 MI.eraseFromParent(); 2081 return true; 2082 } 2083 2084 // TODO: Copied from DAG implementation. Verify logic and document how this 2085 // actually works. 2086 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2087 MachineRegisterInfo &MRI, 2088 MachineIRBuilder &B, 2089 bool Signed) const { 2090 2091 Register Dst = MI.getOperand(0).getReg(); 2092 Register Src = MI.getOperand(1).getReg(); 2093 2094 const LLT S64 = LLT::scalar(64); 2095 const LLT S32 = LLT::scalar(32); 2096 2097 const LLT SrcLT = MRI.getType(Src); 2098 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2099 2100 unsigned Flags = MI.getFlags(); 2101 2102 // The basic idea of converting a floating point number into a pair of 32-bit 2103 // integers is illustrated as follows: 2104 // 2105 // tf := trunc(val); 2106 // hif := floor(tf * 2^-32); 2107 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2108 // hi := fptoi(hif); 2109 // lo := fptoi(lof); 2110 // 2111 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2112 MachineInstrBuilder Sign; 2113 if (Signed && SrcLT == S32) { 2114 // However, a 32-bit floating point number has only 23 bits mantissa and 2115 // it's not enough to hold all the significant bits of `lof` if val is 2116 // negative. To avoid the loss of precision, We need to take the absolute 2117 // value after truncating and flip the result back based on the original 2118 // signedness. 2119 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2120 Trunc = B.buildFAbs(S32, Trunc, Flags); 2121 } 2122 MachineInstrBuilder K0, K1; 2123 if (SrcLT == S64) { 2124 K0 = B.buildFConstant(S64, 2125 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2126 K1 = B.buildFConstant(S64, 2127 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2128 } else { 2129 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2130 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2131 } 2132 2133 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2134 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2135 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2136 2137 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2138 : B.buildFPTOUI(S32, FloorMul); 2139 auto Lo = B.buildFPTOUI(S32, Fma); 2140 2141 if (Signed && SrcLT == S32) { 2142 // Flip the result based on the signedness, which is either all 0s or 1s. 
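    // With Sign being all zeros or all ones, (x ^ Sign) - Sign negates x
    // exactly when Sign == -1 and leaves it unchanged otherwise, e.g. for
    // x = 5, Sign = -1: (5 ^ -1) - (-1) = -6 + 1 = -5.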
2143 Sign = B.buildMerge(S64, {Sign, Sign}); 2144 // r := xor({lo, hi}, sign) - sign; 2145 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2146 } else 2147 B.buildMerge(Dst, {Lo, Hi}); 2148 MI.eraseFromParent(); 2149 2150 return true; 2151 } 2152 2153 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2154 MachineInstr &MI) const { 2155 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2156 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2157 2158 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2159 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2160 2161 // With ieee_mode disabled, the instructions have the correct behavior 2162 // already for G_FMINNUM/G_FMAXNUM 2163 if (!MFI->getMode().IEEE) 2164 return !IsIEEEOp; 2165 2166 if (IsIEEEOp) 2167 return true; 2168 2169 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2170 } 2171 2172 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2173 MachineInstr &MI, MachineRegisterInfo &MRI, 2174 MachineIRBuilder &B) const { 2175 // TODO: Should move some of this into LegalizerHelper. 2176 2177 // TODO: Promote dynamic indexing of s16 to s32 2178 2179 // FIXME: Artifact combiner probably should have replaced the truncated 2180 // constant before this, so we shouldn't need 2181 // getConstantVRegValWithLookThrough. 2182 Optional<ValueAndVReg> MaybeIdxVal = 2183 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2184 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2185 return true; 2186 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2187 2188 Register Dst = MI.getOperand(0).getReg(); 2189 Register Vec = MI.getOperand(1).getReg(); 2190 2191 LLT VecTy = MRI.getType(Vec); 2192 LLT EltTy = VecTy.getElementType(); 2193 assert(EltTy == MRI.getType(Dst)); 2194 2195 if (IdxVal < VecTy.getNumElements()) 2196 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2197 else 2198 B.buildUndef(Dst); 2199 2200 MI.eraseFromParent(); 2201 return true; 2202 } 2203 2204 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2205 MachineInstr &MI, MachineRegisterInfo &MRI, 2206 MachineIRBuilder &B) const { 2207 // TODO: Should move some of this into LegalizerHelper. 2208 2209 // TODO: Promote dynamic indexing of s16 to s32 2210 2211 // FIXME: Artifact combiner probably should have replaced the truncated 2212 // constant before this, so we shouldn't need 2213 // getConstantVRegValWithLookThrough. 2214 Optional<ValueAndVReg> MaybeIdxVal = 2215 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2216 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2217 return true; 2218 2219 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2220 Register Dst = MI.getOperand(0).getReg(); 2221 Register Vec = MI.getOperand(1).getReg(); 2222 Register Ins = MI.getOperand(2).getReg(); 2223 2224 LLT VecTy = MRI.getType(Vec); 2225 LLT EltTy = VecTy.getElementType(); 2226 assert(EltTy == MRI.getType(Ins)); 2227 2228 if (IdxVal < VecTy.getNumElements()) 2229 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2230 else 2231 B.buildUndef(Dst); 2232 2233 MI.eraseFromParent(); 2234 return true; 2235 } 2236 2237 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2238 MachineInstr &MI, MachineRegisterInfo &MRI, 2239 MachineIRBuilder &B) const { 2240 const LLT V2S16 = LLT::fixed_vector(2, 16); 2241 2242 Register Dst = MI.getOperand(0).getReg(); 2243 Register Src0 = MI.getOperand(1).getReg(); 2244 LLT DstTy = MRI.getType(Dst); 2245 LLT SrcTy = MRI.getType(Src0); 2246 2247 if (SrcTy == V2S16 && DstTy == V2S16 && 2248 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2249 return true; 2250 2251 MachineIRBuilder HelperBuilder(MI); 2252 GISelObserverWrapper DummyObserver; 2253 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2254 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2255 } 2256 2257 bool AMDGPULegalizerInfo::legalizeSinCos( 2258 MachineInstr &MI, MachineRegisterInfo &MRI, 2259 MachineIRBuilder &B) const { 2260 2261 Register DstReg = MI.getOperand(0).getReg(); 2262 Register SrcReg = MI.getOperand(1).getReg(); 2263 LLT Ty = MRI.getType(DstReg); 2264 unsigned Flags = MI.getFlags(); 2265 2266 Register TrigVal; 2267 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2268 if (ST.hasTrigReducedRange()) { 2269 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2270 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2271 .addUse(MulVal.getReg(0)) 2272 .setMIFlags(Flags).getReg(0); 2273 } else 2274 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2275 2276 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2277 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2278 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2279 .addUse(TrigVal) 2280 .setMIFlags(Flags); 2281 MI.eraseFromParent(); 2282 return true; 2283 } 2284 2285 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2286 MachineIRBuilder &B, 2287 const GlobalValue *GV, 2288 int64_t Offset, 2289 unsigned GAFlags) const { 2290 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2291 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2292 // to the following code sequence: 2293 // 2294 // For constant address space: 2295 // s_getpc_b64 s[0:1] 2296 // s_add_u32 s0, s0, $symbol 2297 // s_addc_u32 s1, s1, 0 2298 // 2299 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2300 // a fixup or relocation is emitted to replace $symbol with a literal 2301 // constant, which is a pc-relative offset from the encoding of the $symbol 2302 // operand to the global variable. 
2303 // 2304 // For global address space: 2305 // s_getpc_b64 s[0:1] 2306 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2307 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2308 // 2309 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2310 // fixups or relocations are emitted to replace $symbol@*@lo and 2311 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2312 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2313 // operand to the global variable. 2314 // 2315 // What we want here is an offset from the value returned by s_getpc 2316 // (which is the address of the s_add_u32 instruction) to the global 2317 // variable, but since the encoding of $symbol starts 4 bytes after the start 2318 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2319 // small. This requires us to add 4 to the global variable offset in order to 2320 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2321 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2322 // instruction. 2323 2324 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2325 2326 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2327 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2328 2329 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2330 .addDef(PCReg); 2331 2332 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2333 if (GAFlags == SIInstrInfo::MO_NONE) 2334 MIB.addImm(0); 2335 else 2336 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2337 2338 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2339 2340 if (PtrTy.getSizeInBits() == 32) 2341 B.buildExtract(DstReg, PCReg, 0); 2342 return true; 2343 } 2344 2345 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2346 MachineInstr &MI, MachineRegisterInfo &MRI, 2347 MachineIRBuilder &B) const { 2348 Register DstReg = MI.getOperand(0).getReg(); 2349 LLT Ty = MRI.getType(DstReg); 2350 unsigned AS = Ty.getAddressSpace(); 2351 2352 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2353 MachineFunction &MF = B.getMF(); 2354 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2355 2356 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2357 if (!MFI->isModuleEntryFunction() && 2358 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2359 const Function &Fn = MF.getFunction(); 2360 DiagnosticInfoUnsupported BadLDSDecl( 2361 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2362 DS_Warning); 2363 Fn.getContext().diagnose(BadLDSDecl); 2364 2365 // We currently don't have a way to correctly allocate LDS objects that 2366 // aren't directly associated with a kernel. We do force inlining of 2367 // functions that use local objects. However, if these dead functions are 2368 // not eliminated, we don't want a compile time error. Just emit a warning 2369 // and a trap, since there should be no callable path here. 2370 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2371 B.buildUndef(DstReg); 2372 MI.eraseFromParent(); 2373 return true; 2374 } 2375 2376 // TODO: We could emit code to handle the initialization somewhere. 
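    // An LDS global without a defined initializer is lowered to a constant
    // byte offset into the kernel's LDS block (assigned by allocateLDSGlobal
    // below); the dynamic `extern __shared__` case instead resolves to the
    // total static LDS size via amdgcn.groupstaticsize, since dynamic LDS is
    // laid out after all of the statically allocated LDS.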
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory whose size is not known at compile time. These arrays are
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2444 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2445 B.buildExtract(DstReg, Load, 0); 2446 } else 2447 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2448 2449 MI.eraseFromParent(); 2450 return true; 2451 } 2452 2453 static LLT widenToNextPowerOf2(LLT Ty) { 2454 if (Ty.isVector()) 2455 return Ty.changeElementCount( 2456 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2457 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2458 } 2459 2460 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2461 MachineInstr &MI) const { 2462 MachineIRBuilder &B = Helper.MIRBuilder; 2463 MachineRegisterInfo &MRI = *B.getMRI(); 2464 GISelChangeObserver &Observer = Helper.Observer; 2465 2466 Register PtrReg = MI.getOperand(1).getReg(); 2467 LLT PtrTy = MRI.getType(PtrReg); 2468 unsigned AddrSpace = PtrTy.getAddressSpace(); 2469 2470 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2471 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2472 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2473 Observer.changingInstr(MI); 2474 MI.getOperand(1).setReg(Cast.getReg(0)); 2475 Observer.changedInstr(MI); 2476 return true; 2477 } 2478 2479 if (MI.getOpcode() != AMDGPU::G_LOAD) 2480 return false; 2481 2482 Register ValReg = MI.getOperand(0).getReg(); 2483 LLT ValTy = MRI.getType(ValReg); 2484 2485 MachineMemOperand *MMO = *MI.memoperands_begin(); 2486 const unsigned ValSize = ValTy.getSizeInBits(); 2487 const unsigned MemSize = 8 * MMO->getSize(); 2488 const Align MemAlign = MMO->getAlign(); 2489 const unsigned AlignInBits = 8 * MemAlign.value(); 2490 2491 // Widen non-power-of-2 loads to the alignment if needed 2492 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2493 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2494 2495 // This was already the correct extending load result type, so just adjust 2496 // the memory type. 2497 if (WideMemSize == ValSize) { 2498 MachineFunction &MF = B.getMF(); 2499 2500 MachineMemOperand *WideMMO = 2501 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2502 Observer.changingInstr(MI); 2503 MI.setMemRefs(MF, {WideMMO}); 2504 Observer.changedInstr(MI); 2505 return true; 2506 } 2507 2508 // Don't bother handling edge case that should probably never be produced. 2509 if (ValSize > WideMemSize) 2510 return false; 2511 2512 LLT WideTy = widenToNextPowerOf2(ValTy); 2513 2514 Register WideLoad; 2515 if (!WideTy.isVector()) { 2516 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2517 B.buildTrunc(ValReg, WideLoad).getReg(0); 2518 } else { 2519 // Extract the subvector. 2520 2521 if (isRegisterType(ValTy)) { 2522 // If this a case where G_EXTRACT is legal, use it. 2523 // (e.g. <3 x s32> -> <4 x s32>) 2524 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2525 B.buildExtract(ValReg, WideLoad, 0); 2526 } else { 2527 // For cases where the widened type isn't a nice register value, unmerge 2528 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2529 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2530 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2531 B.setInsertPt(B.getMBB(), MI.getIterator()); 2532 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2533 } 2534 } 2535 2536 MI.eraseFromParent(); 2537 return true; 2538 } 2539 2540 return false; 2541 } 2542 2543 bool AMDGPULegalizerInfo::legalizeFMad( 2544 MachineInstr &MI, MachineRegisterInfo &MRI, 2545 MachineIRBuilder &B) const { 2546 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2547 assert(Ty.isScalar()); 2548 2549 MachineFunction &MF = B.getMF(); 2550 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2551 2552 // TODO: Always legal with future ftz flag. 2553 // FIXME: Do we need just output? 2554 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2555 return true; 2556 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2557 return true; 2558 2559 MachineIRBuilder HelperBuilder(MI); 2560 GISelObserverWrapper DummyObserver; 2561 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2562 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2563 } 2564 2565 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2566 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2567 Register DstReg = MI.getOperand(0).getReg(); 2568 Register PtrReg = MI.getOperand(1).getReg(); 2569 Register CmpVal = MI.getOperand(2).getReg(); 2570 Register NewVal = MI.getOperand(3).getReg(); 2571 2572 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2573 "this should not have been custom lowered"); 2574 2575 LLT ValTy = MRI.getType(CmpVal); 2576 LLT VecTy = LLT::fixed_vector(2, ValTy); 2577 2578 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2579 2580 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2581 .addDef(DstReg) 2582 .addUse(PtrReg) 2583 .addUse(PackedVal) 2584 .setMemRefs(MI.memoperands()); 2585 2586 MI.eraseFromParent(); 2587 return true; 2588 } 2589 2590 bool AMDGPULegalizerInfo::legalizeFlog( 2591 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2592 Register Dst = MI.getOperand(0).getReg(); 2593 Register Src = MI.getOperand(1).getReg(); 2594 LLT Ty = B.getMRI()->getType(Dst); 2595 unsigned Flags = MI.getFlags(); 2596 2597 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2598 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2599 2600 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2601 MI.eraseFromParent(); 2602 return true; 2603 } 2604 2605 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2606 MachineIRBuilder &B) const { 2607 Register Dst = MI.getOperand(0).getReg(); 2608 Register Src = MI.getOperand(1).getReg(); 2609 unsigned Flags = MI.getFlags(); 2610 LLT Ty = B.getMRI()->getType(Dst); 2611 2612 auto K = B.buildFConstant(Ty, numbers::log2e); 2613 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2614 B.buildFExp2(Dst, Mul, Flags); 2615 MI.eraseFromParent(); 2616 return true; 2617 } 2618 2619 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2620 MachineIRBuilder &B) const { 2621 Register Dst = MI.getOperand(0).getReg(); 2622 Register Src0 = MI.getOperand(1).getReg(); 2623 Register Src1 = MI.getOperand(2).getReg(); 2624 unsigned Flags = MI.getFlags(); 2625 LLT Ty = B.getMRI()->getType(Dst); 2626 const LLT S16 = LLT::scalar(16); 2627 const LLT S32 = LLT::scalar(32); 2628 2629 if (Ty == S32) { 2630 auto Log = B.buildFLog2(S32, Src0, Flags); 2631 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2632 .addUse(Log.getReg(0)) 2633 .addUse(Src1) 2634 .setMIFlags(Flags); 2635 B.buildFExp2(Dst, Mul, Flags); 2636 } else if (Ty == S16) { 2637 // There's no f16 fmul_legacy, so we need to convert for it. 2638 auto Log = B.buildFLog2(S16, Src0, Flags); 2639 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2640 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2641 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2642 .addUse(Ext0.getReg(0)) 2643 .addUse(Ext1.getReg(0)) 2644 .setMIFlags(Flags); 2645 2646 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2647 } else 2648 return false; 2649 2650 MI.eraseFromParent(); 2651 return true; 2652 } 2653 2654 // Find a source register, ignoring any possible source modifiers. 2655 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2656 Register ModSrc = OrigSrc; 2657 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2658 ModSrc = SrcFNeg->getOperand(1).getReg(); 2659 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2660 ModSrc = SrcFAbs->getOperand(1).getReg(); 2661 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2662 ModSrc = SrcFAbs->getOperand(1).getReg(); 2663 return ModSrc; 2664 } 2665 2666 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2667 MachineRegisterInfo &MRI, 2668 MachineIRBuilder &B) const { 2669 2670 const LLT S1 = LLT::scalar(1); 2671 const LLT S64 = LLT::scalar(64); 2672 Register Dst = MI.getOperand(0).getReg(); 2673 Register OrigSrc = MI.getOperand(1).getReg(); 2674 unsigned Flags = MI.getFlags(); 2675 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2676 "this should not have been custom lowered"); 2677 2678 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2679 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2680 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2681 // V_FRACT bug is: 2682 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2683 // 2684 // Convert floor(x) to (x - fract(x)) 2685 2686 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2687 .addUse(OrigSrc) 2688 .setMIFlags(Flags); 2689 2690 // Give source modifier matching some assistance before obscuring a foldable 2691 // pattern. 2692 2693 // TODO: We can avoid the neg on the fract? The input sign to fract 2694 // shouldn't matter? 2695 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2696 2697 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2698 2699 Register Min = MRI.createGenericVirtualRegister(S64); 2700 2701 // We don't need to concern ourselves with the snan handling difference, so 2702 // use the one which will directly select. 
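  // Note: 0x3fefffffffffffff is the largest f64 strictly less than 1.0
  // (about 0.99999999999999989), so the min below keeps the fract result
  // under 1.0 even in the corner cases where the buggy V_FRACT would round
  // up to 1.0.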
2703 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2704 if (MFI->getMode().IEEE) 2705 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2706 else 2707 B.buildFMinNum(Min, Fract, Const, Flags); 2708 2709 Register CorrectedFract = Min; 2710 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2711 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2712 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2713 } 2714 2715 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2716 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2717 2718 MI.eraseFromParent(); 2719 return true; 2720 } 2721 2722 // Turn an illegal packed v2s16 build vector into bit operations. 2723 // TODO: This should probably be a bitcast action in LegalizerHelper. 2724 bool AMDGPULegalizerInfo::legalizeBuildVector( 2725 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2726 Register Dst = MI.getOperand(0).getReg(); 2727 const LLT S32 = LLT::scalar(32); 2728 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2729 2730 Register Src0 = MI.getOperand(1).getReg(); 2731 Register Src1 = MI.getOperand(2).getReg(); 2732 assert(MRI.getType(Src0) == LLT::scalar(16)); 2733 2734 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2735 B.buildBitcast(Dst, Merge); 2736 2737 MI.eraseFromParent(); 2738 return true; 2739 } 2740 2741 // Check that this is a G_XOR x, -1 2742 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2743 if (MI.getOpcode() != TargetOpcode::G_XOR) 2744 return false; 2745 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2746 return ConstVal && *ConstVal == -1; 2747 } 2748 2749 // Return the use branch instruction, otherwise null if the usage is invalid. 2750 static MachineInstr * 2751 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2752 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2753 Register CondDef = MI.getOperand(0).getReg(); 2754 if (!MRI.hasOneNonDBGUse(CondDef)) 2755 return nullptr; 2756 2757 MachineBasicBlock *Parent = MI.getParent(); 2758 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2759 2760 if (isNot(MRI, *UseMI)) { 2761 Register NegatedCond = UseMI->getOperand(0).getReg(); 2762 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2763 return nullptr; 2764 2765 // We're deleting the def of this value, so we need to remove it. 2766 UseMI->eraseFromParent(); 2767 2768 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2769 Negated = true; 2770 } 2771 2772 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2773 return nullptr; 2774 2775 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2776 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2777 if (Next == Parent->end()) { 2778 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2779 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2780 return nullptr; 2781 UncondBrTarget = &*NextMBB; 2782 } else { 2783 if (Next->getOpcode() != AMDGPU::G_BR) 2784 return nullptr; 2785 Br = &*Next; 2786 UncondBrTarget = Br->getOperand(0).getMBB(); 2787 } 2788 2789 return UseMI; 2790 } 2791 2792 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2793 const ArgDescriptor *Arg, 2794 const TargetRegisterClass *ArgRC, 2795 LLT ArgTy) const { 2796 MCRegister SrcReg = Arg->getRegister(); 2797 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2798 assert(DstReg.isVirtual() && "Virtual register expected"); 2799 2800 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2801 ArgTy); 2802 if (Arg->isMasked()) { 2803 // TODO: Should we try to emit this once in the entry block? 2804 const LLT S32 = LLT::scalar(32); 2805 const unsigned Mask = Arg->getMask(); 2806 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2807 2808 Register AndMaskSrc = LiveIn; 2809 2810 if (Shift != 0) { 2811 auto ShiftAmt = B.buildConstant(S32, Shift); 2812 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2813 } 2814 2815 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2816 } else { 2817 B.buildCopy(DstReg, LiveIn); 2818 } 2819 2820 return true; 2821 } 2822 2823 bool AMDGPULegalizerInfo::loadInputValue( 2824 Register DstReg, MachineIRBuilder &B, 2825 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2826 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2827 const ArgDescriptor *Arg; 2828 const TargetRegisterClass *ArgRC; 2829 LLT ArgTy; 2830 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2831 2832 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2833 return false; // TODO: Handle these 2834 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2835 } 2836 2837 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2838 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2839 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2840 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2841 return false; 2842 2843 MI.eraseFromParent(); 2844 return true; 2845 } 2846 2847 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2848 MachineRegisterInfo &MRI, 2849 MachineIRBuilder &B) const { 2850 Register Dst = MI.getOperand(0).getReg(); 2851 LLT DstTy = MRI.getType(Dst); 2852 LLT S16 = LLT::scalar(16); 2853 LLT S32 = LLT::scalar(32); 2854 LLT S64 = LLT::scalar(64); 2855 2856 if (DstTy == S16) 2857 return legalizeFDIV16(MI, MRI, B); 2858 if (DstTy == S32) 2859 return legalizeFDIV32(MI, MRI, B); 2860 if (DstTy == S64) 2861 return legalizeFDIV64(MI, MRI, B); 2862 2863 return false; 2864 } 2865 2866 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2867 Register DstDivReg, 2868 Register DstRemReg, 2869 Register X, 2870 Register Y) const { 2871 const LLT S1 = LLT::scalar(1); 2872 const LLT S32 = LLT::scalar(32); 2873 2874 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2875 // algorithm used here. 2876 2877 // Initial estimate of inv(y). 2878 auto FloatY = B.buildUITOFP(S32, Y); 2879 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2880 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2881 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2882 auto Z = B.buildFPTOUI(S32, ScaledY); 2883 2884 // One round of UNR. 
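  // UNR = unsigned Newton-Raphson. With z ~= 2^32 / y in 32-bit fixed point,
  // the error term is roughly e = (2^32 - y*z) mod 2^32 (computed below as
  // -y*z), and the refinement is z' = z + floor(z*e / 2^32), i.e.
  // z' = z + umulh(z, -y*z).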
2885 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2886 auto NegYZ = B.buildMul(S32, NegY, Z); 2887 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2888 2889 // Quotient/remainder estimate. 2890 auto Q = B.buildUMulH(S32, X, Z); 2891 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2892 2893 // First quotient/remainder refinement. 2894 auto One = B.buildConstant(S32, 1); 2895 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2896 if (DstDivReg) 2897 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2898 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2899 2900 // Second quotient/remainder refinement. 2901 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2902 if (DstDivReg) 2903 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2904 2905 if (DstRemReg) 2906 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2907 } 2908 2909 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2910 // 2911 // Return lo, hi of result 2912 // 2913 // %cvt.lo = G_UITOFP Val.lo 2914 // %cvt.hi = G_UITOFP Val.hi 2915 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2916 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2917 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2918 // %mul2 = G_FMUL %mul1, 2**(-32) 2919 // %trunc = G_INTRINSIC_TRUNC %mul2 2920 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2921 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2922 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2923 Register Val) { 2924 const LLT S32 = LLT::scalar(32); 2925 auto Unmerge = B.buildUnmerge(S32, Val); 2926 2927 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2928 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2929 2930 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2931 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2932 2933 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2934 auto Mul1 = 2935 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2936 2937 // 2**(-32) 2938 auto Mul2 = 2939 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2940 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2941 2942 // -(2**32) 2943 auto Mad2 = B.buildFMAD(S32, Trunc, 2944 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2945 2946 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2947 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2948 2949 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2950 } 2951 2952 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2953 Register DstDivReg, 2954 Register DstRemReg, 2955 Register Numer, 2956 Register Denom) const { 2957 const LLT S32 = LLT::scalar(32); 2958 const LLT S64 = LLT::scalar(64); 2959 const LLT S1 = LLT::scalar(1); 2960 Register RcpLo, RcpHi; 2961 2962 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2963 2964 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2965 2966 auto Zero64 = B.buildConstant(S64, 0); 2967 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2968 2969 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2970 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2971 2972 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2973 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2974 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2975 2976 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2977 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2978 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2979 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2980 2981 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2982 
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2983 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2984 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2985 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2986 2987 auto Zero32 = B.buildConstant(S32, 0); 2988 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2989 auto Add2_HiC = 2990 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2991 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2992 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2993 2994 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2995 Register NumerLo = UnmergeNumer.getReg(0); 2996 Register NumerHi = UnmergeNumer.getReg(1); 2997 2998 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2999 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3000 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3001 Register Mul3_Lo = UnmergeMul3.getReg(0); 3002 Register Mul3_Hi = UnmergeMul3.getReg(1); 3003 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3004 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3005 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3006 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3007 3008 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3009 Register DenomLo = UnmergeDenom.getReg(0); 3010 Register DenomHi = UnmergeDenom.getReg(1); 3011 3012 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3013 auto C1 = B.buildSExt(S32, CmpHi); 3014 3015 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3016 auto C2 = B.buildSExt(S32, CmpLo); 3017 3018 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3019 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3020 3021 // TODO: Here and below portions of the code can be enclosed into if/endif. 3022 // Currently control flow is unconditional and we have 4 selects after 3023 // potential endif to substitute PHIs. 3024 3025 // if C3 != 0 ... 
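  // Conceptually, the straight-line code below performs (with q = MulHi3 and
  // r = Sub1, and C3/C6 standing for the two conditions as 0/-1 masks):
  //   if (r >= denom) { q += 1; r -= denom;
  //     if (r >= denom) { q += 1; r -= denom; } }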
3026 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3027 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3028 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3029 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3030 3031 auto One64 = B.buildConstant(S64, 1); 3032 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3033 3034 auto C4 = 3035 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3036 auto C5 = 3037 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3038 auto C6 = B.buildSelect( 3039 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3040 3041 // if (C6 != 0) 3042 auto Add4 = B.buildAdd(S64, Add3, One64); 3043 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3044 3045 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3046 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3047 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3048 3049 // endif C6 3050 // endif C3 3051 3052 if (DstDivReg) { 3053 auto Sel1 = B.buildSelect( 3054 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3055 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3056 Sel1, MulHi3); 3057 } 3058 3059 if (DstRemReg) { 3060 auto Sel2 = B.buildSelect( 3061 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3062 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3063 Sel2, Sub1); 3064 } 3065 } 3066 3067 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3068 MachineRegisterInfo &MRI, 3069 MachineIRBuilder &B) const { 3070 Register DstDivReg, DstRemReg; 3071 switch (MI.getOpcode()) { 3072 default: 3073 llvm_unreachable("Unexpected opcode!"); 3074 case AMDGPU::G_UDIV: { 3075 DstDivReg = MI.getOperand(0).getReg(); 3076 break; 3077 } 3078 case AMDGPU::G_UREM: { 3079 DstRemReg = MI.getOperand(0).getReg(); 3080 break; 3081 } 3082 case AMDGPU::G_UDIVREM: { 3083 DstDivReg = MI.getOperand(0).getReg(); 3084 DstRemReg = MI.getOperand(1).getReg(); 3085 break; 3086 } 3087 } 3088 3089 const LLT S64 = LLT::scalar(64); 3090 const LLT S32 = LLT::scalar(32); 3091 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3092 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3093 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3094 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3095 3096 if (Ty == S32) 3097 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3098 else if (Ty == S64) 3099 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3100 else 3101 return false; 3102 3103 MI.eraseFromParent(); 3104 return true; 3105 } 3106 3107 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3108 MachineRegisterInfo &MRI, 3109 MachineIRBuilder &B) const { 3110 const LLT S64 = LLT::scalar(64); 3111 const LLT S32 = LLT::scalar(32); 3112 3113 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3114 if (Ty != S32 && Ty != S64) 3115 return false; 3116 3117 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3118 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3119 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3120 3121 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3122 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3123 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3124 3125 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3126 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3127 3128 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3129 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3130 3131 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3132 switch (MI.getOpcode()) { 3133 default: 3134 llvm_unreachable("Unexpected opcode!"); 3135 case AMDGPU::G_SDIV: { 3136 DstDivReg = MI.getOperand(0).getReg(); 3137 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3138 break; 3139 } 3140 case AMDGPU::G_SREM: { 3141 DstRemReg = MI.getOperand(0).getReg(); 3142 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3143 break; 3144 } 3145 case AMDGPU::G_SDIVREM: { 3146 DstDivReg = MI.getOperand(0).getReg(); 3147 DstRemReg = MI.getOperand(1).getReg(); 3148 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3149 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3150 break; 3151 } 3152 } 3153 3154 if (Ty == S32) 3155 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3156 else 3157 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3158 3159 if (DstDivReg) { 3160 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3161 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3162 B.buildSub(DstDivReg, SignXor, Sign); 3163 } 3164 3165 if (DstRemReg) { 3166 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3167 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3168 B.buildSub(DstRemReg, SignXor, Sign); 3169 } 3170 3171 MI.eraseFromParent(); 3172 return true; 3173 } 3174 3175 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3176 MachineRegisterInfo &MRI, 3177 MachineIRBuilder &B) const { 3178 Register Res = MI.getOperand(0).getReg(); 3179 Register LHS = MI.getOperand(1).getReg(); 3180 Register RHS = MI.getOperand(2).getReg(); 3181 uint16_t Flags = MI.getFlags(); 3182 LLT ResTy = MRI.getType(Res); 3183 3184 const MachineFunction &MF = B.getMF(); 3185 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3186 MI.getFlag(MachineInstr::FmAfn); 3187 3188 if (!AllowInaccurateRcp) 3189 return false; 3190 3191 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3192 // 1 / x -> RCP(x) 3193 if (CLHS->isExactlyValue(1.0)) { 3194 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3195 .addUse(RHS) 3196 .setMIFlags(Flags); 3197 3198 MI.eraseFromParent(); 3199 return true; 3200 } 3201 3202 // -1 / x -> RCP( FNEG(x) ) 3203 if (CLHS->isExactlyValue(-1.0)) { 3204 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3205 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3206 .addUse(FNeg.getReg(0)) 3207 .setMIFlags(Flags); 3208 3209 MI.eraseFromParent(); 3210 return true; 3211 } 3212 } 3213 3214 // x / y -> x * (1.0 / y) 3215 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3216 .addUse(RHS) 3217 .setMIFlags(Flags); 3218 B.buildFMul(Res, LHS, RCP, Flags); 3219 3220 MI.eraseFromParent(); 3221 return true; 3222 } 3223 3224 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3225 MachineRegisterInfo &MRI, 3226 MachineIRBuilder &B) const { 3227 Register Res = MI.getOperand(0).getReg(); 3228 Register X = MI.getOperand(1).getReg(); 3229 Register Y = MI.getOperand(2).getReg(); 3230 uint16_t Flags = MI.getFlags(); 3231 LLT ResTy = MRI.getType(Res); 3232 3233 const MachineFunction &MF = B.getMF(); 3234 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3235 MI.getFlag(MachineInstr::FmAfn); 3236 3237 if (!AllowInaccurateRcp) 3238 return false; 3239 3240 auto NegY = B.buildFNeg(ResTy, Y); 3241 auto One = B.buildFConstant(ResTy, 1.0); 3242 3243 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3244 .addUse(Y) 3245 .setMIFlags(Flags); 3246 3247 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3248 R = B.buildFMA(ResTy, Tmp0, R, R); 3249 3250 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3251 R = B.buildFMA(ResTy, Tmp1, R, R); 3252 3253 auto Ret = B.buildFMul(ResTy, X, R); 3254 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3255 3256 B.buildFMA(Res, Tmp2, R, Ret); 3257 MI.eraseFromParent(); 3258 return true; 3259 } 3260 3261 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3262 MachineRegisterInfo &MRI, 3263 MachineIRBuilder &B) const { 3264 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3265 return true; 3266 3267 Register Res = MI.getOperand(0).getReg(); 3268 Register LHS = MI.getOperand(1).getReg(); 3269 Register RHS = MI.getOperand(2).getReg(); 3270 3271 uint16_t Flags = MI.getFlags(); 3272 3273 LLT S16 = LLT::scalar(16); 3274 LLT S32 = LLT::scalar(32); 3275 3276 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3277 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3278 3279 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3280 .addUse(RHSExt.getReg(0)) 3281 .setMIFlags(Flags); 3282 3283 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3284 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3285 3286 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3287 .addUse(RDst.getReg(0)) 3288 .addUse(RHS) 3289 .addUse(LHS) 3290 .setMIFlags(Flags); 3291 3292 MI.eraseFromParent(); 3293 return true; 3294 } 3295 3296 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3297 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3298 static void toggleSPDenormMode(bool Enable, 3299 MachineIRBuilder &B, 3300 const GCNSubtarget &ST, 3301 AMDGPU::SIModeRegisterDefaults Mode) { 3302 // Set SP denorm mode to this value. 3303 unsigned SPDenormMode = 3304 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3305 3306 if (ST.hasDenormModeInst()) { 3307 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3308 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3309 3310 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3311 B.buildInstr(AMDGPU::S_DENORM_MODE) 3312 .addImm(NewDenormModeValue); 3313 3314 } else { 3315 // Select FP32 bit field in mode register. 
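    // The field written below corresponds to hwreg(HW_REG_MODE, 4, 2): the
    // FP32 denorm controls live in MODE bits [5:4], so the field offset is 4
    // and the field width is 2 (encoded as WIDTH_M1 = 1).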
3316 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3317 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3318 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3319 3320 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3321 .addImm(SPDenormMode) 3322 .addImm(SPDenormModeBitField); 3323 } 3324 } 3325 3326 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3327 MachineRegisterInfo &MRI, 3328 MachineIRBuilder &B) const { 3329 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3330 return true; 3331 3332 Register Res = MI.getOperand(0).getReg(); 3333 Register LHS = MI.getOperand(1).getReg(); 3334 Register RHS = MI.getOperand(2).getReg(); 3335 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3336 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3337 3338 uint16_t Flags = MI.getFlags(); 3339 3340 LLT S32 = LLT::scalar(32); 3341 LLT S1 = LLT::scalar(1); 3342 3343 auto One = B.buildFConstant(S32, 1.0f); 3344 3345 auto DenominatorScaled = 3346 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3347 .addUse(LHS) 3348 .addUse(RHS) 3349 .addImm(0) 3350 .setMIFlags(Flags); 3351 auto NumeratorScaled = 3352 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3353 .addUse(LHS) 3354 .addUse(RHS) 3355 .addImm(1) 3356 .setMIFlags(Flags); 3357 3358 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3359 .addUse(DenominatorScaled.getReg(0)) 3360 .setMIFlags(Flags); 3361 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3362 3363 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3364 // aren't modeled as reading it. 3365 if (!Mode.allFP32Denormals()) 3366 toggleSPDenormMode(true, B, ST, Mode); 3367 3368 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3369 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3370 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3371 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3372 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3373 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3374 3375 if (!Mode.allFP32Denormals()) 3376 toggleSPDenormMode(false, B, ST, Mode); 3377 3378 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3379 .addUse(Fma4.getReg(0)) 3380 .addUse(Fma1.getReg(0)) 3381 .addUse(Fma3.getReg(0)) 3382 .addUse(NumeratorScaled.getReg(1)) 3383 .setMIFlags(Flags); 3384 3385 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3386 .addUse(Fmas.getReg(0)) 3387 .addUse(RHS) 3388 .addUse(LHS) 3389 .setMIFlags(Flags); 3390 3391 MI.eraseFromParent(); 3392 return true; 3393 } 3394 3395 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3396 MachineRegisterInfo &MRI, 3397 MachineIRBuilder &B) const { 3398 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3399 return true; 3400 3401 Register Res = MI.getOperand(0).getReg(); 3402 Register LHS = MI.getOperand(1).getReg(); 3403 Register RHS = MI.getOperand(2).getReg(); 3404 3405 uint16_t Flags = MI.getFlags(); 3406 3407 LLT S64 = LLT::scalar(64); 3408 LLT S1 = LLT::scalar(1); 3409 3410 auto One = B.buildFConstant(S64, 1.0); 3411 3412 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3413 .addUse(LHS) 3414 .addUse(RHS) 3415 .addImm(0) 3416 .setMIFlags(Flags); 3417 3418 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3419 3420 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3421 .addUse(DivScale0.getReg(0)) 3422 .setMIFlags(Flags); 3423 3424 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3425 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3426 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3427 3428 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3429 .addUse(LHS) 3430 .addUse(RHS) 3431 .addImm(1) 3432 .setMIFlags(Flags); 3433 3434 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3435 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3436 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3437 3438 Register Scale; 3439 if (!ST.hasUsableDivScaleConditionOutput()) { 3440 // Workaround a hardware bug on SI where the condition output from div_scale 3441 // is not usable. 3442 3443 LLT S32 = LLT::scalar(32); 3444 3445 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3446 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3447 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3448 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3449 3450 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3451 Scale1Unmerge.getReg(1)); 3452 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3453 Scale0Unmerge.getReg(1)); 3454 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3455 } else { 3456 Scale = DivScale1.getReg(1); 3457 } 3458 3459 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3460 .addUse(Fma4.getReg(0)) 3461 .addUse(Fma3.getReg(0)) 3462 .addUse(Mul.getReg(0)) 3463 .addUse(Scale) 3464 .setMIFlags(Flags); 3465 3466 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3467 .addUse(Fmas.getReg(0)) 3468 .addUse(RHS) 3469 .addUse(LHS) 3470 .setMIFlags(Flags); 3471 3472 MI.eraseFromParent(); 3473 return true; 3474 } 3475 3476 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3477 MachineRegisterInfo &MRI, 3478 MachineIRBuilder &B) const { 3479 Register Res = MI.getOperand(0).getReg(); 3480 Register LHS = MI.getOperand(2).getReg(); 3481 Register RHS = MI.getOperand(3).getReg(); 3482 uint16_t Flags = MI.getFlags(); 3483 3484 LLT S32 = LLT::scalar(32); 3485 LLT S1 = LLT::scalar(1); 3486 3487 auto Abs = B.buildFAbs(S32, RHS, Flags); 3488 const APFloat C0Val(1.0f); 3489 3490 auto C0 = B.buildConstant(S32, 0x6f800000); 3491 auto C1 = B.buildConstant(S32, 0x2f800000); 3492 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3493 3494 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3495 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3496 3497 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3498 3499 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3500 .addUse(Mul0.getReg(0)) 3501 .setMIFlags(Flags); 3502 3503 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3504 3505 B.buildFMul(Res, Sel, Mul1, Flags); 3506 3507 MI.eraseFromParent(); 3508 return true; 3509 } 3510 3511 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3512 // FIXME: Why do we handle this one but not other removed instructions? 3513 // 3514 // Reciprocal square root. The clamp prevents infinite results, clamping 3515 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3516 // +-max_float. 
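//
// Sketch of the expansion emitted below: the amdgcn.rsq result is clamped into
// [-max_float, +max_float] with fminnum/fmaxnum, using the IEEE variants when
// the function's IEEE mode bit is set.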
3517 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3518 MachineRegisterInfo &MRI, 3519 MachineIRBuilder &B) const { 3520 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3521 return true; 3522 3523 Register Dst = MI.getOperand(0).getReg(); 3524 Register Src = MI.getOperand(2).getReg(); 3525 auto Flags = MI.getFlags(); 3526 3527 LLT Ty = MRI.getType(Dst); 3528 3529 const fltSemantics *FltSemantics; 3530 if (Ty == LLT::scalar(32)) 3531 FltSemantics = &APFloat::IEEEsingle(); 3532 else if (Ty == LLT::scalar(64)) 3533 FltSemantics = &APFloat::IEEEdouble(); 3534 else 3535 return false; 3536 3537 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3538 .addUse(Src) 3539 .setMIFlags(Flags); 3540 3541 // We don't need to concern ourselves with the snan handling difference, since 3542 // the rsq quieted (or not) so use the one which will directly select. 3543 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3544 const bool UseIEEE = MFI->getMode().IEEE; 3545 3546 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3547 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3548 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3549 3550 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3551 3552 if (UseIEEE) 3553 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3554 else 3555 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3556 MI.eraseFromParent(); 3557 return true; 3558 } 3559 3560 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3561 switch (IID) { 3562 case Intrinsic::amdgcn_ds_fadd: 3563 return AMDGPU::G_ATOMICRMW_FADD; 3564 case Intrinsic::amdgcn_ds_fmin: 3565 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3566 case Intrinsic::amdgcn_ds_fmax: 3567 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3568 default: 3569 llvm_unreachable("not a DS FP intrinsic"); 3570 } 3571 } 3572 3573 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3574 MachineInstr &MI, 3575 Intrinsic::ID IID) const { 3576 GISelChangeObserver &Observer = Helper.Observer; 3577 Observer.changingInstr(MI); 3578 3579 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3580 3581 // The remaining operands were used to set fields in the MemOperand on 3582 // construction. 3583 for (int I = 6; I > 3; --I) 3584 MI.RemoveOperand(I); 3585 3586 MI.RemoveOperand(1); // Remove the intrinsic ID. 
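  // What remains (dst, pointer, value) matches the operand layout of the
  // target atomic opcodes; the ordering/volatile details live in the
  // MemOperand that was attached when the intrinsic was built.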
3587 Observer.changedInstr(MI); 3588 return true; 3589 } 3590 3591 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3592 MachineRegisterInfo &MRI, 3593 MachineIRBuilder &B) const { 3594 uint64_t Offset = 3595 ST.getTargetLowering()->getImplicitParameterOffset( 3596 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3597 LLT DstTy = MRI.getType(DstReg); 3598 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3599 3600 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3601 if (!loadInputValue(KernargPtrReg, B, 3602 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3603 return false; 3604 3605 // FIXME: This should be nuw 3606 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3607 return true; 3608 } 3609 3610 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3611 MachineRegisterInfo &MRI, 3612 MachineIRBuilder &B) const { 3613 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3614 if (!MFI->isEntryFunction()) { 3615 return legalizePreloadedArgIntrin(MI, MRI, B, 3616 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3617 } 3618 3619 Register DstReg = MI.getOperand(0).getReg(); 3620 if (!getImplicitArgPtr(DstReg, MRI, B)) 3621 return false; 3622 3623 MI.eraseFromParent(); 3624 return true; 3625 } 3626 3627 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3628 MachineRegisterInfo &MRI, 3629 MachineIRBuilder &B, 3630 unsigned AddrSpace) const { 3631 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3632 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3633 Register Hi32 = Unmerge.getReg(1); 3634 3635 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3636 MI.eraseFromParent(); 3637 return true; 3638 } 3639 3640 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3641 // offset (the offset that is included in bounds checking and swizzling, to be 3642 // split between the instruction's voffset and immoffset fields) and soffset 3643 // (the offset that is excluded from bounds checking and swizzling, to go in 3644 // the instruction's soffset field). This function takes the first kind of 3645 // offset and figures out how to split it between voffset and immoffset. 3646 std::tuple<Register, unsigned, unsigned> 3647 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3648 Register OrigOffset) const { 3649 const unsigned MaxImm = 4095; 3650 Register BaseReg; 3651 unsigned TotalConstOffset; 3652 const LLT S32 = LLT::scalar(32); 3653 MachineRegisterInfo &MRI = *B.getMRI(); 3654 3655 std::tie(BaseReg, TotalConstOffset) = 3656 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3657 3658 unsigned ImmOffset = TotalConstOffset; 3659 3660 // If BaseReg is a pointer, convert it to int. 3661 if (MRI.getType(BaseReg).isPointer()) 3662 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3663 3664 // If the immediate value is too big for the immoffset field, put the value 3665 // and -4096 into the immoffset field so that the value that is copied/added 3666 // for the voffset field is a multiple of 4096, and it stands more chance 3667 // of being CSEd with the copy/add for another similar load/store. 3668 // However, do not do that rounding down to a multiple of 4096 if that is a 3669 // negative number, as it appears to be illegal to have a negative offset 3670 // in the vgpr, even if adding the immediate offset makes it positive. 
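  // Worked example: an incoming constant offset of 5000 is split into
  // ImmOffset = 904 and an Overflow of 4096 (5000 & ~4095) that is added to
  // the voffset below, keeping the voffset contribution 4096-aligned.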
3671 unsigned Overflow = ImmOffset & ~MaxImm; 3672 ImmOffset -= Overflow; 3673 if ((int32_t)Overflow < 0) { 3674 Overflow += ImmOffset; 3675 ImmOffset = 0; 3676 } 3677 3678 if (Overflow != 0) { 3679 if (!BaseReg) { 3680 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3681 } else { 3682 auto OverflowVal = B.buildConstant(S32, Overflow); 3683 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3684 } 3685 } 3686 3687 if (!BaseReg) 3688 BaseReg = B.buildConstant(S32, 0).getReg(0); 3689 3690 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3691 } 3692 3693 /// Handle register layout difference for f16 images for some subtargets. 3694 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3695 MachineRegisterInfo &MRI, 3696 Register Reg, 3697 bool ImageStore) const { 3698 const LLT S16 = LLT::scalar(16); 3699 const LLT S32 = LLT::scalar(32); 3700 LLT StoreVT = MRI.getType(Reg); 3701 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3702 3703 if (ST.hasUnpackedD16VMem()) { 3704 auto Unmerge = B.buildUnmerge(S16, Reg); 3705 3706 SmallVector<Register, 4> WideRegs; 3707 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3708 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3709 3710 int NumElts = StoreVT.getNumElements(); 3711 3712 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3713 .getReg(0); 3714 } 3715 3716 if (ImageStore && ST.hasImageStoreD16Bug()) { 3717 if (StoreVT.getNumElements() == 2) { 3718 SmallVector<Register, 4> PackedRegs; 3719 Reg = B.buildBitcast(S32, Reg).getReg(0); 3720 PackedRegs.push_back(Reg); 3721 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3722 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3723 .getReg(0); 3724 } 3725 3726 if (StoreVT.getNumElements() == 3) { 3727 SmallVector<Register, 4> PackedRegs; 3728 auto Unmerge = B.buildUnmerge(S16, Reg); 3729 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3730 PackedRegs.push_back(Unmerge.getReg(I)); 3731 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3732 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3733 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3734 } 3735 3736 if (StoreVT.getNumElements() == 4) { 3737 SmallVector<Register, 4> PackedRegs; 3738 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3739 auto Unmerge = B.buildUnmerge(S32, Reg); 3740 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3741 PackedRegs.push_back(Unmerge.getReg(I)); 3742 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3743 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3744 .getReg(0); 3745 } 3746 3747 llvm_unreachable("invalid data type"); 3748 } 3749 3750 return Reg; 3751 } 3752 3753 Register AMDGPULegalizerInfo::fixStoreSourceType( 3754 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3755 MachineRegisterInfo *MRI = B.getMRI(); 3756 LLT Ty = MRI->getType(VData); 3757 3758 const LLT S16 = LLT::scalar(16); 3759 3760 // Fixup illegal register types for i8 stores. 
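  // The value is widened here; the store opcode selected later from the memory
  // size still writes only the original 8 or 16 bits.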
3761 if (Ty == LLT::scalar(8) || Ty == S16) { 3762 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3763 return AnyExt; 3764 } 3765 3766 if (Ty.isVector()) { 3767 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3768 if (IsFormat) 3769 return handleD16VData(B, *MRI, VData); 3770 } 3771 } 3772 3773 return VData; 3774 } 3775 3776 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3777 MachineRegisterInfo &MRI, 3778 MachineIRBuilder &B, 3779 bool IsTyped, 3780 bool IsFormat) const { 3781 Register VData = MI.getOperand(1).getReg(); 3782 LLT Ty = MRI.getType(VData); 3783 LLT EltTy = Ty.getScalarType(); 3784 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3785 const LLT S32 = LLT::scalar(32); 3786 3787 VData = fixStoreSourceType(B, VData, IsFormat); 3788 Register RSrc = MI.getOperand(2).getReg(); 3789 3790 MachineMemOperand *MMO = *MI.memoperands_begin(); 3791 const int MemSize = MMO->getSize(); 3792 3793 unsigned ImmOffset; 3794 unsigned TotalOffset; 3795 3796 // The typed intrinsics add an immediate after the registers. 3797 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3798 3799 // The struct intrinsic variants add one additional operand over raw. 3800 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3801 Register VIndex; 3802 int OpOffset = 0; 3803 if (HasVIndex) { 3804 VIndex = MI.getOperand(3).getReg(); 3805 OpOffset = 1; 3806 } 3807 3808 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3809 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3810 3811 unsigned Format = 0; 3812 if (IsTyped) { 3813 Format = MI.getOperand(5 + OpOffset).getImm(); 3814 ++OpOffset; 3815 } 3816 3817 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3818 3819 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3820 if (TotalOffset != 0) 3821 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3822 3823 unsigned Opc; 3824 if (IsTyped) { 3825 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3826 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3827 } else if (IsFormat) { 3828 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3829 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3830 } else { 3831 switch (MemSize) { 3832 case 1: 3833 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3834 break; 3835 case 2: 3836 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3837 break; 3838 default: 3839 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3840 break; 3841 } 3842 } 3843 3844 if (!VIndex) 3845 VIndex = B.buildConstant(S32, 0).getReg(0); 3846 3847 auto MIB = B.buildInstr(Opc) 3848 .addUse(VData) // vdata 3849 .addUse(RSrc) // rsrc 3850 .addUse(VIndex) // vindex 3851 .addUse(VOffset) // voffset 3852 .addUse(SOffset) // soffset 3853 .addImm(ImmOffset); // offset(imm) 3854 3855 if (IsTyped) 3856 MIB.addImm(Format); 3857 3858 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3859 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3860 .addMemOperand(MMO); 3861 3862 MI.eraseFromParent(); 3863 return true; 3864 } 3865 3866 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3867 MachineRegisterInfo &MRI, 3868 MachineIRBuilder &B, 3869 bool IsFormat, 3870 bool IsTyped) const { 3871 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
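  // Operand layout handled below: dst, intrinsic id, rsrc, [vindex for the
  // struct variants], voffset, soffset, [format for the typed variants],
  // aux (cachepolicy/swizzle).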
3872 MachineMemOperand *MMO = *MI.memoperands_begin(); 3873 const int MemSize = MMO->getSize(); 3874 const LLT S32 = LLT::scalar(32); 3875 3876 Register Dst = MI.getOperand(0).getReg(); 3877 Register RSrc = MI.getOperand(2).getReg(); 3878 3879 // The typed intrinsics add an immediate after the registers. 3880 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3881 3882 // The struct intrinsic variants add one additional operand over raw. 3883 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3884 Register VIndex; 3885 int OpOffset = 0; 3886 if (HasVIndex) { 3887 VIndex = MI.getOperand(3).getReg(); 3888 OpOffset = 1; 3889 } 3890 3891 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3892 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3893 3894 unsigned Format = 0; 3895 if (IsTyped) { 3896 Format = MI.getOperand(5 + OpOffset).getImm(); 3897 ++OpOffset; 3898 } 3899 3900 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3901 unsigned ImmOffset; 3902 unsigned TotalOffset; 3903 3904 LLT Ty = MRI.getType(Dst); 3905 LLT EltTy = Ty.getScalarType(); 3906 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3907 const bool Unpacked = ST.hasUnpackedD16VMem(); 3908 3909 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3910 if (TotalOffset != 0) 3911 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3912 3913 unsigned Opc; 3914 3915 if (IsTyped) { 3916 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3917 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3918 } else if (IsFormat) { 3919 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3920 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3921 } else { 3922 switch (MemSize) { 3923 case 1: 3924 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3925 break; 3926 case 2: 3927 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3928 break; 3929 default: 3930 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3931 break; 3932 } 3933 } 3934 3935 Register LoadDstReg; 3936 3937 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3938 LLT UnpackedTy = Ty.changeElementSize(32); 3939 3940 if (IsExtLoad) 3941 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3942 else if (Unpacked && IsD16 && Ty.isVector()) 3943 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3944 else 3945 LoadDstReg = Dst; 3946 3947 if (!VIndex) 3948 VIndex = B.buildConstant(S32, 0).getReg(0); 3949 3950 auto MIB = B.buildInstr(Opc) 3951 .addDef(LoadDstReg) // vdata 3952 .addUse(RSrc) // rsrc 3953 .addUse(VIndex) // vindex 3954 .addUse(VOffset) // voffset 3955 .addUse(SOffset) // soffset 3956 .addImm(ImmOffset); // offset(imm) 3957 3958 if (IsTyped) 3959 MIB.addImm(Format); 3960 3961 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3962 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3963 .addMemOperand(MMO); 3964 3965 if (LoadDstReg != Dst) { 3966 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3967 3968 // Widen result for extending loads was widened. 
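    // i.e. extending loads truncate the widened 32-bit result back to the
    // original type, and unpacked d16 vectors are truncated element by element
    // to s16 and re-merged.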
3969 if (IsExtLoad) 3970 B.buildTrunc(Dst, LoadDstReg); 3971 else { 3972 // Repack to original 16-bit vector result 3973 // FIXME: G_TRUNC should work, but legalization currently fails 3974 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3975 SmallVector<Register, 4> Repack; 3976 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3977 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3978 B.buildMerge(Dst, Repack); 3979 } 3980 } 3981 3982 MI.eraseFromParent(); 3983 return true; 3984 } 3985 3986 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3987 MachineIRBuilder &B, 3988 bool IsInc) const { 3989 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3990 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3991 B.buildInstr(Opc) 3992 .addDef(MI.getOperand(0).getReg()) 3993 .addUse(MI.getOperand(2).getReg()) 3994 .addUse(MI.getOperand(3).getReg()) 3995 .cloneMemRefs(MI); 3996 MI.eraseFromParent(); 3997 return true; 3998 } 3999 4000 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4001 switch (IntrID) { 4002 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4003 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4004 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4005 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4006 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4007 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4008 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4009 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4010 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4011 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4012 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4013 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4014 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4015 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4016 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4017 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4018 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4019 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4020 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4021 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4022 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4023 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4024 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4025 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4026 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4027 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4028 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4029 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4030 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4031 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4032 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4033 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4034 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4035 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4036 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4037 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4038 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4039 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4040 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4041 case Intrinsic::amdgcn_buffer_atomic_fadd: 4042 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4043 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4044 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4045 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4046 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4047 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4048 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4049 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4050 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4051 default: 4052 llvm_unreachable("unhandled atomic opcode"); 4053 } 4054 } 4055 4056 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4057 MachineIRBuilder &B, 4058 Intrinsic::ID IID) const { 4059 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4060 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4061 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4062 4063 Register Dst; 4064 4065 int OpOffset = 0; 4066 if (HasReturn) { 4067 // A few FP atomics do not support return values. 4068 Dst = MI.getOperand(0).getReg(); 4069 } else { 4070 OpOffset = -1; 4071 } 4072 4073 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4074 Register CmpVal; 4075 4076 if (IsCmpSwap) { 4077 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4078 ++OpOffset; 4079 } 4080 4081 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4082 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4083 4084 // The struct intrinsic variants add one additional operand over raw. 4085 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4086 Register VIndex; 4087 if (HasVIndex) { 4088 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4089 ++OpOffset; 4090 } 4091 4092 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4093 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4094 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4095 4096 MachineMemOperand *MMO = *MI.memoperands_begin(); 4097 4098 unsigned ImmOffset; 4099 unsigned TotalOffset; 4100 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4101 if (TotalOffset != 0) 4102 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4103 4104 if (!VIndex) 4105 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4106 4107 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4108 4109 if (HasReturn) 4110 MIB.addDef(Dst); 4111 4112 MIB.addUse(VData); // vdata 4113 4114 if (IsCmpSwap) 4115 MIB.addReg(CmpVal); 4116 4117 MIB.addUse(RSrc) // rsrc 4118 .addUse(VIndex) // vindex 4119 .addUse(VOffset) // voffset 4120 .addUse(SOffset) // soffset 4121 .addImm(ImmOffset) // offset(imm) 4122 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4123 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4124 .addMemOperand(MMO); 4125 4126 MI.eraseFromParent(); 4127 return true; 4128 } 4129 4130 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4131 /// vector with s16 typed elements. 4132 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4133 SmallVectorImpl<Register> &PackedAddrs, 4134 unsigned ArgOffset, 4135 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4136 bool IsA16, bool IsG16) { 4137 const LLT S16 = LLT::scalar(16); 4138 const LLT V2S16 = LLT::fixed_vector(2, 16); 4139 auto EndIdx = Intr->VAddrEnd; 4140 4141 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4142 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4143 if (!SrcOp.isReg()) 4144 continue; // _L to _LZ may have eliminated this. 
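    // Pre-gradient operands (32-bit values such as offset/bias/compare) are
    // simply reinterpreted as v2s16; 16-bit gradients and coordinates are
    // packed in pairs below, with an odd leftover element paired with undef.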
4145
4146 Register AddrReg = SrcOp.getReg();
4147
4148 if (I < Intr->GradientStart) {
4149 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4150 PackedAddrs.push_back(AddrReg);
4151 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4152 (I >= Intr->CoordStart && !IsA16)) {
4153 // Handle any gradient or coordinate operands that should not be packed
4154 PackedAddrs.push_back(AddrReg);
4155 } else {
4156 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4157 // derivatives dx/dh and dx/dv are packed with undef.
4158 if (((I + 1) >= EndIdx) ||
4159 ((Intr->NumGradients / 2) % 2 == 1 &&
4160 (I == static_cast<unsigned>(Intr->GradientStart +
4161 (Intr->NumGradients / 2) - 1) ||
4162 I == static_cast<unsigned>(Intr->GradientStart +
4163 Intr->NumGradients - 1))) ||
4164 // Check for _L to _LZ optimization
4165 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4166 PackedAddrs.push_back(
4167 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4168 .getReg(0));
4169 } else {
4170 PackedAddrs.push_back(
4171 B.buildBuildVector(
4172 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4173 .getReg(0));
4174 ++I;
4175 }
4176 }
4177 }
4178 }
4179
4180 /// Convert from separate vaddr components to a single vector address register,
4181 /// and replace the remaining operands with $noreg.
4182 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4183 int DimIdx, int NumVAddrs) {
4184 const LLT S32 = LLT::scalar(32);
4185
4186 SmallVector<Register, 8> AddrRegs;
4187 for (int I = 0; I != NumVAddrs; ++I) {
4188 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4189 if (SrcOp.isReg()) {
4190 AddrRegs.push_back(SrcOp.getReg());
4191 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4192 }
4193 }
4194
4195 int NumAddrRegs = AddrRegs.size();
4196 if (NumAddrRegs != 1) {
4197 // Round up to 8 elements for v5-v7
4198 // FIXME: Missing intermediate sized register classes and instructions.
4199 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4200 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4201 auto Undef = B.buildUndef(S32);
4202 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4203 NumAddrRegs = RoundedNumRegs;
4204 }
4205
4206 auto VAddr =
4207 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4208 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4209 }
4210
4211 for (int I = 1; I != NumVAddrs; ++I) {
4212 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4213 if (SrcOp.isReg())
4214 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4215 }
4216 }
4217
4218 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4219 ///
4220 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4221 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4222 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4223 /// registers.
4224 ///
4225 /// We don't want to directly select image instructions just yet, but also want
4226 /// to expose all register repacking to the legalizer/combiners. We also don't
4227 /// want a selected instruction entering RegBankSelect. In order to avoid
4228 /// defining a multitude of intermediate image instructions, directly hack on
4229 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4230 /// now unnecessary arguments with $noreg.
4231 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4232 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4233 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4234 4235 const unsigned NumDefs = MI.getNumExplicitDefs(); 4236 const unsigned ArgOffset = NumDefs + 1; 4237 bool IsTFE = NumDefs == 2; 4238 // We are only processing the operands of d16 image operations on subtargets 4239 // that use the unpacked register layout, or need to repack the TFE result. 4240 4241 // TODO: Do we need to guard against already legalized intrinsics? 4242 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4243 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4244 4245 MachineRegisterInfo *MRI = B.getMRI(); 4246 const LLT S32 = LLT::scalar(32); 4247 const LLT S16 = LLT::scalar(16); 4248 const LLT V2S16 = LLT::fixed_vector(2, 16); 4249 4250 unsigned DMask = 0; 4251 4252 // Check for 16 bit addresses and pack if true. 4253 LLT GradTy = 4254 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4255 LLT AddrTy = 4256 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4257 const bool IsG16 = GradTy == S16; 4258 const bool IsA16 = AddrTy == S16; 4259 4260 int DMaskLanes = 0; 4261 if (!BaseOpcode->Atomic) { 4262 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4263 if (BaseOpcode->Gather4) { 4264 DMaskLanes = 4; 4265 } else if (DMask != 0) { 4266 DMaskLanes = countPopulation(DMask); 4267 } else if (!IsTFE && !BaseOpcode->Store) { 4268 // If dmask is 0, this is a no-op load. This can be eliminated. 4269 B.buildUndef(MI.getOperand(0)); 4270 MI.eraseFromParent(); 4271 return true; 4272 } 4273 } 4274 4275 Observer.changingInstr(MI); 4276 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4277 4278 unsigned NewOpcode = NumDefs == 0 ? 4279 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4280 4281 // Track that we legalized this 4282 MI.setDesc(B.getTII().get(NewOpcode)); 4283 4284 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4285 // dmask to be at least 1 otherwise the instruction will fail 4286 if (IsTFE && DMask == 0) { 4287 DMask = 0x1; 4288 DMaskLanes = 1; 4289 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4290 } 4291 4292 if (BaseOpcode->Atomic) { 4293 Register VData0 = MI.getOperand(2).getReg(); 4294 LLT Ty = MRI->getType(VData0); 4295 4296 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4297 if (Ty.isVector()) 4298 return false; 4299 4300 if (BaseOpcode->AtomicX2) { 4301 Register VData1 = MI.getOperand(3).getReg(); 4302 // The two values are packed in one register. 4303 LLT PackedTy = LLT::fixed_vector(2, Ty); 4304 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4305 MI.getOperand(2).setReg(Concat.getReg(0)); 4306 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4307 } 4308 } 4309 4310 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4311 4312 // Optimize _L to _LZ when _L is zero 4313 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4314 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4315 const ConstantFP *ConstantLod; 4316 4317 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4318 m_GFCst(ConstantLod))) { 4319 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4320 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
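        // With a known zero LOD (a negative LOD is treated the same here) the
        // _l form behaves like _lz, so the LOD operand can be dropped.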
4321 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4322 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4323 Intr->Dim); 4324 4325 // The starting indexes should remain in the same place. 4326 --CorrectedNumVAddrs; 4327 4328 MI.getOperand(MI.getNumExplicitDefs()) 4329 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4330 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4331 Intr = NewImageDimIntr; 4332 } 4333 } 4334 } 4335 4336 // Optimize _mip away, when 'lod' is zero 4337 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4338 int64_t ConstantLod; 4339 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4340 m_ICst(ConstantLod))) { 4341 if (ConstantLod == 0) { 4342 // TODO: Change intrinsic opcode and remove operand instead or replacing 4343 // it with 0, as the _L to _LZ handling is done above. 4344 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4345 --CorrectedNumVAddrs; 4346 } 4347 } 4348 } 4349 4350 // Rewrite the addressing register layout before doing anything else. 4351 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4352 // 16 bit gradients are supported, but are tied to the A16 control 4353 // so both gradients and addresses must be 16 bit 4354 return false; 4355 } 4356 4357 if (IsA16 && !ST.hasA16()) { 4358 // A16 not supported 4359 return false; 4360 } 4361 4362 if (IsA16 || IsG16) { 4363 if (Intr->NumVAddrs > 1) { 4364 SmallVector<Register, 4> PackedRegs; 4365 4366 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4367 IsG16); 4368 4369 // See also below in the non-a16 branch 4370 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4371 4372 if (!UseNSA && PackedRegs.size() > 1) { 4373 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4374 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4375 PackedRegs[0] = Concat.getReg(0); 4376 PackedRegs.resize(1); 4377 } 4378 4379 const unsigned NumPacked = PackedRegs.size(); 4380 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4381 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4382 if (!SrcOp.isReg()) { 4383 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4384 continue; 4385 } 4386 4387 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4388 4389 if (I - Intr->VAddrStart < NumPacked) 4390 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4391 else 4392 SrcOp.setReg(AMDGPU::NoRegister); 4393 } 4394 } 4395 } else { 4396 // If the register allocator cannot place the address registers contiguously 4397 // without introducing moves, then using the non-sequential address encoding 4398 // is always preferable, since it saves VALU instructions and is usually a 4399 // wash in terms of code size or even better. 4400 // 4401 // However, we currently have no way of hinting to the register allocator 4402 // that MIMG addresses should be placed contiguously when it is possible to 4403 // do so, so force non-NSA for the common 2-address case as a heuristic. 4404 // 4405 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4406 // allocation when possible. 
4407 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4408
4409 if (!UseNSA && Intr->NumVAddrs > 1)
4410 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4411 Intr->NumVAddrs);
4412 }
4413
4414 int Flags = 0;
4415 if (IsA16)
4416 Flags |= 1;
4417 if (IsG16)
4418 Flags |= 2;
4419 MI.addOperand(MachineOperand::CreateImm(Flags));
4420
4421 if (BaseOpcode->Store) { // No TFE for stores?
4422 // TODO: Handle dmask trim
4423 Register VData = MI.getOperand(1).getReg();
4424 LLT Ty = MRI->getType(VData);
4425 if (!Ty.isVector() || Ty.getElementType() != S16)
4426 return true;
4427
4428 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4429 if (RepackedReg != VData) {
4430 MI.getOperand(1).setReg(RepackedReg);
4431 }
4432
4433 return true;
4434 }
4435
4436 Register DstReg = MI.getOperand(0).getReg();
4437 LLT Ty = MRI->getType(DstReg);
4438 const LLT EltTy = Ty.getScalarType();
4439 const bool IsD16 = Ty.getScalarType() == S16;
4440 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4441
4442 // Confirm that the return type is large enough for the dmask specified
4443 if (NumElts < DMaskLanes)
4444 return false;
4445
4446 if (NumElts > 4 || DMaskLanes > 4)
4447 return false;
4448
4449 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4450 const LLT AdjustedTy =
4451 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4452
4453 // The raw dword aligned data component of the load. The only legal cases
4454 // where this matters should be when using the packed D16 format, for
4455 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4456 LLT RoundedTy;
4457
4458 // S32 vector to cover all data, plus TFE result element.
4459 LLT TFETy;
4460
4461 // Register type to use for each loaded component. Will be S32 or V2S16.
4462 LLT RegTy;
4463
4464 if (IsD16 && ST.hasUnpackedD16VMem()) {
4465 RoundedTy =
4466 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4467 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4468 RegTy = S32;
4469 } else {
4470 unsigned EltSize = EltTy.getSizeInBits();
4471 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4472 unsigned RoundedSize = 32 * RoundedElts;
4473 RoundedTy = LLT::scalarOrVector(
4474 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4475 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4476 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4477 }
4478
4479 // The return type does not need adjustment.
4480 // TODO: Should we change s16 case to s32 or <2 x s16>?
4481 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4482 return true;
4483
4484 Register Dst1Reg;
4485
4486 // Insert after the instruction.
4487 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4488
4489 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4490 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4491 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4492 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4493
4494 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4495
4496 MI.getOperand(0).setReg(NewResultReg);
4497
4498 // In the IR, TFE is supposed to be used with a 2 element struct return
4499 // type. The instruction really returns these two values in one contiguous
4500 // register, with one additional dword beyond the loaded data. Rewrite the
4501 // return type to use a single register result.
4502 4503 if (IsTFE) { 4504 Dst1Reg = MI.getOperand(1).getReg(); 4505 if (MRI->getType(Dst1Reg) != S32) 4506 return false; 4507 4508 // TODO: Make sure the TFE operand bit is set. 4509 MI.RemoveOperand(1); 4510 4511 // Handle the easy case that requires no repack instructions. 4512 if (Ty == S32) { 4513 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4514 return true; 4515 } 4516 } 4517 4518 // Now figure out how to copy the new result register back into the old 4519 // result. 4520 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4521 4522 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4523 4524 if (ResultNumRegs == 1) { 4525 assert(!IsTFE); 4526 ResultRegs[0] = NewResultReg; 4527 } else { 4528 // We have to repack into a new vector of some kind. 4529 for (int I = 0; I != NumDataRegs; ++I) 4530 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4531 B.buildUnmerge(ResultRegs, NewResultReg); 4532 4533 // Drop the final TFE element to get the data part. The TFE result is 4534 // directly written to the right place already. 4535 if (IsTFE) 4536 ResultRegs.resize(NumDataRegs); 4537 } 4538 4539 // For an s16 scalar result, we form an s32 result with a truncate regardless 4540 // of packed vs. unpacked. 4541 if (IsD16 && !Ty.isVector()) { 4542 B.buildTrunc(DstReg, ResultRegs[0]); 4543 return true; 4544 } 4545 4546 // Avoid a build/concat_vector of 1 entry. 4547 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4548 B.buildBitcast(DstReg, ResultRegs[0]); 4549 return true; 4550 } 4551 4552 assert(Ty.isVector()); 4553 4554 if (IsD16) { 4555 // For packed D16 results with TFE enabled, all the data components are 4556 // S32. Cast back to the expected type. 4557 // 4558 // TODO: We don't really need to use load s32 elements. We would only need one 4559 // cast for the TFE result if a multiple of v2s16 was used. 4560 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4561 for (Register &Reg : ResultRegs) 4562 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4563 } else if (ST.hasUnpackedD16VMem()) { 4564 for (Register &Reg : ResultRegs) 4565 Reg = B.buildTrunc(S16, Reg).getReg(0); 4566 } 4567 } 4568 4569 auto padWithUndef = [&](LLT Ty, int NumElts) { 4570 if (NumElts == 0) 4571 return; 4572 Register Undef = B.buildUndef(Ty).getReg(0); 4573 for (int I = 0; I != NumElts; ++I) 4574 ResultRegs.push_back(Undef); 4575 }; 4576 4577 // Pad out any elements eliminated due to the dmask. 4578 LLT ResTy = MRI->getType(ResultRegs[0]); 4579 if (!ResTy.isVector()) { 4580 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4581 B.buildBuildVector(DstReg, ResultRegs); 4582 return true; 4583 } 4584 4585 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4586 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4587 4588 // Deal with the one annoying legal case. 
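  // <3 x s16>: pad the v2s16 pieces with undef so they concatenate to a
  // <6 x s16>, then unmerge that into the real result plus a dead <3 x s16>.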
4589 const LLT V3S16 = LLT::fixed_vector(3, 16); 4590 if (Ty == V3S16) { 4591 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4592 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4593 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4594 return true; 4595 } 4596 4597 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4598 B.buildConcatVectors(DstReg, ResultRegs); 4599 return true; 4600 } 4601 4602 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4603 LegalizerHelper &Helper, MachineInstr &MI) const { 4604 MachineIRBuilder &B = Helper.MIRBuilder; 4605 GISelChangeObserver &Observer = Helper.Observer; 4606 4607 Register Dst = MI.getOperand(0).getReg(); 4608 LLT Ty = B.getMRI()->getType(Dst); 4609 unsigned Size = Ty.getSizeInBits(); 4610 MachineFunction &MF = B.getMF(); 4611 4612 Observer.changingInstr(MI); 4613 4614 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4615 Ty = getBitcastRegisterType(Ty); 4616 Helper.bitcastDst(MI, Ty, 0); 4617 Dst = MI.getOperand(0).getReg(); 4618 B.setInsertPt(B.getMBB(), MI); 4619 } 4620 4621 // FIXME: We don't really need this intermediate instruction. The intrinsic 4622 // should be fixed to have a memory operand. Since it's readnone, we're not 4623 // allowed to add one. 4624 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4625 MI.RemoveOperand(1); // Remove intrinsic ID 4626 4627 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4628 // TODO: Should this use datalayout alignment? 4629 const unsigned MemSize = (Size + 7) / 8; 4630 const Align MemAlign(4); 4631 MachineMemOperand *MMO = MF.getMachineMemOperand( 4632 MachinePointerInfo(), 4633 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4634 MachineMemOperand::MOInvariant, 4635 MemSize, MemAlign); 4636 MI.addMemOperand(MF, MMO); 4637 4638 // There are no 96-bit result scalar loads, but widening to 128-bit should 4639 // always be legal. We may need to restore this to a 96-bit result if it turns 4640 // out this needs to be converted to a vector load during RegBankSelect. 4641 if (!isPowerOf2_32(Size)) { 4642 if (Ty.isVector()) 4643 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4644 else 4645 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4646 } 4647 4648 Observer.changedInstr(MI); 4649 return true; 4650 } 4651 4652 // TODO: Move to selection 4653 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4654 MachineRegisterInfo &MRI, 4655 MachineIRBuilder &B) const { 4656 if (!ST.isTrapHandlerEnabled() || 4657 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4658 return legalizeTrapEndpgm(MI, MRI, B); 4659 4660 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4661 switch (*HsaAbiVer) { 4662 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4663 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4664 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4665 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4666 return ST.supportsGetDoorbellID() ? 
4667 legalizeTrapHsa(MI, MRI, B) : 4668 legalizeTrapHsaQueuePtr(MI, MRI, B); 4669 } 4670 } 4671 4672 llvm_unreachable("Unknown trap handler"); 4673 } 4674 4675 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4676 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4677 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4678 MI.eraseFromParent(); 4679 return true; 4680 } 4681 4682 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4683 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4684 // Pass queue pointer to trap handler as input, and insert trap instruction 4685 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4686 Register LiveIn = 4687 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4688 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4689 return false; 4690 4691 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4692 B.buildCopy(SGPR01, LiveIn); 4693 B.buildInstr(AMDGPU::S_TRAP) 4694 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4695 .addReg(SGPR01, RegState::Implicit); 4696 4697 MI.eraseFromParent(); 4698 return true; 4699 } 4700 4701 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4702 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4703 B.buildInstr(AMDGPU::S_TRAP) 4704 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4705 MI.eraseFromParent(); 4706 return true; 4707 } 4708 4709 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4710 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4711 // Is non-HSA path or trap-handler disabled? then, report a warning 4712 // accordingly 4713 if (!ST.isTrapHandlerEnabled() || 4714 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4715 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4716 "debugtrap handler not supported", 4717 MI.getDebugLoc(), DS_Warning); 4718 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4719 Ctx.diagnose(NoTrap); 4720 } else { 4721 // Insert debug-trap instruction 4722 B.buildInstr(AMDGPU::S_TRAP) 4723 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4724 } 4725 4726 MI.eraseFromParent(); 4727 return true; 4728 } 4729 4730 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4731 MachineIRBuilder &B) const { 4732 MachineRegisterInfo &MRI = *B.getMRI(); 4733 const LLT S16 = LLT::scalar(16); 4734 const LLT S32 = LLT::scalar(32); 4735 4736 Register DstReg = MI.getOperand(0).getReg(); 4737 Register NodePtr = MI.getOperand(2).getReg(); 4738 Register RayExtent = MI.getOperand(3).getReg(); 4739 Register RayOrigin = MI.getOperand(4).getReg(); 4740 Register RayDir = MI.getOperand(5).getReg(); 4741 Register RayInvDir = MI.getOperand(6).getReg(); 4742 Register TDescr = MI.getOperand(7).getReg(); 4743 4744 if (!ST.hasGFX10_AEncoding()) { 4745 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4746 "intrinsic not supported on subtarget", 4747 MI.getDebugLoc()); 4748 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4749 return false; 4750 } 4751 4752 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4753 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4754 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4755 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4756 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4757 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4758 4759 SmallVector<Register, 12> Ops; 4760 if (Is64) { 4761 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4762 Ops.push_back(Unmerge.getReg(0)); 4763 Ops.push_back(Unmerge.getReg(1)); 4764 } else { 4765 Ops.push_back(NodePtr); 4766 } 4767 Ops.push_back(RayExtent); 4768 4769 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4770 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4771 Ops.push_back(Unmerge.getReg(0)); 4772 Ops.push_back(Unmerge.getReg(1)); 4773 Ops.push_back(Unmerge.getReg(2)); 4774 }; 4775 4776 packLanes(RayOrigin); 4777 if (IsA16) { 4778 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4779 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4780 Register R1 = MRI.createGenericVirtualRegister(S32); 4781 Register R2 = MRI.createGenericVirtualRegister(S32); 4782 Register R3 = MRI.createGenericVirtualRegister(S32); 4783 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4784 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4785 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4786 Ops.push_back(R1); 4787 Ops.push_back(R2); 4788 Ops.push_back(R3); 4789 } else { 4790 packLanes(RayDir); 4791 packLanes(RayInvDir); 4792 } 4793 4794 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4795 .addDef(DstReg) 4796 .addImm(Opcode); 4797 4798 for (Register R : Ops) { 4799 MIB.addUse(R); 4800 } 4801 4802 MIB.addUse(TDescr) 4803 .addImm(IsA16 ? 1 : 0) 4804 .cloneMemRefs(MI); 4805 4806 MI.eraseFromParent(); 4807 return true; 4808 } 4809 4810 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4811 MachineInstr &MI) const { 4812 MachineIRBuilder &B = Helper.MIRBuilder; 4813 MachineRegisterInfo &MRI = *B.getMRI(); 4814 4815 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4816 auto IntrID = MI.getIntrinsicID(); 4817 switch (IntrID) { 4818 case Intrinsic::amdgcn_if: 4819 case Intrinsic::amdgcn_else: { 4820 MachineInstr *Br = nullptr; 4821 MachineBasicBlock *UncondBrTarget = nullptr; 4822 bool Negated = false; 4823 if (MachineInstr *BrCond = 4824 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4825 const SIRegisterInfo *TRI 4826 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4827 4828 Register Def = MI.getOperand(1).getReg(); 4829 Register Use = MI.getOperand(3).getReg(); 4830 4831 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4832 4833 if (Negated) 4834 std::swap(CondBrTarget, UncondBrTarget); 4835 4836 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4837 if (IntrID == Intrinsic::amdgcn_if) { 4838 B.buildInstr(AMDGPU::SI_IF) 4839 .addDef(Def) 4840 .addUse(Use) 4841 .addMBB(UncondBrTarget); 4842 } else { 4843 B.buildInstr(AMDGPU::SI_ELSE) 4844 .addDef(Def) 4845 .addUse(Use) 4846 .addMBB(UncondBrTarget); 4847 } 4848 4849 if (Br) { 4850 Br->getOperand(0).setMBB(CondBrTarget); 4851 } else { 4852 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4853 // since we're swapping branch targets it needs to be reinserted. 
4854 // FIXME: IRTranslator should probably not do this 4855 B.buildBr(*CondBrTarget); 4856 } 4857 4858 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4859 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4860 MI.eraseFromParent(); 4861 BrCond->eraseFromParent(); 4862 return true; 4863 } 4864 4865 return false; 4866 } 4867 case Intrinsic::amdgcn_loop: { 4868 MachineInstr *Br = nullptr; 4869 MachineBasicBlock *UncondBrTarget = nullptr; 4870 bool Negated = false; 4871 if (MachineInstr *BrCond = 4872 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4873 const SIRegisterInfo *TRI 4874 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4875 4876 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4877 Register Reg = MI.getOperand(2).getReg(); 4878 4879 if (Negated) 4880 std::swap(CondBrTarget, UncondBrTarget); 4881 4882 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4883 B.buildInstr(AMDGPU::SI_LOOP) 4884 .addUse(Reg) 4885 .addMBB(UncondBrTarget); 4886 4887 if (Br) 4888 Br->getOperand(0).setMBB(CondBrTarget); 4889 else 4890 B.buildBr(*CondBrTarget); 4891 4892 MI.eraseFromParent(); 4893 BrCond->eraseFromParent(); 4894 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4895 return true; 4896 } 4897 4898 return false; 4899 } 4900 case Intrinsic::amdgcn_kernarg_segment_ptr: 4901 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4902 // This only makes sense to call in a kernel, so just lower to null. 4903 B.buildConstant(MI.getOperand(0).getReg(), 0); 4904 MI.eraseFromParent(); 4905 return true; 4906 } 4907 4908 return legalizePreloadedArgIntrin( 4909 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4910 case Intrinsic::amdgcn_implicitarg_ptr: 4911 return legalizeImplicitArgPtr(MI, MRI, B); 4912 case Intrinsic::amdgcn_workitem_id_x: 4913 return legalizePreloadedArgIntrin(MI, MRI, B, 4914 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4915 case Intrinsic::amdgcn_workitem_id_y: 4916 return legalizePreloadedArgIntrin(MI, MRI, B, 4917 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4918 case Intrinsic::amdgcn_workitem_id_z: 4919 return legalizePreloadedArgIntrin(MI, MRI, B, 4920 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4921 case Intrinsic::amdgcn_workgroup_id_x: 4922 return legalizePreloadedArgIntrin(MI, MRI, B, 4923 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4924 case Intrinsic::amdgcn_workgroup_id_y: 4925 return legalizePreloadedArgIntrin(MI, MRI, B, 4926 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4927 case Intrinsic::amdgcn_workgroup_id_z: 4928 return legalizePreloadedArgIntrin(MI, MRI, B, 4929 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4930 case Intrinsic::amdgcn_dispatch_ptr: 4931 return legalizePreloadedArgIntrin(MI, MRI, B, 4932 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4933 case Intrinsic::amdgcn_queue_ptr: 4934 return legalizePreloadedArgIntrin(MI, MRI, B, 4935 AMDGPUFunctionArgInfo::QUEUE_PTR); 4936 case Intrinsic::amdgcn_implicit_buffer_ptr: 4937 return legalizePreloadedArgIntrin( 4938 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4939 case Intrinsic::amdgcn_dispatch_id: 4940 return legalizePreloadedArgIntrin(MI, MRI, B, 4941 AMDGPUFunctionArgInfo::DISPATCH_ID); 4942 case Intrinsic::amdgcn_fdiv_fast: 4943 return legalizeFDIVFastIntrin(MI, MRI, B); 4944 case Intrinsic::amdgcn_is_shared: 4945 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4946 case Intrinsic::amdgcn_is_private: 4947 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4948 case Intrinsic::amdgcn_wavefrontsize: { 4949 
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
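  // Repacks the node pointer, ray origin/direction/inverse-direction and
  // descriptor operands into the flat list of 32-bit registers expected by
  // G_AMDGPU_INTRIN_BVH_INTERSECT_RAY (see legalizeBVHIntrinsic above).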
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}