//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
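
// For example, getPow2VectorType turns <3 x s16> into <4 x s16>, and
// getPow2ScalarType turns s48 into s64 (illustrative, derived from the
// helpers above).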

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }
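
  // Anything larger is split into 32-bit pieces, e.g. <6 x s16> and s96 both
  // become <3 x s32>, and <8 x s16> becomes <4 x s32>.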
  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
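
// Illustrations of the checks below: extending loads are only accepted when
// they produce a 32-bit result (e.g. a 16-bit extload into s64 is rejected),
// and a 96-bit access is only accepted when the subtarget has dwordx3
// load/stores.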
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
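/// For example, a 48-bit load that is known to be 8-byte aligned may be
/// widened to a 64-bit load, provided the check below reports the widened
/// access as allowed and fast.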
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
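
  // For reference (assuming the usual AMDGPU data layout): the flat, global
  // and constant pointers above are 64-bit, while the local, region, private
  // and 32-bit constant pointers are 32-bit.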

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
857 // 858 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 859 // bank. 860 .legalForCartesianProduct( 861 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 862 .legalForCartesianProduct( 863 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 864 if (ST.has16BitInsts()) { 865 CmpBuilder.legalFor({{S1, S16}}); 866 } 867 868 CmpBuilder 869 .widenScalarToNextPow2(1) 870 .clampScalar(1, S32, S64) 871 .scalarize(0) 872 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 873 874 getActionDefinitionsBuilder(G_FCMP) 875 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 876 .widenScalarToNextPow2(1) 877 .clampScalar(1, S32, S64) 878 .scalarize(0); 879 880 // FIXME: fpow has a selection pattern that should move to custom lowering. 881 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 882 if (ST.has16BitInsts()) 883 Exp2Ops.legalFor({S32, S16}); 884 else 885 Exp2Ops.legalFor({S32}); 886 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 887 Exp2Ops.scalarize(0); 888 889 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 890 if (ST.has16BitInsts()) 891 ExpOps.customFor({{S32}, {S16}}); 892 else 893 ExpOps.customFor({S32}); 894 ExpOps.clampScalar(0, MinScalarFPTy, S32) 895 .scalarize(0); 896 897 getActionDefinitionsBuilder(G_FPOWI) 898 .clampScalar(0, MinScalarFPTy, S32) 899 .lower(); 900 901 // The 64-bit versions produce 32-bit results, but only on the SALU. 902 getActionDefinitionsBuilder(G_CTPOP) 903 .legalFor({{S32, S32}, {S32, S64}}) 904 .clampScalar(0, S32, S32) 905 .clampScalar(1, S32, S64) 906 .scalarize(0) 907 .widenScalarToNextPow2(0, 32) 908 .widenScalarToNextPow2(1, 32); 909 910 // The hardware instructions return a different result on 0 than the generic 911 // instructions expect. The hardware produces -1, but these produce the 912 // bitwidth. 913 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 914 .scalarize(0) 915 .clampScalar(0, S32, S32) 916 .clampScalar(1, S32, S64) 917 .widenScalarToNextPow2(0, 32) 918 .widenScalarToNextPow2(1, 32) 919 .lower(); 920 921 // The 64-bit versions produce 32-bit results, but only on the SALU. 922 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 923 .legalFor({{S32, S32}, {S32, S64}}) 924 .clampScalar(0, S32, S32) 925 .clampScalar(1, S32, S64) 926 .scalarize(0) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32); 929 930 getActionDefinitionsBuilder(G_BITREVERSE) 931 .legalFor({S32}) 932 .clampScalar(0, S32, S32) 933 .scalarize(0); 934 935 if (ST.has16BitInsts()) { 936 getActionDefinitionsBuilder(G_BSWAP) 937 .legalFor({S16, S32, V2S16}) 938 .clampMaxNumElements(0, S16, 2) 939 // FIXME: Fixing non-power-of-2 before clamp is workaround for 940 // narrowScalar limitation. 
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
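      // e.g. a 5-dword (160-bit) access is flagged for splitting here, since
      // five 32-bit registers are not a power of 2.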
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
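                // e.g. on subtargets without 96-bit memory operations, a
                // <3 x s32> access is narrowed to <2 x s32> here, and the
                // leftover s32 piece is legalized separately.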
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B); 1686 case TargetOpcode::G_BUILD_VECTOR: 1687 return legalizeBuildVector(MI, MRI, B); 1688 default: 1689 return false; 1690 } 1691 1692 llvm_unreachable("expected switch to return"); 1693 } 1694 1695 Register AMDGPULegalizerInfo::getSegmentAperture( 1696 unsigned AS, 1697 MachineRegisterInfo &MRI, 1698 MachineIRBuilder &B) const { 1699 MachineFunction &MF = B.getMF(); 1700 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1701 const LLT S32 = LLT::scalar(32); 1702 1703 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1704 1705 if (ST.hasApertureRegs()) { 1706 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1707 // getreg. 1708 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1709 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1710 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1711 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1712 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1713 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1714 unsigned Encoding = 1715 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1716 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1717 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1718 1719 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1720 1721 B.buildInstr(AMDGPU::S_GETREG_B32) 1722 .addDef(GetReg) 1723 .addImm(Encoding); 1724 MRI.setType(GetReg, S32); 1725 1726 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1727 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1728 } 1729 1730 Register QueuePtr = MRI.createGenericVirtualRegister( 1731 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1732 1733 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1734 return Register(); 1735 1736 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1737 // private_segment_aperture_base_hi. 1738 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1739 1740 // TODO: can we be smarter about machine pointer info? 1741 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1742 MachineMemOperand *MMO = MF.getMachineMemOperand( 1743 PtrInfo, 1744 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1745 MachineMemOperand::MOInvariant, 1746 4, commonAlignment(Align(64), StructOffset)); 1747 1748 Register LoadAddr; 1749 1750 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1751 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1752 } 1753 1754 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1755 MachineInstr &MI, MachineRegisterInfo &MRI, 1756 MachineIRBuilder &B) const { 1757 MachineFunction &MF = B.getMF(); 1758 1759 const LLT S32 = LLT::scalar(32); 1760 Register Dst = MI.getOperand(0).getReg(); 1761 Register Src = MI.getOperand(1).getReg(); 1762 1763 LLT DstTy = MRI.getType(Dst); 1764 LLT SrcTy = MRI.getType(Src); 1765 unsigned DestAS = DstTy.getAddressSpace(); 1766 unsigned SrcAS = SrcTy.getAddressSpace(); 1767 1768 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1769 // vector element. 1770 assert(!DstTy.isVector()); 1771 1772 const AMDGPUTargetMachine &TM 1773 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1774 1775 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1776 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1777 return true; 1778 } 1779 1780 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1781 // Truncate. 
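    // A cast into the 32-bit constant address space only needs the low 32 bits
    // of the 64-bit source pointer, which is exactly what the G_EXTRACT at bit
    // offset 0 produces.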
1782 B.buildExtract(Dst, Src, 0); 1783 MI.eraseFromParent(); 1784 return true; 1785 } 1786 1787 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1788 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1789 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1790 1791 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1792 // another. Merge operands are required to be the same type, but creating an 1793 // extra ptrtoint would be kind of pointless. 1794 auto HighAddr = B.buildConstant( 1795 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1796 B.buildMerge(Dst, {Src, HighAddr}); 1797 MI.eraseFromParent(); 1798 return true; 1799 } 1800 1801 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1802 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1803 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1804 unsigned NullVal = TM.getNullPointerValue(DestAS); 1805 1806 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1807 auto FlatNull = B.buildConstant(SrcTy, 0); 1808 1809 // Extract low 32-bits of the pointer. 1810 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1811 1812 auto CmpRes = 1813 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1814 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1815 1816 MI.eraseFromParent(); 1817 return true; 1818 } 1819 1820 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1821 return false; 1822 1823 if (!ST.hasFlatAddressSpace()) 1824 return false; 1825 1826 auto SegmentNull = 1827 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1828 auto FlatNull = 1829 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1830 1831 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1832 if (!ApertureReg.isValid()) 1833 return false; 1834 1835 auto CmpRes = 1836 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1837 1838 // Coerce the type of the low half of the result so we can use merge_values. 1839 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1840 1841 // TODO: Should we allow mismatched types but matching sizes in merges to 1842 // avoid the ptrtoint? 1843 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1844 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1845 1846 MI.eraseFromParent(); 1847 return true; 1848 } 1849 1850 bool AMDGPULegalizerInfo::legalizeFrint( 1851 MachineInstr &MI, MachineRegisterInfo &MRI, 1852 MachineIRBuilder &B) const { 1853 Register Src = MI.getOperand(1).getReg(); 1854 LLT Ty = MRI.getType(Src); 1855 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1856 1857 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1858 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1859 1860 auto C1 = B.buildFConstant(Ty, C1Val); 1861 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1862 1863 // TODO: Should this propagate fast-math-flags? 
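  // Round-to-nearest-even via the 2^52 trick: adding and then subtracting
  // copysign(2^52, x) forces the f64 mantissa to drop its fraction bits. The
  // |x| > 0x1.fffffffffffffp+51 compare below leaves values that are already
  // too large to carry a fraction unchanged.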
1864 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1865 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1866 1867 auto C2 = B.buildFConstant(Ty, C2Val); 1868 auto Fabs = B.buildFAbs(Ty, Src); 1869 1870 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1871 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1872 MI.eraseFromParent(); 1873 return true; 1874 } 1875 1876 bool AMDGPULegalizerInfo::legalizeFceil( 1877 MachineInstr &MI, MachineRegisterInfo &MRI, 1878 MachineIRBuilder &B) const { 1879 1880 const LLT S1 = LLT::scalar(1); 1881 const LLT S64 = LLT::scalar(64); 1882 1883 Register Src = MI.getOperand(1).getReg(); 1884 assert(MRI.getType(Src) == S64); 1885 1886 // result = trunc(src) 1887 // if (src > 0.0 && src != result) 1888 // result += 1.0 1889 1890 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1891 1892 const auto Zero = B.buildFConstant(S64, 0.0); 1893 const auto One = B.buildFConstant(S64, 1.0); 1894 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1895 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1896 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1897 auto Add = B.buildSelect(S64, And, One, Zero); 1898 1899 // TODO: Should this propagate fast-math-flags? 1900 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1901 return true; 1902 } 1903 1904 bool AMDGPULegalizerInfo::legalizeFrem( 1905 MachineInstr &MI, MachineRegisterInfo &MRI, 1906 MachineIRBuilder &B) const { 1907 Register DstReg = MI.getOperand(0).getReg(); 1908 Register Src0Reg = MI.getOperand(1).getReg(); 1909 Register Src1Reg = MI.getOperand(2).getReg(); 1910 auto Flags = MI.getFlags(); 1911 LLT Ty = MRI.getType(DstReg); 1912 1913 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1914 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1915 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1916 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1917 MI.eraseFromParent(); 1918 return true; 1919 } 1920 1921 static MachineInstrBuilder extractF64Exponent(Register Hi, 1922 MachineIRBuilder &B) { 1923 const unsigned FractBits = 52; 1924 const unsigned ExpBits = 11; 1925 LLT S32 = LLT::scalar(32); 1926 1927 auto Const0 = B.buildConstant(S32, FractBits - 32); 1928 auto Const1 = B.buildConstant(S32, ExpBits); 1929 1930 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1931 .addUse(Hi) 1932 .addUse(Const0.getReg(0)) 1933 .addUse(Const1.getReg(0)); 1934 1935 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1936 } 1937 1938 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1939 MachineInstr &MI, MachineRegisterInfo &MRI, 1940 MachineIRBuilder &B) const { 1941 const LLT S1 = LLT::scalar(1); 1942 const LLT S32 = LLT::scalar(32); 1943 const LLT S64 = LLT::scalar(64); 1944 1945 Register Src = MI.getOperand(1).getReg(); 1946 assert(MRI.getType(Src) == S64); 1947 1948 // TODO: Should this use extract since the low half is unused? 1949 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1950 Register Hi = Unmerge.getReg(1); 1951 1952 // Extract the upper half, since this is where we will find the sign and 1953 // exponent. 1954 auto Exp = extractF64Exponent(Hi, B); 1955 1956 const unsigned FractBits = 52; 1957 1958 // Extract the sign bit. 1959 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1960 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1961 1962 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1963 1964 const auto Zero32 = B.buildConstant(S32, 0); 1965 1966 // Extend back to 64-bits. 
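  // Merging {Zero32, SignBit} rebuilds a 64-bit value that is either zero or
  // just the sign bit of Src (i.e. +/-0.0). The mask built below
  // (FractMask ashr Exp, inverted) keeps only the mantissa bits that are
  // integral for this exponent; the final selects handle Exp < 0 (result is
  // +/-0) and Exp > 51 (Src is already integral).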
1967 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1968 1969 auto Shr = B.buildAShr(S64, FractMask, Exp); 1970 auto Not = B.buildNot(S64, Shr); 1971 auto Tmp0 = B.buildAnd(S64, Src, Not); 1972 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1973 1974 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1975 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1976 1977 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1978 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 bool AMDGPULegalizerInfo::legalizeITOFP( 1984 MachineInstr &MI, MachineRegisterInfo &MRI, 1985 MachineIRBuilder &B, bool Signed) const { 1986 1987 Register Dst = MI.getOperand(0).getReg(); 1988 Register Src = MI.getOperand(1).getReg(); 1989 1990 const LLT S64 = LLT::scalar(64); 1991 const LLT S32 = LLT::scalar(32); 1992 1993 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1994 1995 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1996 1997 auto CvtHi = Signed ? 1998 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1999 B.buildUITOFP(S64, Unmerge.getReg(1)); 2000 2001 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2002 2003 auto ThirtyTwo = B.buildConstant(S32, 32); 2004 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2005 .addUse(CvtHi.getReg(0)) 2006 .addUse(ThirtyTwo.getReg(0)); 2007 2008 // TODO: Should this propagate fast-math-flags? 2009 B.buildFAdd(Dst, LdExp, CvtLo); 2010 MI.eraseFromParent(); 2011 return true; 2012 } 2013 2014 // TODO: Copied from DAG implementation. Verify logic and document how this 2015 // actually works. 2016 bool AMDGPULegalizerInfo::legalizeFPTOI( 2017 MachineInstr &MI, MachineRegisterInfo &MRI, 2018 MachineIRBuilder &B, bool Signed) const { 2019 2020 Register Dst = MI.getOperand(0).getReg(); 2021 Register Src = MI.getOperand(1).getReg(); 2022 2023 const LLT S64 = LLT::scalar(64); 2024 const LLT S32 = LLT::scalar(32); 2025 2026 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2027 2028 unsigned Flags = MI.getFlags(); 2029 2030 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2031 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2032 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2033 2034 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2035 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2036 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2037 2038 auto Hi = Signed ? 
2039 B.buildFPTOSI(S32, FloorMul) : 2040 B.buildFPTOUI(S32, FloorMul); 2041 auto Lo = B.buildFPTOUI(S32, Fma); 2042 2043 B.buildMerge(Dst, { Lo, Hi }); 2044 MI.eraseFromParent(); 2045 2046 return true; 2047 } 2048 2049 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2050 MachineInstr &MI) const { 2051 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2052 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2053 2054 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2055 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2056 2057 // With ieee_mode disabled, the instructions have the correct behavior 2058 // already for G_FMINNUM/G_FMAXNUM 2059 if (!MFI->getMode().IEEE) 2060 return !IsIEEEOp; 2061 2062 if (IsIEEEOp) 2063 return true; 2064 2065 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2066 } 2067 2068 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2069 MachineInstr &MI, MachineRegisterInfo &MRI, 2070 MachineIRBuilder &B) const { 2071 // TODO: Should move some of this into LegalizerHelper. 2072 2073 // TODO: Promote dynamic indexing of s16 to s32 2074 2075 // FIXME: Artifact combiner probably should have replaced the truncated 2076 // constant before this, so we shouldn't need 2077 // getConstantVRegValWithLookThrough. 2078 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2079 MI.getOperand(2).getReg(), MRI); 2080 if (!IdxVal) // Dynamic case will be selected to register indexing. 2081 return true; 2082 2083 Register Dst = MI.getOperand(0).getReg(); 2084 Register Vec = MI.getOperand(1).getReg(); 2085 2086 LLT VecTy = MRI.getType(Vec); 2087 LLT EltTy = VecTy.getElementType(); 2088 assert(EltTy == MRI.getType(Dst)); 2089 2090 if (IdxVal->Value < VecTy.getNumElements()) 2091 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2092 else 2093 B.buildUndef(Dst); 2094 2095 MI.eraseFromParent(); 2096 return true; 2097 } 2098 2099 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2100 MachineInstr &MI, MachineRegisterInfo &MRI, 2101 MachineIRBuilder &B) const { 2102 // TODO: Should move some of this into LegalizerHelper. 2103 2104 // TODO: Promote dynamic indexing of s16 to s32 2105 2106 // FIXME: Artifact combiner probably should have replaced the truncated 2107 // constant before this, so we shouldn't need 2108 // getConstantVRegValWithLookThrough. 2109 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2110 MI.getOperand(3).getReg(), MRI); 2111 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2112 return true; 2113 2114 Register Dst = MI.getOperand(0).getReg(); 2115 Register Vec = MI.getOperand(1).getReg(); 2116 Register Ins = MI.getOperand(2).getReg(); 2117 2118 LLT VecTy = MRI.getType(Vec); 2119 LLT EltTy = VecTy.getElementType(); 2120 assert(EltTy == MRI.getType(Ins)); 2121 2122 if (IdxVal->Value < VecTy.getNumElements()) 2123 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2124 else 2125 B.buildUndef(Dst); 2126 2127 MI.eraseFromParent(); 2128 return true; 2129 } 2130 2131 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2132 MachineInstr &MI, MachineRegisterInfo &MRI, 2133 MachineIRBuilder &B) const { 2134 const LLT V2S16 = LLT::vector(2, 16); 2135 2136 Register Dst = MI.getOperand(0).getReg(); 2137 Register Src0 = MI.getOperand(1).getReg(); 2138 LLT DstTy = MRI.getType(Dst); 2139 LLT SrcTy = MRI.getType(Src0); 2140 2141 if (SrcTy == V2S16 && DstTy == V2S16 && 2142 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2143 return true; 2144 2145 MachineIRBuilder HelperBuilder(MI); 2146 GISelObserverWrapper DummyObserver; 2147 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2148 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2149 } 2150 2151 bool AMDGPULegalizerInfo::legalizeSinCos( 2152 MachineInstr &MI, MachineRegisterInfo &MRI, 2153 MachineIRBuilder &B) const { 2154 2155 Register DstReg = MI.getOperand(0).getReg(); 2156 Register SrcReg = MI.getOperand(1).getReg(); 2157 LLT Ty = MRI.getType(DstReg); 2158 unsigned Flags = MI.getFlags(); 2159 2160 Register TrigVal; 2161 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2162 if (ST.hasTrigReducedRange()) { 2163 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2164 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2165 .addUse(MulVal.getReg(0)) 2166 .setMIFlags(Flags).getReg(0); 2167 } else 2168 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2169 2170 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2171 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2172 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2173 .addUse(TrigVal) 2174 .setMIFlags(Flags); 2175 MI.eraseFromParent(); 2176 return true; 2177 } 2178 2179 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2180 MachineIRBuilder &B, 2181 const GlobalValue *GV, 2182 int64_t Offset, 2183 unsigned GAFlags) const { 2184 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2185 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2186 // to the following code sequence: 2187 // 2188 // For constant address space: 2189 // s_getpc_b64 s[0:1] 2190 // s_add_u32 s0, s0, $symbol 2191 // s_addc_u32 s1, s1, 0 2192 // 2193 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2194 // a fixup or relocation is emitted to replace $symbol with a literal 2195 // constant, which is a pc-relative offset from the encoding of the $symbol 2196 // operand to the global variable. 
2197 // 2198 // For global address space: 2199 // s_getpc_b64 s[0:1] 2200 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2201 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2202 // 2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2204 // fixups or relocations are emitted to replace $symbol@*@lo and 2205 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2206 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2207 // operand to the global variable. 2208 // 2209 // What we want here is an offset from the value returned by s_getpc 2210 // (which is the address of the s_add_u32 instruction) to the global 2211 // variable, but since the encoding of $symbol starts 4 bytes after the start 2212 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2213 // small. This requires us to add 4 to the global variable offset in order to 2214 // compute the correct address. 2215 2216 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2217 2218 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2219 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2220 2221 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2222 .addDef(PCReg); 2223 2224 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2225 if (GAFlags == SIInstrInfo::MO_NONE) 2226 MIB.addImm(0); 2227 else 2228 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2229 2230 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2231 2232 if (PtrTy.getSizeInBits() == 32) 2233 B.buildExtract(DstReg, PCReg, 0); 2234 return true; 2235 } 2236 2237 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2238 MachineInstr &MI, MachineRegisterInfo &MRI, 2239 MachineIRBuilder &B) const { 2240 Register DstReg = MI.getOperand(0).getReg(); 2241 LLT Ty = MRI.getType(DstReg); 2242 unsigned AS = Ty.getAddressSpace(); 2243 2244 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2245 MachineFunction &MF = B.getMF(); 2246 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2247 2248 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2249 if (!MFI->isEntryFunction()) { 2250 const Function &Fn = MF.getFunction(); 2251 DiagnosticInfoUnsupported BadLDSDecl( 2252 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2253 DS_Warning); 2254 Fn.getContext().diagnose(BadLDSDecl); 2255 2256 // We currently don't have a way to correctly allocate LDS objects that 2257 // aren't directly associated with a kernel. We do force inlining of 2258 // functions that use local objects. However, if these dead functions are 2259 // not eliminated, we don't want a compile time error. Just emit a warning 2260 // and a trap, since there should be no callable path here. 2261 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2262 B.buildUndef(DstReg); 2263 MI.eraseFromParent(); 2264 return true; 2265 } 2266 2267 // TODO: We could emit code to handle the initialization somewhere. 
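    // Only LDS globals without a defined initializer can be handled: they are
    // materialized below as an offset into the kernel's LDS block (or as the
    // group static size for the dynamic, zero-sized case). A global with a
    // defined initializer falls through to the diagnostic further down.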
2268 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2269 const SITargetLowering *TLI = ST.getTargetLowering(); 2270 if (!TLI->shouldUseLDSConstAddress(GV)) { 2271 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2272 return true; // Leave in place. 2273 } 2274 2275 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2276 Type *Ty = GV->getValueType(); 2277 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2278 // zero-sized type in other languages to declare the dynamic shared 2279 // memory whose size is not known at compile time. They will be 2280 // allocated by the runtime and placed directly after the statically 2281 // allocated ones. They all share the same offset. 2282 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2283 // Adjust alignment for that dynamic shared memory array. 2284 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2285 LLT S32 = LLT::scalar(32); 2286 auto Sz = 2287 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2288 B.buildIntToPtr(DstReg, Sz); 2289 MI.eraseFromParent(); 2290 return true; 2291 } 2292 } 2293 2294 B.buildConstant( 2295 DstReg, 2296 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2297 MI.eraseFromParent(); 2298 return true; 2299 } 2300 2301 const Function &Fn = MF.getFunction(); 2302 DiagnosticInfoUnsupported BadInit( 2303 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2304 Fn.getContext().diagnose(BadInit); 2305 return true; 2306 } 2307 2308 const SITargetLowering *TLI = ST.getTargetLowering(); 2309 2310 if (TLI->shouldEmitFixup(GV)) { 2311 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2312 MI.eraseFromParent(); 2313 return true; 2314 } 2315 2316 if (TLI->shouldEmitPCReloc(GV)) { 2317 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2318 MI.eraseFromParent(); 2319 return true; 2320 } 2321 2322 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2323 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2324 2325 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2326 MachinePointerInfo::getGOT(MF), 2327 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2328 MachineMemOperand::MOInvariant, 2329 8 /*Size*/, Align(8)); 2330 2331 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2332 2333 if (Ty.getSizeInBits() == 32) { 2334 // Truncate if this is a 32-bit constant address.
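    // The GOT entry is always a 64-bit constant-address pointer, so load the
    // full entry and keep only its low half when the destination pointer is
    // 32 bits wide.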
2335 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2336 B.buildExtract(DstReg, Load, 0); 2337 } else 2338 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2339 2340 MI.eraseFromParent(); 2341 return true; 2342 } 2343 2344 static LLT widenToNextPowerOf2(LLT Ty) { 2345 if (Ty.isVector()) 2346 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2347 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2348 } 2349 2350 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2351 MachineInstr &MI) const { 2352 MachineIRBuilder &B = Helper.MIRBuilder; 2353 MachineRegisterInfo &MRI = *B.getMRI(); 2354 GISelChangeObserver &Observer = Helper.Observer; 2355 2356 Register PtrReg = MI.getOperand(1).getReg(); 2357 LLT PtrTy = MRI.getType(PtrReg); 2358 unsigned AddrSpace = PtrTy.getAddressSpace(); 2359 2360 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2361 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2362 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2363 Observer.changingInstr(MI); 2364 MI.getOperand(1).setReg(Cast.getReg(0)); 2365 Observer.changedInstr(MI); 2366 return true; 2367 } 2368 2369 Register ValReg = MI.getOperand(0).getReg(); 2370 LLT ValTy = MRI.getType(ValReg); 2371 2372 MachineMemOperand *MMO = *MI.memoperands_begin(); 2373 const unsigned ValSize = ValTy.getSizeInBits(); 2374 const unsigned MemSize = 8 * MMO->getSize(); 2375 const Align MemAlign = MMO->getAlign(); 2376 const unsigned AlignInBits = 8 * MemAlign.value(); 2377 2378 // Widen non-power-of-2 loads to the alignment if needed 2379 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2380 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2381 2382 // This was already the correct extending load result type, so just adjust 2383 // the memory type. 2384 if (WideMemSize == ValSize) { 2385 MachineFunction &MF = B.getMF(); 2386 2387 MachineMemOperand *WideMMO = 2388 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2389 Observer.changingInstr(MI); 2390 MI.setMemRefs(MF, {WideMMO}); 2391 Observer.changedInstr(MI); 2392 return true; 2393 } 2394 2395 // Don't bother handling edge case that should probably never be produced. 2396 if (ValSize > WideMemSize) 2397 return false; 2398 2399 LLT WideTy = widenToNextPowerOf2(ValTy); 2400 2401 Register WideLoad; 2402 if (!WideTy.isVector()) { 2403 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2404 B.buildTrunc(ValReg, WideLoad).getReg(0); 2405 } else { 2406 // Extract the subvector. 2407 2408 if (isRegisterType(ValTy)) { 2409 // If this a case where G_EXTRACT is legal, use it. 2410 // (e.g. <3 x s32> -> <4 x s32>) 2411 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2412 B.buildExtract(ValReg, WideLoad, 0); 2413 } else { 2414 // For cases where the widened type isn't a nice register value, unmerge 2415 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2416 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2417 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2418 B.setInsertPt(B.getMBB(), MI.getIterator()); 2419 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2420 } 2421 } 2422 2423 MI.eraseFromParent(); 2424 return true; 2425 } 2426 2427 return false; 2428 } 2429 2430 bool AMDGPULegalizerInfo::legalizeFMad( 2431 MachineInstr &MI, MachineRegisterInfo &MRI, 2432 MachineIRBuilder &B) const { 2433 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2434 assert(Ty.isScalar()); 2435 2436 MachineFunction &MF = B.getMF(); 2437 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2438 2439 // TODO: Always legal with future ftz flag. 2440 // FIXME: Do we need just output? 2441 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2442 return true; 2443 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2444 return true; 2445 2446 MachineIRBuilder HelperBuilder(MI); 2447 GISelObserverWrapper DummyObserver; 2448 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2449 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2450 } 2451 2452 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2453 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2454 Register DstReg = MI.getOperand(0).getReg(); 2455 Register PtrReg = MI.getOperand(1).getReg(); 2456 Register CmpVal = MI.getOperand(2).getReg(); 2457 Register NewVal = MI.getOperand(3).getReg(); 2458 2459 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2460 "this should not have been custom lowered"); 2461 2462 LLT ValTy = MRI.getType(CmpVal); 2463 LLT VecTy = LLT::vector(2, ValTy); 2464 2465 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2466 2467 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2468 .addDef(DstReg) 2469 .addUse(PtrReg) 2470 .addUse(PackedVal) 2471 .setMemRefs(MI.memoperands()); 2472 2473 MI.eraseFromParent(); 2474 return true; 2475 } 2476 2477 bool AMDGPULegalizerInfo::legalizeFlog( 2478 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2479 Register Dst = MI.getOperand(0).getReg(); 2480 Register Src = MI.getOperand(1).getReg(); 2481 LLT Ty = B.getMRI()->getType(Dst); 2482 unsigned Flags = MI.getFlags(); 2483 2484 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2485 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2486 2487 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2488 MI.eraseFromParent(); 2489 return true; 2490 } 2491 2492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2493 MachineIRBuilder &B) const { 2494 Register Dst = MI.getOperand(0).getReg(); 2495 Register Src = MI.getOperand(1).getReg(); 2496 unsigned Flags = MI.getFlags(); 2497 LLT Ty = B.getMRI()->getType(Dst); 2498 2499 auto K = B.buildFConstant(Ty, numbers::log2e); 2500 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2501 B.buildFExp2(Dst, Mul, Flags); 2502 MI.eraseFromParent(); 2503 return true; 2504 } 2505 2506 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2507 MachineIRBuilder &B) const { 2508 Register Dst = MI.getOperand(0).getReg(); 2509 Register Src0 = MI.getOperand(1).getReg(); 2510 Register Src1 = MI.getOperand(2).getReg(); 2511 unsigned Flags = MI.getFlags(); 2512 LLT Ty = B.getMRI()->getType(Dst); 2513 const LLT S16 = LLT::scalar(16); 2514 const LLT S32 = LLT::scalar(32); 2515 2516 if (Ty == S32) { 2517 auto Log = B.buildFLog2(S32, Src0, Flags); 2518 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2519 .addUse(Log.getReg(0)) 2520 .addUse(Src1) 2521 .setMIFlags(Flags); 2522 B.buildFExp2(Dst, Mul, Flags); 2523 } else if (Ty == S16) { 2524 // There's no f16 fmul_legacy, so we need to convert for it. 2525 auto Log = B.buildFLog2(S16, Src0, Flags); 2526 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2527 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2528 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2529 .addUse(Ext0.getReg(0)) 2530 .addUse(Ext1.getReg(0)) 2531 .setMIFlags(Flags); 2532 2533 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2534 } else 2535 return false; 2536 2537 MI.eraseFromParent(); 2538 return true; 2539 } 2540 2541 // Find a source register, ignoring any possible source modifiers. 2542 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2543 Register ModSrc = OrigSrc; 2544 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2545 ModSrc = SrcFNeg->getOperand(1).getReg(); 2546 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2547 ModSrc = SrcFAbs->getOperand(1).getReg(); 2548 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2549 ModSrc = SrcFAbs->getOperand(1).getReg(); 2550 return ModSrc; 2551 } 2552 2553 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2554 MachineRegisterInfo &MRI, 2555 MachineIRBuilder &B) const { 2556 2557 const LLT S1 = LLT::scalar(1); 2558 const LLT S64 = LLT::scalar(64); 2559 Register Dst = MI.getOperand(0).getReg(); 2560 Register OrigSrc = MI.getOperand(1).getReg(); 2561 unsigned Flags = MI.getFlags(); 2562 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2563 "this should not have been custom lowered"); 2564 2565 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2566 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2567 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2568 // V_FRACT bug is: 2569 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2570 // 2571 // Convert floor(x) to (x - fract(x)) 2572 2573 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2574 .addUse(OrigSrc) 2575 .setMIFlags(Flags); 2576 2577 // Give source modifier matching some assistance before obscuring a foldable 2578 // pattern. 2579 2580 // TODO: We can avoid the neg on the fract? The input sign to fract 2581 // shouldn't matter? 2582 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2583 2584 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2585 2586 Register Min = MRI.createGenericVirtualRegister(S64); 2587 2588 // We don't need to concern ourselves with the snan handling difference, so 2589 // use the one which will directly select. 
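  // (As with legalizeMinNumMaxNum above: G_FMINNUM_IEEE is the directly
  // selectable form when the function runs in IEEE mode, and plain G_FMINNUM
  // when it does not.)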
2590 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2591 if (MFI->getMode().IEEE) 2592 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2593 else 2594 B.buildFMinNum(Min, Fract, Const, Flags); 2595 2596 Register CorrectedFract = Min; 2597 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2598 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2599 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2600 } 2601 2602 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2603 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2604 2605 MI.eraseFromParent(); 2606 return true; 2607 } 2608 2609 // Turn an illegal packed v2s16 build vector into bit operations. 2610 // TODO: This should probably be a bitcast action in LegalizerHelper. 2611 bool AMDGPULegalizerInfo::legalizeBuildVector( 2612 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2613 Register Dst = MI.getOperand(0).getReg(); 2614 const LLT S32 = LLT::scalar(32); 2615 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2616 2617 Register Src0 = MI.getOperand(1).getReg(); 2618 Register Src1 = MI.getOperand(2).getReg(); 2619 assert(MRI.getType(Src0) == LLT::scalar(16)); 2620 2621 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2622 B.buildBitcast(Dst, Merge); 2623 2624 MI.eraseFromParent(); 2625 return true; 2626 } 2627 2628 // Return the use branch instruction, otherwise null if the usage is invalid. 2629 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2630 MachineRegisterInfo &MRI, 2631 MachineInstr *&Br, 2632 MachineBasicBlock *&UncondBrTarget) { 2633 Register CondDef = MI.getOperand(0).getReg(); 2634 if (!MRI.hasOneNonDBGUse(CondDef)) 2635 return nullptr; 2636 2637 MachineBasicBlock *Parent = MI.getParent(); 2638 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2639 if (UseMI.getParent() != Parent || 2640 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2641 return nullptr; 2642 2643 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2644 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2645 if (Next == Parent->end()) { 2646 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2647 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2648 return nullptr; 2649 UncondBrTarget = &*NextMBB; 2650 } else { 2651 if (Next->getOpcode() != AMDGPU::G_BR) 2652 return nullptr; 2653 Br = &*Next; 2654 UncondBrTarget = Br->getOperand(0).getMBB(); 2655 } 2656 2657 return &UseMI; 2658 } 2659 2660 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2661 const ArgDescriptor *Arg, 2662 const TargetRegisterClass *ArgRC, 2663 LLT ArgTy) const { 2664 MCRegister SrcReg = Arg->getRegister(); 2665 assert(SrcReg.isPhysical() && "Physical register expected"); 2666 assert(DstReg.isVirtual() && "Virtual register expected"); 2667 2668 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2669 ArgTy); 2670 if (Arg->isMasked()) { 2671 // TODO: Should we try to emit this once in the entry block? 
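    // A masked argument shares its 32-bit input register with other values
    // (e.g. packed work-item ID components), so shift the field down to bit 0
    // and mask off the neighbouring fields.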
2672 const LLT S32 = LLT::scalar(32); 2673 const unsigned Mask = Arg->getMask(); 2674 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2675 2676 Register AndMaskSrc = LiveIn; 2677 2678 if (Shift != 0) { 2679 auto ShiftAmt = B.buildConstant(S32, Shift); 2680 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2681 } 2682 2683 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2684 } else { 2685 B.buildCopy(DstReg, LiveIn); 2686 } 2687 2688 return true; 2689 } 2690 2691 bool AMDGPULegalizerInfo::loadInputValue( 2692 Register DstReg, MachineIRBuilder &B, 2693 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2694 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2695 const ArgDescriptor *Arg; 2696 const TargetRegisterClass *ArgRC; 2697 LLT ArgTy; 2698 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2699 2700 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2701 return false; // TODO: Handle these 2702 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2703 } 2704 2705 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2706 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2707 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2708 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2709 return false; 2710 2711 MI.eraseFromParent(); 2712 return true; 2713 } 2714 2715 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2716 MachineRegisterInfo &MRI, 2717 MachineIRBuilder &B) const { 2718 Register Dst = MI.getOperand(0).getReg(); 2719 LLT DstTy = MRI.getType(Dst); 2720 LLT S16 = LLT::scalar(16); 2721 LLT S32 = LLT::scalar(32); 2722 LLT S64 = LLT::scalar(64); 2723 2724 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2725 return true; 2726 2727 if (DstTy == S16) 2728 return legalizeFDIV16(MI, MRI, B); 2729 if (DstTy == S32) 2730 return legalizeFDIV32(MI, MRI, B); 2731 if (DstTy == S64) 2732 return legalizeFDIV64(MI, MRI, B); 2733 2734 return false; 2735 } 2736 2737 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2738 Register DstReg, 2739 Register X, 2740 Register Y, 2741 bool IsDiv) const { 2742 const LLT S1 = LLT::scalar(1); 2743 const LLT S32 = LLT::scalar(32); 2744 2745 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2746 // algorithm used here. 2747 2748 // Initial estimate of inv(y). 2749 auto FloatY = B.buildUITOFP(S32, Y); 2750 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2751 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2752 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2753 auto Z = B.buildFPTOUI(S32, ScaledY); 2754 2755 // One round of UNR. 2756 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2757 auto NegYZ = B.buildMul(S32, NegY, Z); 2758 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2759 2760 // Quotient/remainder estimate. 2761 auto Q = B.buildUMulH(S32, X, Z); 2762 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2763 2764 // First quotient/remainder refinement. 2765 auto One = B.buildConstant(S32, 1); 2766 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2767 if (IsDiv) 2768 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2769 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2770 2771 // Second quotient/remainder refinement. 
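  // After the single Newton-Raphson step above the quotient estimate may still
  // be slightly low (by up to two), which is why exactly two rounds of
  // conditional correction are enough for an exact result.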
2772 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2773 if (IsDiv) 2774 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2775 else 2776 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2777 } 2778 2779 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2780 MachineRegisterInfo &MRI, 2781 MachineIRBuilder &B) const { 2782 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2783 Register DstReg = MI.getOperand(0).getReg(); 2784 Register Num = MI.getOperand(1).getReg(); 2785 Register Den = MI.getOperand(2).getReg(); 2786 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2787 MI.eraseFromParent(); 2788 return true; 2789 } 2790 2791 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2792 // 2793 // Return lo, hi of result 2794 // 2795 // %cvt.lo = G_UITOFP Val.lo 2796 // %cvt.hi = G_UITOFP Val.hi 2797 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2798 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2799 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2800 // %mul2 = G_FMUL %mul1, 2**(-32) 2801 // %trunc = G_INTRINSIC_TRUNC %mul2 2802 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2803 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2804 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2805 Register Val) { 2806 const LLT S32 = LLT::scalar(32); 2807 auto Unmerge = B.buildUnmerge(S32, Val); 2808 2809 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2810 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2811 2812 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2813 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2814 2815 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2816 auto Mul1 = 2817 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2818 2819 // 2**(-32) 2820 auto Mul2 = 2821 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2822 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2823 2824 // -(2**32) 2825 auto Mad2 = B.buildFMAD(S32, Trunc, 2826 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2827 2828 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2829 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2830 2831 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2832 } 2833 2834 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2835 Register DstReg, 2836 Register Numer, 2837 Register Denom, 2838 bool IsDiv) const { 2839 const LLT S32 = LLT::scalar(32); 2840 const LLT S64 = LLT::scalar(64); 2841 const LLT S1 = LLT::scalar(1); 2842 Register RcpLo, RcpHi; 2843 2844 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2845 2846 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2847 2848 auto Zero64 = B.buildConstant(S64, 0); 2849 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2850 2851 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2852 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2853 2854 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2855 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2856 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2857 2858 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2859 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2860 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2861 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2862 2863 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2864 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2865 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2866 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2867 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2868 2869 auto Zero32 = 
B.buildConstant(S32, 0); 2870 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2871 auto Add2_HiC = 2872 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2873 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2874 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2875 2876 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2877 Register NumerLo = UnmergeNumer.getReg(0); 2878 Register NumerHi = UnmergeNumer.getReg(1); 2879 2880 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2881 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2882 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2883 Register Mul3_Lo = UnmergeMul3.getReg(0); 2884 Register Mul3_Hi = UnmergeMul3.getReg(1); 2885 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2886 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2887 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2888 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2889 2890 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2891 Register DenomLo = UnmergeDenom.getReg(0); 2892 Register DenomHi = UnmergeDenom.getReg(1); 2893 2894 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2895 auto C1 = B.buildSExt(S32, CmpHi); 2896 2897 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2898 auto C2 = B.buildSExt(S32, CmpLo); 2899 2900 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2901 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2902 2903 // TODO: Here and below portions of the code can be enclosed into if/endif. 2904 // Currently control flow is unconditional and we have 4 selects after 2905 // potential endif to substitute PHIs. 2906 2907 // if C3 != 0 ... 2908 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2909 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2910 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2911 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2912 2913 auto One64 = B.buildConstant(S64, 1); 2914 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2915 2916 auto C4 = 2917 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2918 auto C5 = 2919 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2920 auto C6 = B.buildSelect( 2921 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2922 2923 // if (C6 != 0) 2924 auto Add4 = B.buildAdd(S64, Add3, One64); 2925 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2926 2927 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2928 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2929 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2930 2931 // endif C6 2932 // endif C3 2933 2934 if (IsDiv) { 2935 auto Sel1 = B.buildSelect( 2936 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2937 B.buildSelect(DstReg, 2938 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2939 } else { 2940 auto Sel2 = B.buildSelect( 2941 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2942 B.buildSelect(DstReg, 2943 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2944 } 2945 } 2946 2947 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2948 MachineRegisterInfo &MRI, 2949 MachineIRBuilder &B) const { 2950 const LLT S64 = LLT::scalar(64); 2951 const LLT S32 = LLT::scalar(32); 2952 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2953 Register DstReg = MI.getOperand(0).getReg(); 2954 Register Num 
= MI.getOperand(1).getReg(); 2955 Register Den = MI.getOperand(2).getReg(); 2956 LLT Ty = MRI.getType(DstReg); 2957 2958 if (Ty == S32) 2959 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2960 else if (Ty == S64) 2961 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2962 else 2963 return false; 2964 2965 MI.eraseFromParent(); 2966 return true; 2967 2968 } 2969 2970 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2971 MachineRegisterInfo &MRI, 2972 MachineIRBuilder &B) const { 2973 const LLT S64 = LLT::scalar(64); 2974 const LLT S32 = LLT::scalar(32); 2975 2976 Register DstReg = MI.getOperand(0).getReg(); 2977 const LLT Ty = MRI.getType(DstReg); 2978 if (Ty != S32 && Ty != S64) 2979 return false; 2980 2981 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2982 2983 Register LHS = MI.getOperand(1).getReg(); 2984 Register RHS = MI.getOperand(2).getReg(); 2985 2986 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2987 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2988 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2989 2990 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2991 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2992 2993 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2994 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2995 2996 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2997 if (Ty == S32) 2998 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2999 else 3000 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3001 3002 Register Sign; 3003 if (IsDiv) 3004 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3005 else 3006 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3007 3008 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3009 B.buildSub(DstReg, UDivRem, Sign); 3010 3011 MI.eraseFromParent(); 3012 return true; 3013 } 3014 3015 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3016 MachineRegisterInfo &MRI, 3017 MachineIRBuilder &B) const { 3018 Register Res = MI.getOperand(0).getReg(); 3019 Register LHS = MI.getOperand(1).getReg(); 3020 Register RHS = MI.getOperand(2).getReg(); 3021 3022 uint16_t Flags = MI.getFlags(); 3023 3024 LLT ResTy = MRI.getType(Res); 3025 LLT S32 = LLT::scalar(32); 3026 LLT S64 = LLT::scalar(64); 3027 3028 const MachineFunction &MF = B.getMF(); 3029 bool Unsafe = 3030 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 3031 3032 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 3033 return false; 3034 3035 if (!Unsafe && ResTy == S32 && 3036 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 3037 return false; 3038 3039 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3040 // 1 / x -> RCP(x) 3041 if (CLHS->isExactlyValue(1.0)) { 3042 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3043 .addUse(RHS) 3044 .setMIFlags(Flags); 3045 3046 MI.eraseFromParent(); 3047 return true; 3048 } 3049 3050 // -1 / x -> RCP( FNEG(x) ) 3051 if (CLHS->isExactlyValue(-1.0)) { 3052 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3053 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3054 .addUse(FNeg.getReg(0)) 3055 .setMIFlags(Flags); 3056 3057 MI.eraseFromParent(); 3058 return true; 3059 } 3060 } 3061 3062 // x / y -> x * (1.0 / y) 3063 if (Unsafe) { 3064 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3065 .addUse(RHS) 3066 .setMIFlags(Flags); 3067 B.buildFMul(Res, LHS, RCP, Flags); 3068 3069 MI.eraseFromParent(); 3070 return true; 3071 } 3072 3073 return false; 3074 } 3075 3076 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3077 MachineRegisterInfo &MRI, 3078 MachineIRBuilder &B) const { 3079 Register Res = MI.getOperand(0).getReg(); 3080 Register LHS = MI.getOperand(1).getReg(); 3081 Register RHS = MI.getOperand(2).getReg(); 3082 3083 uint16_t Flags = MI.getFlags(); 3084 3085 LLT S16 = LLT::scalar(16); 3086 LLT S32 = LLT::scalar(32); 3087 3088 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3089 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3090 3091 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3092 .addUse(RHSExt.getReg(0)) 3093 .setMIFlags(Flags); 3094 3095 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3096 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3097 3098 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3099 .addUse(RDst.getReg(0)) 3100 .addUse(RHS) 3101 .addUse(LHS) 3102 .setMIFlags(Flags); 3103 3104 MI.eraseFromParent(); 3105 return true; 3106 } 3107 3108 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3109 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3110 static void toggleSPDenormMode(bool Enable, 3111 MachineIRBuilder &B, 3112 const GCNSubtarget &ST, 3113 AMDGPU::SIModeRegisterDefaults Mode) { 3114 // Set SP denorm mode to this value. 3115 unsigned SPDenormMode = 3116 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3117 3118 if (ST.hasDenormModeInst()) { 3119 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3120 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3121 3122 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3123 B.buildInstr(AMDGPU::S_DENORM_MODE) 3124 .addImm(NewDenormModeValue); 3125 3126 } else { 3127 // Select FP32 bit field in mode register. 3128 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3129 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3130 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3131 3132 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3133 .addImm(SPDenormMode) 3134 .addImm(SPDenormModeBitField); 3135 } 3136 } 3137 3138 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3139 MachineRegisterInfo &MRI, 3140 MachineIRBuilder &B) const { 3141 Register Res = MI.getOperand(0).getReg(); 3142 Register LHS = MI.getOperand(1).getReg(); 3143 Register RHS = MI.getOperand(2).getReg(); 3144 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3145 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3146 3147 uint16_t Flags = MI.getFlags(); 3148 3149 LLT S32 = LLT::scalar(32); 3150 LLT S1 = LLT::scalar(1); 3151 3152 auto One = B.buildFConstant(S32, 1.0f); 3153 3154 auto DenominatorScaled = 3155 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3156 .addUse(LHS) 3157 .addUse(RHS) 3158 .addImm(0) 3159 .setMIFlags(Flags); 3160 auto NumeratorScaled = 3161 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3162 .addUse(LHS) 3163 .addUse(RHS) 3164 .addImm(1) 3165 .setMIFlags(Flags); 3166 3167 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3168 .addUse(DenominatorScaled.getReg(0)) 3169 .setMIFlags(Flags); 3170 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3171 3172 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3173 // aren't modeled as reading it. 
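  // If FP32 denormals are flushed by default, temporarily enable them around
  // the FMA refinement sequence below: the intermediate values of the
  // Newton-Raphson iteration can be denormal, and flushing them would break
  // the final div_fmas/div_fixup result.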
3174 if (!Mode.allFP32Denormals()) 3175 toggleSPDenormMode(true, B, ST, Mode); 3176 3177 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3178 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3179 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3180 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3181 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3182 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3183 3184 if (!Mode.allFP32Denormals()) 3185 toggleSPDenormMode(false, B, ST, Mode); 3186 3187 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3188 .addUse(Fma4.getReg(0)) 3189 .addUse(Fma1.getReg(0)) 3190 .addUse(Fma3.getReg(0)) 3191 .addUse(NumeratorScaled.getReg(1)) 3192 .setMIFlags(Flags); 3193 3194 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3195 .addUse(Fmas.getReg(0)) 3196 .addUse(RHS) 3197 .addUse(LHS) 3198 .setMIFlags(Flags); 3199 3200 MI.eraseFromParent(); 3201 return true; 3202 } 3203 3204 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3205 MachineRegisterInfo &MRI, 3206 MachineIRBuilder &B) const { 3207 Register Res = MI.getOperand(0).getReg(); 3208 Register LHS = MI.getOperand(1).getReg(); 3209 Register RHS = MI.getOperand(2).getReg(); 3210 3211 uint16_t Flags = MI.getFlags(); 3212 3213 LLT S64 = LLT::scalar(64); 3214 LLT S1 = LLT::scalar(1); 3215 3216 auto One = B.buildFConstant(S64, 1.0); 3217 3218 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3219 .addUse(LHS) 3220 .addUse(RHS) 3221 .addImm(0) 3222 .setMIFlags(Flags); 3223 3224 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3225 3226 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3227 .addUse(DivScale0.getReg(0)) 3228 .setMIFlags(Flags); 3229 3230 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3231 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3232 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3233 3234 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3235 .addUse(LHS) 3236 .addUse(RHS) 3237 .addImm(1) 3238 .setMIFlags(Flags); 3239 3240 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3241 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3242 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3243 3244 Register Scale; 3245 if (!ST.hasUsableDivScaleConditionOutput()) { 3246 // Workaround a hardware bug on SI where the condition output from div_scale 3247 // is not usable. 
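    // Reconstruct the missing flag by hand: compare the high 32 bits of each
    // div_scale result with the corresponding original operand and XOR the two
    // comparisons; the result stands in for the condition output that
    // div_fmas expects.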
3248 3249 LLT S32 = LLT::scalar(32); 3250 3251 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3252 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3253 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3254 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3255 3256 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3257 Scale1Unmerge.getReg(1)); 3258 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3259 Scale0Unmerge.getReg(1)); 3260 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3261 } else { 3262 Scale = DivScale1.getReg(1); 3263 } 3264 3265 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3266 .addUse(Fma4.getReg(0)) 3267 .addUse(Fma3.getReg(0)) 3268 .addUse(Mul.getReg(0)) 3269 .addUse(Scale) 3270 .setMIFlags(Flags); 3271 3272 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3273 .addUse(Fmas.getReg(0)) 3274 .addUse(RHS) 3275 .addUse(LHS) 3276 .setMIFlags(Flags); 3277 3278 MI.eraseFromParent(); 3279 return true; 3280 } 3281 3282 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3283 MachineRegisterInfo &MRI, 3284 MachineIRBuilder &B) const { 3285 Register Res = MI.getOperand(0).getReg(); 3286 Register LHS = MI.getOperand(2).getReg(); 3287 Register RHS = MI.getOperand(3).getReg(); 3288 uint16_t Flags = MI.getFlags(); 3289 3290 LLT S32 = LLT::scalar(32); 3291 LLT S1 = LLT::scalar(1); 3292 3293 auto Abs = B.buildFAbs(S32, RHS, Flags); 3294 const APFloat C0Val(1.0f); 3295 3296 auto C0 = B.buildConstant(S32, 0x6f800000); 3297 auto C1 = B.buildConstant(S32, 0x2f800000); 3298 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3299 3300 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3301 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3302 3303 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3304 3305 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3306 .addUse(Mul0.getReg(0)) 3307 .setMIFlags(Flags); 3308 3309 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3310 3311 B.buildFMul(Res, Sel, Mul1, Flags); 3312 3313 MI.eraseFromParent(); 3314 return true; 3315 } 3316 3317 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3318 // FIXME: Why do we handle this one but not other removed instructions? 3319 // 3320 // Reciprocal square root. The clamp prevents infinite results, clamping 3321 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3322 // +-max_float. 3323 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3324 MachineRegisterInfo &MRI, 3325 MachineIRBuilder &B) const { 3326 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3327 return true; 3328 3329 Register Dst = MI.getOperand(0).getReg(); 3330 Register Src = MI.getOperand(2).getReg(); 3331 auto Flags = MI.getFlags(); 3332 3333 LLT Ty = MRI.getType(Dst); 3334 3335 const fltSemantics *FltSemantics; 3336 if (Ty == LLT::scalar(32)) 3337 FltSemantics = &APFloat::IEEEsingle(); 3338 else if (Ty == LLT::scalar(64)) 3339 FltSemantics = &APFloat::IEEEdouble(); 3340 else 3341 return false; 3342 3343 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3344 .addUse(Src) 3345 .setMIFlags(Flags); 3346 3347 // We don't need to concern ourselves with the snan handling difference, since 3348 // the rsq quieted (or not) so use the one which will directly select. 
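  // Clamp the rsq result into [-max_float, +max_float] using the min/max
  // flavour that matches the current IEEE mode, mirroring what the removed
  // V_RSQ_CLAMP instruction did.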
3349 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3350 const bool UseIEEE = MFI->getMode().IEEE; 3351 3352 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3353 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3354 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3355 3356 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3357 3358 if (UseIEEE) 3359 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3360 else 3361 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3362 MI.eraseFromParent(); 3363 return true; 3364 } 3365 3366 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3367 switch (IID) { 3368 case Intrinsic::amdgcn_ds_fadd: 3369 return AMDGPU::G_ATOMICRMW_FADD; 3370 case Intrinsic::amdgcn_ds_fmin: 3371 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3372 case Intrinsic::amdgcn_ds_fmax: 3373 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3374 default: 3375 llvm_unreachable("not a DS FP intrinsic"); 3376 } 3377 } 3378 3379 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3380 MachineInstr &MI, 3381 Intrinsic::ID IID) const { 3382 GISelChangeObserver &Observer = Helper.Observer; 3383 Observer.changingInstr(MI); 3384 3385 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3386 3387 // The remaining operands were used to set fields in the MemOperand on 3388 // construction. 3389 for (int I = 6; I > 3; --I) 3390 MI.RemoveOperand(I); 3391 3392 MI.RemoveOperand(1); // Remove the intrinsic ID. 3393 Observer.changedInstr(MI); 3394 return true; 3395 } 3396 3397 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3398 MachineRegisterInfo &MRI, 3399 MachineIRBuilder &B) const { 3400 uint64_t Offset = 3401 ST.getTargetLowering()->getImplicitParameterOffset( 3402 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3403 LLT DstTy = MRI.getType(DstReg); 3404 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3405 3406 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3407 if (!loadInputValue(KernargPtrReg, B, 3408 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3409 return false; 3410 3411 // FIXME: This should be nuw 3412 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3413 return true; 3414 } 3415 3416 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3417 MachineRegisterInfo &MRI, 3418 MachineIRBuilder &B) const { 3419 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3420 if (!MFI->isEntryFunction()) { 3421 return legalizePreloadedArgIntrin(MI, MRI, B, 3422 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3423 } 3424 3425 Register DstReg = MI.getOperand(0).getReg(); 3426 if (!getImplicitArgPtr(DstReg, MRI, B)) 3427 return false; 3428 3429 MI.eraseFromParent(); 3430 return true; 3431 } 3432 3433 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3434 MachineRegisterInfo &MRI, 3435 MachineIRBuilder &B, 3436 unsigned AddrSpace) const { 3437 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3438 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3439 Register Hi32 = Unmerge.getReg(1); 3440 3441 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3442 MI.eraseFromParent(); 3443 return true; 3444 } 3445 3446 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3447 // offset (the offset that is included in bounds checking and swizzling, to be 3448 // split between the instruction's voffset and immoffset fields) 
and soffset 3449 // (the offset that is excluded from bounds checking and swizzling, to go in 3450 // the instruction's soffset field). This function takes the first kind of 3451 // offset and figures out how to split it between voffset and immoffset. 3452 std::tuple<Register, unsigned, unsigned> 3453 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3454 Register OrigOffset) const { 3455 const unsigned MaxImm = 4095; 3456 Register BaseReg; 3457 unsigned TotalConstOffset; 3458 MachineInstr *OffsetDef; 3459 const LLT S32 = LLT::scalar(32); 3460 3461 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3462 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3463 3464 unsigned ImmOffset = TotalConstOffset; 3465 3466 // If the immediate value is too big for the immoffset field, put the value 3467 // and -4096 into the immoffset field so that the value that is copied/added 3468 // for the voffset field is a multiple of 4096, and it stands more chance 3469 // of being CSEd with the copy/add for another similar load/store. 3470 // However, do not do that rounding down to a multiple of 4096 if that is a 3471 // negative number, as it appears to be illegal to have a negative offset 3472 // in the vgpr, even if adding the immediate offset makes it positive. 3473 unsigned Overflow = ImmOffset & ~MaxImm; 3474 ImmOffset -= Overflow; 3475 if ((int32_t)Overflow < 0) { 3476 Overflow += ImmOffset; 3477 ImmOffset = 0; 3478 } 3479 3480 if (Overflow != 0) { 3481 if (!BaseReg) { 3482 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3483 } else { 3484 auto OverflowVal = B.buildConstant(S32, Overflow); 3485 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3486 } 3487 } 3488 3489 if (!BaseReg) 3490 BaseReg = B.buildConstant(S32, 0).getReg(0); 3491 3492 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3493 } 3494 3495 /// Handle register layout difference for f16 images for some subtargets. 3496 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3497 MachineRegisterInfo &MRI, 3498 Register Reg) const { 3499 if (!ST.hasUnpackedD16VMem()) 3500 return Reg; 3501 3502 const LLT S16 = LLT::scalar(16); 3503 const LLT S32 = LLT::scalar(32); 3504 LLT StoreVT = MRI.getType(Reg); 3505 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3506 3507 auto Unmerge = B.buildUnmerge(S16, Reg); 3508 3509 SmallVector<Register, 4> WideRegs; 3510 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3511 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3512 3513 int NumElts = StoreVT.getNumElements(); 3514 3515 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3516 } 3517 3518 Register AMDGPULegalizerInfo::fixStoreSourceType( 3519 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3520 MachineRegisterInfo *MRI = B.getMRI(); 3521 LLT Ty = MRI->getType(VData); 3522 3523 const LLT S16 = LLT::scalar(16); 3524 3525 // Fixup illegal register types for i8 stores. 
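// A sub-dword (s8/s16) source is expected in the low bits of a 32-bit
// register; the byte/short buffer store variants should only consume those
// low bits, so an any-extend to s32 is sufficient.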
3526 if (Ty == LLT::scalar(8) || Ty == S16) { 3527 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3528 return AnyExt; 3529 } 3530 3531 if (Ty.isVector()) { 3532 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3533 if (IsFormat) 3534 return handleD16VData(B, *MRI, VData); 3535 } 3536 } 3537 3538 return VData; 3539 } 3540 3541 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3542 MachineRegisterInfo &MRI, 3543 MachineIRBuilder &B, 3544 bool IsTyped, 3545 bool IsFormat) const { 3546 Register VData = MI.getOperand(1).getReg(); 3547 LLT Ty = MRI.getType(VData); 3548 LLT EltTy = Ty.getScalarType(); 3549 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3550 const LLT S32 = LLT::scalar(32); 3551 3552 VData = fixStoreSourceType(B, VData, IsFormat); 3553 Register RSrc = MI.getOperand(2).getReg(); 3554 3555 MachineMemOperand *MMO = *MI.memoperands_begin(); 3556 const int MemSize = MMO->getSize(); 3557 3558 unsigned ImmOffset; 3559 unsigned TotalOffset; 3560 3561 // The typed intrinsics add an immediate after the registers. 3562 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3563 3564 // The struct intrinsic variants add one additional operand over raw. 3565 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3566 Register VIndex; 3567 int OpOffset = 0; 3568 if (HasVIndex) { 3569 VIndex = MI.getOperand(3).getReg(); 3570 OpOffset = 1; 3571 } 3572 3573 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3574 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3575 3576 unsigned Format = 0; 3577 if (IsTyped) { 3578 Format = MI.getOperand(5 + OpOffset).getImm(); 3579 ++OpOffset; 3580 } 3581 3582 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3583 3584 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3585 if (TotalOffset != 0) 3586 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3587 3588 unsigned Opc; 3589 if (IsTyped) { 3590 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3591 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3592 } else if (IsFormat) { 3593 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3594 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3595 } else { 3596 switch (MemSize) { 3597 case 1: 3598 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3599 break; 3600 case 2: 3601 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3602 break; 3603 default: 3604 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3605 break; 3606 } 3607 } 3608 3609 if (!VIndex) 3610 VIndex = B.buildConstant(S32, 0).getReg(0); 3611 3612 auto MIB = B.buildInstr(Opc) 3613 .addUse(VData) // vdata 3614 .addUse(RSrc) // rsrc 3615 .addUse(VIndex) // vindex 3616 .addUse(VOffset) // voffset 3617 .addUse(SOffset) // soffset 3618 .addImm(ImmOffset); // offset(imm) 3619 3620 if (IsTyped) 3621 MIB.addImm(Format); 3622 3623 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3624 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3625 .addMemOperand(MMO); 3626 3627 MI.eraseFromParent(); 3628 return true; 3629 } 3630 3631 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3632 MachineRegisterInfo &MRI, 3633 MachineIRBuilder &B, 3634 bool IsFormat, 3635 bool IsTyped) const { 3636 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
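// This mirrors legalizeBufferStore: split the offset into voffset/immoffset,
// choose a raw/struct, typed, format, or plain buffer pseudo from the memory
// size and D16-ness, and widen the destination for sub-dword or unpacked-D16
// results so they can be truncated or repacked after the load.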
3637 MachineMemOperand *MMO = *MI.memoperands_begin();
3638 const int MemSize = MMO->getSize();
3639 const LLT S32 = LLT::scalar(32);
3640
3641 Register Dst = MI.getOperand(0).getReg();
3642 Register RSrc = MI.getOperand(2).getReg();
3643
3644 // The typed intrinsics add an immediate after the registers.
3645 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3646
3647 // The struct intrinsic variants add one additional operand over raw.
3648 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3649 Register VIndex;
3650 int OpOffset = 0;
3651 if (HasVIndex) {
3652 VIndex = MI.getOperand(3).getReg();
3653 OpOffset = 1;
3654 }
3655
3656 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3657 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3658
3659 unsigned Format = 0;
3660 if (IsTyped) {
3661 Format = MI.getOperand(5 + OpOffset).getImm();
3662 ++OpOffset;
3663 }
3664
3665 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3666 unsigned ImmOffset;
3667 unsigned TotalOffset;
3668
3669 LLT Ty = MRI.getType(Dst);
3670 LLT EltTy = Ty.getScalarType();
3671 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3672 const bool Unpacked = ST.hasUnpackedD16VMem();
3673
3674 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3675 if (TotalOffset != 0)
3676 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3677
3678 unsigned Opc;
3679
3680 if (IsTyped) {
3681 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3682 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3683 } else if (IsFormat) {
3684 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3685 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3686 } else {
3687 switch (MemSize) {
3688 case 1:
3689 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3690 break;
3691 case 2:
3692 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3693 break;
3694 default:
3695 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3696 break;
3697 }
3698 }
3699
3700 Register LoadDstReg;
3701
3702 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3703 LLT UnpackedTy = Ty.changeElementSize(32);
3704
3705 if (IsExtLoad)
3706 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3707 else if (Unpacked && IsD16 && Ty.isVector())
3708 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3709 else
3710 LoadDstReg = Dst;
3711
3712 if (!VIndex)
3713 VIndex = B.buildConstant(S32, 0).getReg(0);
3714
3715 auto MIB = B.buildInstr(Opc)
3716 .addDef(LoadDstReg) // vdata
3717 .addUse(RSrc) // rsrc
3718 .addUse(VIndex) // vindex
3719 .addUse(VOffset) // voffset
3720 .addUse(SOffset) // soffset
3721 .addImm(ImmOffset); // offset(imm)
3722
3723 if (IsTyped)
3724 MIB.addImm(Format);
3725
3726 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3727 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3728 .addMemOperand(MMO);
3729
3730 if (LoadDstReg != Dst) {
3731 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3732
3733 // The result of an extending load was widened; truncate it back to the original type.
3734 if (IsExtLoad) 3735 B.buildTrunc(Dst, LoadDstReg); 3736 else { 3737 // Repack to original 16-bit vector result 3738 // FIXME: G_TRUNC should work, but legalization currently fails 3739 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3740 SmallVector<Register, 4> Repack; 3741 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3742 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3743 B.buildMerge(Dst, Repack); 3744 } 3745 } 3746 3747 MI.eraseFromParent(); 3748 return true; 3749 } 3750 3751 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3752 MachineIRBuilder &B, 3753 bool IsInc) const { 3754 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3755 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3756 B.buildInstr(Opc) 3757 .addDef(MI.getOperand(0).getReg()) 3758 .addUse(MI.getOperand(2).getReg()) 3759 .addUse(MI.getOperand(3).getReg()) 3760 .cloneMemRefs(MI); 3761 MI.eraseFromParent(); 3762 return true; 3763 } 3764 3765 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3766 switch (IntrID) { 3767 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3768 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3769 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3770 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3771 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3772 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3773 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3774 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3775 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3776 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3777 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3778 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3779 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3780 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3781 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3782 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3783 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3784 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3785 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3786 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3787 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3788 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3789 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3790 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3791 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3792 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3793 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3794 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3795 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3796 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3797 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3798 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3799 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3800 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3801 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3802 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3803 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3804 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3805 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3806 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3807 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3808 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3809 default: 3810 llvm_unreachable("unhandled atomic opcode"); 3811 } 3812 } 3813 3814 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3815 MachineIRBuilder &B, 3816 Intrinsic::ID IID) const { 3817 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3818 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3819 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3820 3821 Register Dst; 3822 3823 int OpOffset = 0; 3824 if (HasReturn) { 3825 // A few FP atomics do not support return values. 3826 Dst = MI.getOperand(0).getReg(); 3827 } else { 3828 OpOffset = -1; 3829 } 3830 3831 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3832 Register CmpVal; 3833 3834 if (IsCmpSwap) { 3835 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3836 ++OpOffset; 3837 } 3838 3839 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3840 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3841 3842 // The struct intrinsic variants add one additional operand over raw. 3843 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3844 Register VIndex; 3845 if (HasVIndex) { 3846 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3847 ++OpOffset; 3848 } 3849 3850 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3851 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3852 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3853 3854 MachineMemOperand *MMO = *MI.memoperands_begin(); 3855 3856 unsigned ImmOffset; 3857 unsigned TotalOffset; 3858 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3859 if (TotalOffset != 0) 3860 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3861 3862 if (!VIndex) 3863 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3864 3865 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3866 3867 if (HasReturn) 3868 MIB.addDef(Dst); 3869 3870 MIB.addUse(VData); // vdata 3871 3872 if (IsCmpSwap) 3873 MIB.addReg(CmpVal); 3874 3875 MIB.addUse(RSrc) // rsrc 3876 .addUse(VIndex) // vindex 3877 .addUse(VOffset) // voffset 3878 .addUse(SOffset) // soffset 3879 .addImm(ImmOffset) // offset(imm) 3880 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3881 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3882 .addMemOperand(MMO); 3883 3884 MI.eraseFromParent(); 3885 return true; 3886 } 3887 3888 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3889 /// vector with s16 typed elements. 3890 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3891 SmallVectorImpl<Register> &PackedAddrs, 3892 int AddrIdx, int DimIdx, int EndIdx, 3893 int NumGradients) { 3894 const LLT S16 = LLT::scalar(16); 3895 const LLT V2S16 = LLT::vector(2, 16); 3896 3897 for (int I = AddrIdx; I < EndIdx; ++I) { 3898 MachineOperand &SrcOp = MI.getOperand(I); 3899 if (!SrcOp.isReg()) 3900 continue; // _L to _LZ may have eliminated this. 3901 3902 Register AddrReg = SrcOp.getReg(); 3903 3904 if (I < DimIdx) { 3905 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3906 PackedAddrs.push_back(AddrReg); 3907 } else { 3908 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3909 // derivatives dx/dh and dx/dv are packed with undef. 
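// For example, 2D gradients {dx/dh, dy/dh, dx/dv, dy/dv} pack pairwise into
// two v2s16 registers, while the 1D set {dx/dh, dx/dv} becomes two v2s16
// registers each padded with undef in the high half.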
3910 if (((I + 1) >= EndIdx) ||
3911 ((NumGradients / 2) % 2 == 1 &&
3912 (I == DimIdx + (NumGradients / 2) - 1 ||
3913 I == DimIdx + NumGradients - 1)) ||
3914 // Check for _L to _LZ optimization
3915 !MI.getOperand(I + 1).isReg()) {
3916 PackedAddrs.push_back(
3917 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3918 .getReg(0));
3919 } else {
3920 PackedAddrs.push_back(
3921 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3922 .getReg(0));
3923 ++I;
3924 }
3925 }
3926 }
3927 }
3928
3929 /// Convert from separate vaddr components to a single vector address register,
3930 /// and replace the remaining operands with $noreg.
3931 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3932 int DimIdx, int NumVAddrs) {
3933 const LLT S32 = LLT::scalar(32);
3934
3935 SmallVector<Register, 8> AddrRegs;
3936 for (int I = 0; I != NumVAddrs; ++I) {
3937 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3938 if (SrcOp.isReg()) {
3939 AddrRegs.push_back(SrcOp.getReg());
3940 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3941 }
3942 }
3943
3944 int NumAddrRegs = AddrRegs.size();
3945 if (NumAddrRegs != 1) {
3946 // Round up to 8 elements for v5-v7
3947 // FIXME: Missing intermediate sized register classes and instructions.
3948 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3949 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3950 auto Undef = B.buildUndef(S32);
3951 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3952 NumAddrRegs = RoundedNumRegs;
3953 }
3954
3955 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3956 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3957 }
3958
3959 for (int I = 1; I != NumVAddrs; ++I) {
3960 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3961 if (SrcOp.isReg())
3962 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3963 }
3964 }
3965
3966 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3967 ///
3968 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3969 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3970 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3971 /// registers.
3972 ///
3973 /// We don't want to directly select image instructions just yet, but also want
3974 /// to expose all register repacking to the legalizer/combiners. We also don't
3975 /// want a selected instruction entering RegBankSelect. In order to avoid
3976 /// defining a multitude of intermediate image instructions, directly hack on
3977 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3978 /// now-unnecessary arguments with $noreg.
3979 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3980 MachineInstr &MI, MachineIRBuilder &B,
3981 GISelChangeObserver &Observer,
3982 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3983
3984 const int NumDefs = MI.getNumExplicitDefs();
3985 bool IsTFE = NumDefs == 2;
3986 // We are only processing the operands of d16 image operations on subtargets
3987 // that use the unpacked register layout, or need to repack the TFE result.
3988
3989 // TODO: Do we need to guard against already legalized intrinsics?
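// Legalization proceeds in stages: look up the base opcode info, pack 16-bit
// address and gradient operands into dwords, fold an _L opcode with a
// constant-zero LOD into _LZ (and simplify a known-zero _mip LOD), then widen
// and repack the result register for D16 and TFE returns.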
3990 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3991 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3992 3993 MachineRegisterInfo *MRI = B.getMRI(); 3994 const LLT S32 = LLT::scalar(32); 3995 const LLT S16 = LLT::scalar(16); 3996 const LLT V2S16 = LLT::vector(2, 16); 3997 3998 // Index of first address argument 3999 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 4000 4001 int NumVAddrs, NumGradients; 4002 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 4003 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 4004 getDMaskIdx(BaseOpcode, NumDefs); 4005 unsigned DMask = 0; 4006 4007 // Check for 16 bit addresses and pack if true. 4008 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 4009 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 4010 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 4011 const bool IsG16 = GradTy == S16; 4012 const bool IsA16 = AddrTy == S16; 4013 4014 int DMaskLanes = 0; 4015 if (!BaseOpcode->Atomic) { 4016 DMask = MI.getOperand(DMaskIdx).getImm(); 4017 if (BaseOpcode->Gather4) { 4018 DMaskLanes = 4; 4019 } else if (DMask != 0) { 4020 DMaskLanes = countPopulation(DMask); 4021 } else if (!IsTFE && !BaseOpcode->Store) { 4022 // If dmask is 0, this is a no-op load. This can be eliminated. 4023 B.buildUndef(MI.getOperand(0)); 4024 MI.eraseFromParent(); 4025 return true; 4026 } 4027 } 4028 4029 Observer.changingInstr(MI); 4030 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4031 4032 unsigned NewOpcode = NumDefs == 0 ? 4033 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4034 4035 // Track that we legalized this 4036 MI.setDesc(B.getTII().get(NewOpcode)); 4037 4038 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4039 // dmask to be at least 1 otherwise the instruction will fail 4040 if (IsTFE && DMask == 0) { 4041 DMask = 0x1; 4042 DMaskLanes = 1; 4043 MI.getOperand(DMaskIdx).setImm(DMask); 4044 } 4045 4046 if (BaseOpcode->Atomic) { 4047 Register VData0 = MI.getOperand(2).getReg(); 4048 LLT Ty = MRI->getType(VData0); 4049 4050 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4051 if (Ty.isVector()) 4052 return false; 4053 4054 if (BaseOpcode->AtomicX2) { 4055 Register VData1 = MI.getOperand(3).getReg(); 4056 // The two values are packed in one register. 4057 LLT PackedTy = LLT::vector(2, Ty); 4058 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4059 MI.getOperand(2).setReg(Concat.getReg(0)); 4060 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4061 } 4062 } 4063 4064 int CorrectedNumVAddrs = NumVAddrs; 4065 4066 // Optimize _L to _LZ when _L is zero 4067 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4068 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 4069 const ConstantFP *ConstantLod; 4070 const int LodIdx = AddrIdx + NumVAddrs - 1; 4071 4072 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 4073 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4074 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4075 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 4076 LZMappingInfo->LZ, ImageDimIntr->Dim); 4077 4078 // The starting indexes should remain in the same place. 
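// Dropping the trailing LOD operand only reduces the vaddr count; AddrIdx and
// DimIdx are unchanged, so the packing code below keeps using the original
// operand indices.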
4079 --NumVAddrs; 4080 --CorrectedNumVAddrs; 4081 4082 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 4083 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 4084 MI.RemoveOperand(LodIdx); 4085 } 4086 } 4087 } 4088 4089 // Optimize _mip away, when 'lod' is zero 4090 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 4091 int64_t ConstantLod; 4092 const int LodIdx = AddrIdx + NumVAddrs - 1; 4093 4094 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 4095 if (ConstantLod == 0) { 4096 // TODO: Change intrinsic opcode and remove operand instead or replacing 4097 // it with 0, as the _L to _LZ handling is done above. 4098 MI.getOperand(LodIdx).ChangeToImmediate(0); 4099 --CorrectedNumVAddrs; 4100 } 4101 } 4102 } 4103 4104 // Rewrite the addressing register layout before doing anything else. 4105 if (IsA16 || IsG16) { 4106 if (IsA16) { 4107 // Target must support the feature and gradients need to be 16 bit too 4108 if (!ST.hasA16() || !IsG16) 4109 return false; 4110 } else if (!ST.hasG16()) 4111 return false; 4112 4113 if (NumVAddrs > 1) { 4114 SmallVector<Register, 4> PackedRegs; 4115 // Don't compress addresses for G16 4116 const int PackEndIdx = 4117 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4118 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4119 PackEndIdx, NumGradients); 4120 4121 if (!IsA16) { 4122 // Add uncompressed address 4123 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4124 int AddrReg = MI.getOperand(I).getReg(); 4125 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4126 PackedRegs.push_back(AddrReg); 4127 } 4128 } 4129 4130 // See also below in the non-a16 branch 4131 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4132 4133 if (!UseNSA && PackedRegs.size() > 1) { 4134 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4135 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4136 PackedRegs[0] = Concat.getReg(0); 4137 PackedRegs.resize(1); 4138 } 4139 4140 const int NumPacked = PackedRegs.size(); 4141 for (int I = 0; I != NumVAddrs; ++I) { 4142 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4143 if (!SrcOp.isReg()) { 4144 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4145 continue; 4146 } 4147 4148 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4149 4150 if (I < NumPacked) 4151 SrcOp.setReg(PackedRegs[I]); 4152 else 4153 SrcOp.setReg(AMDGPU::NoRegister); 4154 } 4155 } 4156 } else { 4157 // If the register allocator cannot place the address registers contiguously 4158 // without introducing moves, then using the non-sequential address encoding 4159 // is always preferable, since it saves VALU instructions and is usually a 4160 // wash in terms of code size or even better. 4161 // 4162 // However, we currently have no way of hinting to the register allocator 4163 // that MIMG addresses should be placed contiguously when it is possible to 4164 // do so, so force non-NSA for the common 2-address case as a heuristic. 4165 // 4166 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4167 // allocation when possible. 4168 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4169 4170 if (!UseNSA && NumVAddrs > 1) 4171 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4172 } 4173 4174 int Flags = 0; 4175 if (IsA16) 4176 Flags |= 1; 4177 if (IsG16) 4178 Flags |= 2; 4179 MI.addOperand(MachineOperand::CreateImm(Flags)); 4180 4181 if (BaseOpcode->Store) { // No TFE for stores? 
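// For stores, only the D16 source data may need repacking; the result
// widening and TFE handling later in this function are load-only.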
4182 // TODO: Handle dmask trim 4183 Register VData = MI.getOperand(1).getReg(); 4184 LLT Ty = MRI->getType(VData); 4185 if (!Ty.isVector() || Ty.getElementType() != S16) 4186 return true; 4187 4188 Register RepackedReg = handleD16VData(B, *MRI, VData); 4189 if (RepackedReg != VData) { 4190 MI.getOperand(1).setReg(RepackedReg); 4191 } 4192 4193 return true; 4194 } 4195 4196 Register DstReg = MI.getOperand(0).getReg(); 4197 LLT Ty = MRI->getType(DstReg); 4198 const LLT EltTy = Ty.getScalarType(); 4199 const bool IsD16 = Ty.getScalarType() == S16; 4200 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4201 4202 // Confirm that the return type is large enough for the dmask specified 4203 if (NumElts < DMaskLanes) 4204 return false; 4205 4206 if (NumElts > 4 || DMaskLanes > 4) 4207 return false; 4208 4209 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4210 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4211 4212 // The raw dword aligned data component of the load. The only legal cases 4213 // where this matters should be when using the packed D16 format, for 4214 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4215 LLT RoundedTy; 4216 4217 // S32 vector to to cover all data, plus TFE result element. 4218 LLT TFETy; 4219 4220 // Register type to use for each loaded component. Will be S32 or V2S16. 4221 LLT RegTy; 4222 4223 if (IsD16 && ST.hasUnpackedD16VMem()) { 4224 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4225 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4226 RegTy = S32; 4227 } else { 4228 unsigned EltSize = EltTy.getSizeInBits(); 4229 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4230 unsigned RoundedSize = 32 * RoundedElts; 4231 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4232 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4233 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4234 } 4235 4236 // The return type does not need adjustment. 4237 // TODO: Should we change s16 case to s32 or <2 x s16>? 4238 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4239 return true; 4240 4241 Register Dst1Reg; 4242 4243 // Insert after the instruction. 4244 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4245 4246 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4247 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4248 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4249 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4250 4251 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4252 4253 MI.getOperand(0).setReg(NewResultReg); 4254 4255 // In the IR, TFE is supposed to be used with a 2 element struct return 4256 // type. The intruction really returns these two values in one contiguous 4257 // register, with one additional dword beyond the loaded data. Rewrite the 4258 // return type to use a single register result. 4259 4260 if (IsTFE) { 4261 Dst1Reg = MI.getOperand(1).getReg(); 4262 if (MRI->getType(Dst1Reg) != S32) 4263 return false; 4264 4265 // TODO: Make sure the TFE operand bit is set. 4266 MI.RemoveOperand(1); 4267 4268 // Handle the easy case that requires no repack instructions. 4269 if (Ty == S32) { 4270 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4271 return true; 4272 } 4273 } 4274 4275 // Now figure out how to copy the new result register back into the old 4276 // result. 4277 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4278 4279 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4280 4281 if (ResultNumRegs == 1) { 4282 assert(!IsTFE); 4283 ResultRegs[0] = NewResultReg; 4284 } else { 4285 // We have to repack into a new vector of some kind. 4286 for (int I = 0; I != NumDataRegs; ++I) 4287 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4288 B.buildUnmerge(ResultRegs, NewResultReg); 4289 4290 // Drop the final TFE element to get the data part. The TFE result is 4291 // directly written to the right place already. 4292 if (IsTFE) 4293 ResultRegs.resize(NumDataRegs); 4294 } 4295 4296 // For an s16 scalar result, we form an s32 result with a truncate regardless 4297 // of packed vs. unpacked. 4298 if (IsD16 && !Ty.isVector()) { 4299 B.buildTrunc(DstReg, ResultRegs[0]); 4300 return true; 4301 } 4302 4303 // Avoid a build/concat_vector of 1 entry. 4304 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4305 B.buildBitcast(DstReg, ResultRegs[0]); 4306 return true; 4307 } 4308 4309 assert(Ty.isVector()); 4310 4311 if (IsD16) { 4312 // For packed D16 results with TFE enabled, all the data components are 4313 // S32. Cast back to the expected type. 4314 // 4315 // TODO: We don't really need to use load s32 elements. We would only need one 4316 // cast for the TFE result if a multiple of v2s16 was used. 4317 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4318 for (Register &Reg : ResultRegs) 4319 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4320 } else if (ST.hasUnpackedD16VMem()) { 4321 for (Register &Reg : ResultRegs) 4322 Reg = B.buildTrunc(S16, Reg).getReg(0); 4323 } 4324 } 4325 4326 auto padWithUndef = [&](LLT Ty, int NumElts) { 4327 if (NumElts == 0) 4328 return; 4329 Register Undef = B.buildUndef(Ty).getReg(0); 4330 for (int I = 0; I != NumElts; ++I) 4331 ResultRegs.push_back(Undef); 4332 }; 4333 4334 // Pad out any elements eliminated due to the dmask. 4335 LLT ResTy = MRI->getType(ResultRegs[0]); 4336 if (!ResTy.isVector()) { 4337 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4338 B.buildBuildVector(DstReg, ResultRegs); 4339 return true; 4340 } 4341 4342 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4343 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4344 4345 // Deal with the one annoying legal case. 4346 const LLT V3S16 = LLT::vector(3, 16); 4347 if (Ty == V3S16) { 4348 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4349 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4350 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4351 return true; 4352 } 4353 4354 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4355 B.buildConcatVectors(DstReg, ResultRegs); 4356 return true; 4357 } 4358 4359 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4360 LegalizerHelper &Helper, MachineInstr &MI) const { 4361 MachineIRBuilder &B = Helper.MIRBuilder; 4362 GISelChangeObserver &Observer = Helper.Observer; 4363 4364 Register Dst = MI.getOperand(0).getReg(); 4365 LLT Ty = B.getMRI()->getType(Dst); 4366 unsigned Size = Ty.getSizeInBits(); 4367 MachineFunction &MF = B.getMF(); 4368 4369 Observer.changingInstr(MI); 4370 4371 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4372 Ty = getBitcastRegisterType(Ty); 4373 Helper.bitcastDst(MI, Ty, 0); 4374 Dst = MI.getOperand(0).getReg(); 4375 B.setInsertPt(B.getMBB(), MI); 4376 } 4377 4378 // FIXME: We don't really need this intermediate instruction. The intrinsic 4379 // should be fixed to have a memory operand. Since it's readnone, we're not 4380 // allowed to add one. 
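// Rewrite the intrinsic into the target buffer-load pseudo and attach an
// invariant, dereferenceable MMO describing the constant memory it reads.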
4381 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4382 MI.RemoveOperand(1); // Remove intrinsic ID 4383 4384 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4385 // TODO: Should this use datalayout alignment? 4386 const unsigned MemSize = (Size + 7) / 8; 4387 const Align MemAlign(4); 4388 MachineMemOperand *MMO = MF.getMachineMemOperand( 4389 MachinePointerInfo(), 4390 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4391 MachineMemOperand::MOInvariant, 4392 MemSize, MemAlign); 4393 MI.addMemOperand(MF, MMO); 4394 4395 // There are no 96-bit result scalar loads, but widening to 128-bit should 4396 // always be legal. We may need to restore this to a 96-bit result if it turns 4397 // out this needs to be converted to a vector load during RegBankSelect. 4398 if (!isPowerOf2_32(Size)) { 4399 if (Ty.isVector()) 4400 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4401 else 4402 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4403 } 4404 4405 Observer.changedInstr(MI); 4406 return true; 4407 } 4408 4409 // TODO: Move to selection 4410 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4411 MachineRegisterInfo &MRI, 4412 MachineIRBuilder &B) const { 4413 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4414 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4415 !ST.isTrapHandlerEnabled()) { 4416 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4417 } else { 4418 // Pass queue pointer to trap handler as input, and insert trap instruction 4419 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4420 MachineRegisterInfo &MRI = *B.getMRI(); 4421 4422 Register LiveIn = 4423 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4424 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4425 return false; 4426 4427 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4428 B.buildCopy(SGPR01, LiveIn); 4429 B.buildInstr(AMDGPU::S_TRAP) 4430 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4431 .addReg(SGPR01, RegState::Implicit); 4432 } 4433 4434 MI.eraseFromParent(); 4435 return true; 4436 } 4437 4438 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4439 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4440 // Is non-HSA path or trap-handler disabled? then, report a warning 4441 // accordingly 4442 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4443 !ST.isTrapHandlerEnabled()) { 4444 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4445 "debugtrap handler not supported", 4446 MI.getDebugLoc(), DS_Warning); 4447 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4448 Ctx.diagnose(NoTrap); 4449 } else { 4450 // Insert debug-trap instruction 4451 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4452 } 4453 4454 MI.eraseFromParent(); 4455 return true; 4456 } 4457 4458 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4459 MachineInstr &MI) const { 4460 MachineIRBuilder &B = Helper.MIRBuilder; 4461 MachineRegisterInfo &MRI = *B.getMRI(); 4462 4463 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
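// For amdgcn.if/else/loop, verifyCFIntrinsic locates the G_BRCOND that uses
// the returned mask; it is replaced with SI_IF/SI_ELSE/SI_LOOP pseudos and the
// branch targets are swapped, as the cases below describe.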
4464 auto IntrID = MI.getIntrinsicID(); 4465 switch (IntrID) { 4466 case Intrinsic::amdgcn_if: 4467 case Intrinsic::amdgcn_else: { 4468 MachineInstr *Br = nullptr; 4469 MachineBasicBlock *UncondBrTarget = nullptr; 4470 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4471 const SIRegisterInfo *TRI 4472 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4473 4474 Register Def = MI.getOperand(1).getReg(); 4475 Register Use = MI.getOperand(3).getReg(); 4476 4477 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4478 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4479 if (IntrID == Intrinsic::amdgcn_if) { 4480 B.buildInstr(AMDGPU::SI_IF) 4481 .addDef(Def) 4482 .addUse(Use) 4483 .addMBB(UncondBrTarget); 4484 } else { 4485 B.buildInstr(AMDGPU::SI_ELSE) 4486 .addDef(Def) 4487 .addUse(Use) 4488 .addMBB(UncondBrTarget) 4489 .addImm(0); 4490 } 4491 4492 if (Br) { 4493 Br->getOperand(0).setMBB(CondBrTarget); 4494 } else { 4495 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4496 // since we're swapping branch targets it needs to be reinserted. 4497 // FIXME: IRTranslator should probably not do this 4498 B.buildBr(*CondBrTarget); 4499 } 4500 4501 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4502 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4503 MI.eraseFromParent(); 4504 BrCond->eraseFromParent(); 4505 return true; 4506 } 4507 4508 return false; 4509 } 4510 case Intrinsic::amdgcn_loop: { 4511 MachineInstr *Br = nullptr; 4512 MachineBasicBlock *UncondBrTarget = nullptr; 4513 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4514 const SIRegisterInfo *TRI 4515 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4516 4517 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4518 Register Reg = MI.getOperand(2).getReg(); 4519 4520 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4521 B.buildInstr(AMDGPU::SI_LOOP) 4522 .addUse(Reg) 4523 .addMBB(UncondBrTarget); 4524 4525 if (Br) 4526 Br->getOperand(0).setMBB(CondBrTarget); 4527 else 4528 B.buildBr(*CondBrTarget); 4529 4530 MI.eraseFromParent(); 4531 BrCond->eraseFromParent(); 4532 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4533 return true; 4534 } 4535 4536 return false; 4537 } 4538 case Intrinsic::amdgcn_kernarg_segment_ptr: 4539 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4540 // This only makes sense to call in a kernel, so just lower to null. 
4541 B.buildConstant(MI.getOperand(0).getReg(), 0); 4542 MI.eraseFromParent(); 4543 return true; 4544 } 4545 4546 return legalizePreloadedArgIntrin( 4547 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4548 case Intrinsic::amdgcn_implicitarg_ptr: 4549 return legalizeImplicitArgPtr(MI, MRI, B); 4550 case Intrinsic::amdgcn_workitem_id_x: 4551 return legalizePreloadedArgIntrin(MI, MRI, B, 4552 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4553 case Intrinsic::amdgcn_workitem_id_y: 4554 return legalizePreloadedArgIntrin(MI, MRI, B, 4555 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4556 case Intrinsic::amdgcn_workitem_id_z: 4557 return legalizePreloadedArgIntrin(MI, MRI, B, 4558 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4559 case Intrinsic::amdgcn_workgroup_id_x: 4560 return legalizePreloadedArgIntrin(MI, MRI, B, 4561 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4562 case Intrinsic::amdgcn_workgroup_id_y: 4563 return legalizePreloadedArgIntrin(MI, MRI, B, 4564 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4565 case Intrinsic::amdgcn_workgroup_id_z: 4566 return legalizePreloadedArgIntrin(MI, MRI, B, 4567 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4568 case Intrinsic::amdgcn_dispatch_ptr: 4569 return legalizePreloadedArgIntrin(MI, MRI, B, 4570 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4571 case Intrinsic::amdgcn_queue_ptr: 4572 return legalizePreloadedArgIntrin(MI, MRI, B, 4573 AMDGPUFunctionArgInfo::QUEUE_PTR); 4574 case Intrinsic::amdgcn_implicit_buffer_ptr: 4575 return legalizePreloadedArgIntrin( 4576 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4577 case Intrinsic::amdgcn_dispatch_id: 4578 return legalizePreloadedArgIntrin(MI, MRI, B, 4579 AMDGPUFunctionArgInfo::DISPATCH_ID); 4580 case Intrinsic::amdgcn_fdiv_fast: 4581 return legalizeFDIVFastIntrin(MI, MRI, B); 4582 case Intrinsic::amdgcn_is_shared: 4583 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4584 case Intrinsic::amdgcn_is_private: 4585 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4586 case Intrinsic::amdgcn_wavefrontsize: { 4587 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4588 MI.eraseFromParent(); 4589 return true; 4590 } 4591 case Intrinsic::amdgcn_s_buffer_load: 4592 return legalizeSBufferLoad(Helper, MI); 4593 case Intrinsic::amdgcn_raw_buffer_store: 4594 case Intrinsic::amdgcn_struct_buffer_store: 4595 return legalizeBufferStore(MI, MRI, B, false, false); 4596 case Intrinsic::amdgcn_raw_buffer_store_format: 4597 case Intrinsic::amdgcn_struct_buffer_store_format: 4598 return legalizeBufferStore(MI, MRI, B, false, true); 4599 case Intrinsic::amdgcn_raw_tbuffer_store: 4600 case Intrinsic::amdgcn_struct_tbuffer_store: 4601 return legalizeBufferStore(MI, MRI, B, true, true); 4602 case Intrinsic::amdgcn_raw_buffer_load: 4603 case Intrinsic::amdgcn_struct_buffer_load: 4604 return legalizeBufferLoad(MI, MRI, B, false, false); 4605 case Intrinsic::amdgcn_raw_buffer_load_format: 4606 case Intrinsic::amdgcn_struct_buffer_load_format: 4607 return legalizeBufferLoad(MI, MRI, B, true, false); 4608 case Intrinsic::amdgcn_raw_tbuffer_load: 4609 case Intrinsic::amdgcn_struct_tbuffer_load: 4610 return legalizeBufferLoad(MI, MRI, B, true, true); 4611 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4612 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4613 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4614 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4615 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4616 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4617 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4618 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4619 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4620 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4621 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4622 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4623 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4624 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4625 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4626 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4627 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4628 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4629 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4630 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4631 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4632 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4633 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4634 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4635 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4636 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4637 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4638 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4639 return legalizeBufferAtomic(MI, B, IntrID); 4640 case Intrinsic::amdgcn_atomic_inc: 4641 return legalizeAtomicIncDec(MI, B, true); 4642 case Intrinsic::amdgcn_atomic_dec: 4643 return legalizeAtomicIncDec(MI, B, false); 4644 case Intrinsic::trap: 4645 return legalizeTrapIntrinsic(MI, MRI, B); 4646 case Intrinsic::debugtrap: 4647 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4648 case Intrinsic::amdgcn_rsq_clamp: 4649 return legalizeRsqClampIntrinsic(MI, MRI, B); 4650 case Intrinsic::amdgcn_ds_fadd: 4651 case Intrinsic::amdgcn_ds_fmin: 4652 case Intrinsic::amdgcn_ds_fmax: 4653 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4654 default: { 4655 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4656 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4657 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4658 return true; 4659 } 4660 } 4661 4662 return true; 4663 } 4664