//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
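/// For example, <3 x s16> (48 bits) and <5 x s16> (80 bits) satisfy this
/// predicate, while <4 x s16>, <3 x s32>, and any vector of s1 do not.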
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 111 }; 112 } 113 114 // Increase the number of vector elements to reach the next multiple of 32-bit 115 // type. 116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 117 return [=](const LegalityQuery &Query) { 118 const LLT Ty = Query.Types[TypeIdx]; 119 120 const LLT EltTy = Ty.getElementType(); 121 const int Size = Ty.getSizeInBits(); 122 const int EltSize = EltTy.getSizeInBits(); 123 const int NextMul32 = (Size + 31) / 32; 124 125 assert(EltSize < 32); 126 127 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 128 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 129 }; 130 } 131 132 static LLT getBitcastRegisterType(const LLT Ty) { 133 const unsigned Size = Ty.getSizeInBits(); 134 135 LLT CoercedTy; 136 if (Size <= 32) { 137 // <2 x s8> -> s16 138 // <4 x s8> -> s32 139 return LLT::scalar(Size); 140 } 141 142 return LLT::scalarOrVector(Size / 32, 32); 143 } 144 145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 146 return [=](const LegalityQuery &Query) { 147 const LLT Ty = Query.Types[TypeIdx]; 148 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 149 }; 150 } 151 152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 unsigned Size = Ty.getSizeInBits(); 156 assert(Size % 32 == 0); 157 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 158 }; 159 } 160 161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 162 return [=](const LegalityQuery &Query) { 163 const LLT QueryTy = Query.Types[TypeIdx]; 164 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 165 }; 166 } 167 168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 169 return [=](const LegalityQuery &Query) { 170 const LLT QueryTy = Query.Types[TypeIdx]; 171 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 172 }; 
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
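/// For example, on subtargets without 96-bit loads, an s96 load with 128-bit
/// alignment may be widened to a 128-bit load, since the alignment implies the
/// extra bytes are dereferenceable.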
366 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 367 unsigned AlignInBits, unsigned AddrSpace, 368 unsigned Opcode) { 369 // We don't want to widen cases that are naturally legal. 370 if (isPowerOf2_32(SizeInBits)) 371 return false; 372 373 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 374 // end up widening these for a scalar load during RegBankSelect, since there 375 // aren't 96-bit scalar loads. 376 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 377 return false; 378 379 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 380 return false; 381 382 // A load is known dereferenceable up to the alignment, so it's legal to widen 383 // to it. 384 // 385 // TODO: Could check dereferenceable for less aligned cases. 386 unsigned RoundedSize = NextPowerOf2(SizeInBits); 387 if (AlignInBits < RoundedSize) 388 return false; 389 390 // Do not widen if it would introduce a slow unaligned load. 391 const SITargetLowering *TLI = ST.getTargetLowering(); 392 bool Fast = false; 393 return TLI->allowsMisalignedMemoryAccessesImpl( 394 RoundedSize, AddrSpace, Align(AlignInBits / 8), 395 MachineMemOperand::MOLoad, &Fast) && 396 Fast; 397 } 398 399 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 400 unsigned Opcode) { 401 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 402 return false; 403 404 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 405 Query.MMODescrs[0].AlignInBits, 406 Query.Types[1].getAddressSpace(), Opcode); 407 } 408 409 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 410 const GCNTargetMachine &TM) 411 : ST(ST_) { 412 using namespace TargetOpcode; 413 414 auto GetAddrSpacePtr = [&TM](unsigned AS) { 415 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 416 }; 417 418 const LLT S1 = LLT::scalar(1); 419 const LLT S16 = LLT::scalar(16); 420 const LLT S32 = LLT::scalar(32); 421 const LLT S64 = LLT::scalar(64); 422 const LLT S128 = LLT::scalar(128); 423 const LLT S256 = LLT::scalar(256); 424 const LLT S512 = LLT::scalar(512); 425 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 426 427 const LLT V2S16 = LLT::vector(2, 16); 428 const LLT V4S16 = LLT::vector(4, 16); 429 430 const LLT V2S32 = LLT::vector(2, 32); 431 const LLT V3S32 = LLT::vector(3, 32); 432 const LLT V4S32 = LLT::vector(4, 32); 433 const LLT V5S32 = LLT::vector(5, 32); 434 const LLT V6S32 = LLT::vector(6, 32); 435 const LLT V7S32 = LLT::vector(7, 32); 436 const LLT V8S32 = LLT::vector(8, 32); 437 const LLT V9S32 = LLT::vector(9, 32); 438 const LLT V10S32 = LLT::vector(10, 32); 439 const LLT V11S32 = LLT::vector(11, 32); 440 const LLT V12S32 = LLT::vector(12, 32); 441 const LLT V13S32 = LLT::vector(13, 32); 442 const LLT V14S32 = LLT::vector(14, 32); 443 const LLT V15S32 = LLT::vector(15, 32); 444 const LLT V16S32 = LLT::vector(16, 32); 445 const LLT V32S32 = LLT::vector(32, 32); 446 447 const LLT V2S64 = LLT::vector(2, 64); 448 const LLT V3S64 = LLT::vector(3, 64); 449 const LLT V4S64 = LLT::vector(4, 64); 450 const LLT V5S64 = LLT::vector(5, 64); 451 const LLT V6S64 = LLT::vector(6, 64); 452 const LLT V7S64 = LLT::vector(7, 64); 453 const LLT V8S64 = LLT::vector(8, 64); 454 const LLT V16S64 = LLT::vector(16, 64); 455 456 std::initializer_list<LLT> AllS32Vectors = 457 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 458 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 459 std::initializer_list<LLT> AllS64Vectors = 460 {V2S64, V3S64, V4S64, V5S64, V6S64, 
V7S64, V8S64, V16S64}; 461 462 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 463 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 464 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 465 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 466 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 467 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 468 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 469 470 const LLT CodePtr = FlatPtr; 471 472 const std::initializer_list<LLT> AddrSpaces64 = { 473 GlobalPtr, ConstantPtr, FlatPtr 474 }; 475 476 const std::initializer_list<LLT> AddrSpaces32 = { 477 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 478 }; 479 480 const std::initializer_list<LLT> FPTypesBase = { 481 S32, S64 482 }; 483 484 const std::initializer_list<LLT> FPTypes16 = { 485 S32, S64, S16 486 }; 487 488 const std::initializer_list<LLT> FPTypesPK16 = { 489 S32, S64, S16, V2S16 490 }; 491 492 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 493 494 setAction({G_BRCOND, S1}, Legal); // VCC branches 495 setAction({G_BRCOND, S32}, Legal); // SCC branches 496 497 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 498 // elements for v3s16 499 getActionDefinitionsBuilder(G_PHI) 500 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 501 .legalFor(AllS32Vectors) 502 .legalFor(AllS64Vectors) 503 .legalFor(AddrSpaces64) 504 .legalFor(AddrSpaces32) 505 .legalIf(isPointer(0)) 506 .clampScalar(0, S16, S256) 507 .widenScalarToNextPow2(0, 32) 508 .clampMaxNumElements(0, S32, 16) 509 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 510 .scalarize(0); 511 512 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 513 // Full set of gfx9 features. 514 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 515 .legalFor({S32, S16, V2S16}) 516 .clampScalar(0, S16, S32) 517 .clampMaxNumElements(0, S16, 2) 518 .scalarize(0) 519 .widenScalarToNextPow2(0, 32); 520 521 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 522 .legalFor({S32, S16, V2S16}) // Clamp modifier 523 .minScalarOrElt(0, S16) 524 .clampMaxNumElements(0, S16, 2) 525 .scalarize(0) 526 .widenScalarToNextPow2(0, 32) 527 .lower(); 528 } else if (ST.has16BitInsts()) { 529 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 530 .legalFor({S32, S16}) 531 .clampScalar(0, S16, S32) 532 .scalarize(0) 533 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 534 535 // Technically the saturating operations require clamp bit support, but this 536 // was introduced at the same time as 16-bit operations. 537 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 538 .legalFor({S32, S16}) // Clamp modifier 539 .minScalar(0, S16) 540 .scalarize(0) 541 .widenScalarToNextPow2(0, 16) 542 .lower(); 543 544 // We're just lowering this, but it helps get a better result to try to 545 // coerce to the desired type first. 546 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 547 .minScalar(0, S16) 548 .scalarize(0) 549 .lower(); 550 } else { 551 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 552 .legalFor({S32}) 553 .clampScalar(0, S32, S32) 554 .scalarize(0); 555 556 if (ST.hasIntClamp()) { 557 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 558 .legalFor({S32}) // Clamp modifier. 559 .scalarize(0) 560 .minScalarOrElt(0, S32) 561 .lower(); 562 } else { 563 // Clamp bit support was added in VI, along with 16-bit operations. 
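    // No types are marked legal here, so without the clamp bit these
    // saturating operations are always expanded by the generic lowering.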
564 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 565 .minScalar(0, S32) 566 .scalarize(0) 567 .lower(); 568 } 569 570 // FIXME: DAG expansion gets better results. The widening uses the smaller 571 // range values and goes for the min/max lowering directly. 572 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 573 .minScalar(0, S32) 574 .scalarize(0) 575 .lower(); 576 } 577 578 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 579 .customFor({S32, S64}) 580 .clampScalar(0, S32, S64) 581 .widenScalarToNextPow2(0, 32) 582 .scalarize(0); 583 584 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 585 .legalFor({S32}) 586 .clampScalar(0, S32, S32) 587 .scalarize(0); 588 589 // Report legal for any types we can handle anywhere. For the cases only legal 590 // on the SALU, RegBankSelect will be able to re-legalize. 591 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 592 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 593 .clampScalar(0, S32, S64) 594 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 595 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 596 .widenScalarToNextPow2(0) 597 .scalarize(0); 598 599 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 600 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 601 .legalFor({{S32, S1}, {S32, S32}}) 602 .minScalar(0, S32) 603 // TODO: .scalarize(0) 604 .lower(); 605 606 getActionDefinitionsBuilder(G_BITCAST) 607 // Don't worry about the size constraint. 608 .legalIf(all(isRegisterType(0), isRegisterType(1))) 609 .lower(); 610 611 612 getActionDefinitionsBuilder(G_CONSTANT) 613 .legalFor({S1, S32, S64, S16, GlobalPtr, 614 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 615 .legalIf(isPointer(0)) 616 .clampScalar(0, S32, S64) 617 .widenScalarToNextPow2(0); 618 619 getActionDefinitionsBuilder(G_FCONSTANT) 620 .legalFor({S32, S64, S16}) 621 .clampScalar(0, S16, S64); 622 623 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 624 .legalIf(isRegisterType(0)) 625 // s1 and s16 are special cases because they have legal operations on 626 // them, but don't really occupy registers in the normal way. 627 .legalFor({S1, S16}) 628 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 629 .clampScalarOrElt(0, S32, MaxScalar) 630 .widenScalarToNextPow2(0, 32) 631 .clampMaxNumElements(0, S32, 16); 632 633 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 634 635 // If the amount is divergent, we have to do a wave reduction to get the 636 // maximum value, so this is expanded during RegBankSelect. 
637 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 638 .legalFor({{PrivatePtr, S32}}); 639 640 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 641 .customIf(typeIsNot(0, PrivatePtr)); 642 643 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 644 645 auto &FPOpActions = getActionDefinitionsBuilder( 646 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 647 .legalFor({S32, S64}); 648 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 649 .customFor({S32, S64}); 650 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 651 .customFor({S32, S64}); 652 653 if (ST.has16BitInsts()) { 654 if (ST.hasVOP3PInsts()) 655 FPOpActions.legalFor({S16, V2S16}); 656 else 657 FPOpActions.legalFor({S16}); 658 659 TrigActions.customFor({S16}); 660 FDIVActions.customFor({S16}); 661 } 662 663 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 664 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 665 666 if (ST.hasVOP3PInsts()) { 667 MinNumMaxNum.customFor(FPTypesPK16) 668 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 669 .clampMaxNumElements(0, S16, 2) 670 .clampScalar(0, S16, S64) 671 .scalarize(0); 672 } else if (ST.has16BitInsts()) { 673 MinNumMaxNum.customFor(FPTypes16) 674 .clampScalar(0, S16, S64) 675 .scalarize(0); 676 } else { 677 MinNumMaxNum.customFor(FPTypesBase) 678 .clampScalar(0, S32, S64) 679 .scalarize(0); 680 } 681 682 if (ST.hasVOP3PInsts()) 683 FPOpActions.clampMaxNumElements(0, S16, 2); 684 685 FPOpActions 686 .scalarize(0) 687 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 688 689 TrigActions 690 .scalarize(0) 691 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 692 693 FDIVActions 694 .scalarize(0) 695 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 696 697 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 698 .legalFor(FPTypesPK16) 699 .clampMaxNumElements(0, S16, 2) 700 .scalarize(0) 701 .clampScalar(0, S16, S64); 702 703 if (ST.has16BitInsts()) { 704 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 705 .legalFor({S32, S64, S16}) 706 .scalarize(0) 707 .clampScalar(0, S16, S64); 708 } else { 709 getActionDefinitionsBuilder(G_FSQRT) 710 .legalFor({S32, S64}) 711 .scalarize(0) 712 .clampScalar(0, S32, S64); 713 714 if (ST.hasFractBug()) { 715 getActionDefinitionsBuilder(G_FFLOOR) 716 .customFor({S64}) 717 .legalFor({S32, S64}) 718 .scalarize(0) 719 .clampScalar(0, S32, S64); 720 } else { 721 getActionDefinitionsBuilder(G_FFLOOR) 722 .legalFor({S32, S64}) 723 .scalarize(0) 724 .clampScalar(0, S32, S64); 725 } 726 } 727 728 getActionDefinitionsBuilder(G_FPTRUNC) 729 .legalFor({{S32, S64}, {S16, S32}}) 730 .scalarize(0) 731 .lower(); 732 733 getActionDefinitionsBuilder(G_FPEXT) 734 .legalFor({{S64, S32}, {S32, S16}}) 735 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 736 .scalarize(0); 737 738 getActionDefinitionsBuilder(G_FSUB) 739 // Use actual fsub instruction 740 .legalFor({S32}) 741 // Must use fadd + fneg 742 .lowerFor({S64, S16, V2S16}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 // Whether this is legal depends on the floating point mode for the function. 
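  // Note: the custom action (legalizeFMad) is expected to keep G_FMAD only
  // when the relevant denormals are flushed, and to expand it into separate
  // multiply and add operations otherwise.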
747 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 748 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 749 FMad.customFor({S32, S16}); 750 else if (ST.hasMadMacF32Insts()) 751 FMad.customFor({S32}); 752 else if (ST.hasMadF16()) 753 FMad.customFor({S16}); 754 FMad.scalarize(0) 755 .lower(); 756 757 auto &FRem = getActionDefinitionsBuilder(G_FREM); 758 if (ST.has16BitInsts()) { 759 FRem.customFor({S16, S32, S64}); 760 } else { 761 FRem.minScalar(0, S32) 762 .customFor({S32, S64}); 763 } 764 FRem.scalarize(0); 765 766 // TODO: Do we need to clamp maximum bitwidth? 767 getActionDefinitionsBuilder(G_TRUNC) 768 .legalIf(isScalar(0)) 769 .legalFor({{V2S16, V2S32}}) 770 .clampMaxNumElements(0, S16, 2) 771 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 772 // situations (like an invalid implicit use), we don't want to infinite loop 773 // in the legalizer. 774 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 775 .alwaysLegal(); 776 777 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 778 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 779 {S32, S1}, {S64, S1}, {S16, S1}}) 780 .scalarize(0) 781 .clampScalar(0, S32, S64) 782 .widenScalarToNextPow2(1, 32); 783 784 // TODO: Split s1->s64 during regbankselect for VALU. 785 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 786 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 787 .lowerFor({{S32, S64}}) 788 .lowerIf(typeIs(1, S1)) 789 .customFor({{S64, S64}}); 790 if (ST.has16BitInsts()) 791 IToFP.legalFor({{S16, S16}}); 792 IToFP.clampScalar(1, S32, S64) 793 .minScalar(0, S32) 794 .scalarize(0) 795 .widenScalarToNextPow2(1); 796 797 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 798 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 799 .customFor({{S64, S64}}) 800 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 801 if (ST.has16BitInsts()) 802 FPToI.legalFor({{S16, S16}}); 803 else 804 FPToI.minScalar(1, S32); 805 806 FPToI.minScalar(0, S32) 807 .scalarize(0) 808 .lower(); 809 810 // Lower roundeven into G_FRINT 811 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 812 .scalarize(0) 813 .lower(); 814 815 if (ST.has16BitInsts()) { 816 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 817 .legalFor({S16, S32, S64}) 818 .clampScalar(0, S16, S64) 819 .scalarize(0); 820 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 821 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 822 .legalFor({S32, S64}) 823 .clampScalar(0, S32, S64) 824 .scalarize(0); 825 } else { 826 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 827 .legalFor({S32}) 828 .customFor({S64}) 829 .clampScalar(0, S32, S64) 830 .scalarize(0); 831 } 832 833 getActionDefinitionsBuilder(G_PTR_ADD) 834 .legalIf(all(isPointer(0), sameSize(0, 1))) 835 .scalarize(0) 836 .scalarSameSizeAs(1, 0); 837 838 getActionDefinitionsBuilder(G_PTRMASK) 839 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 840 .scalarSameSizeAs(1, 0) 841 .scalarize(0); 842 843 auto &CmpBuilder = 844 getActionDefinitionsBuilder(G_ICMP) 845 // The compare output type differs based on the register bank of the output, 846 // so make both s1 and s32 legal. 847 // 848 // Scalar compares producing output in scc will be promoted to s32, as that 849 // is the allocatable register type that will be needed for the copy from 850 // scc. This will be promoted during RegBankSelect, and we assume something 851 // before that won't try to use s32 result types. 
852 // 853 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 854 // bank. 855 .legalForCartesianProduct( 856 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 857 .legalForCartesianProduct( 858 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 859 if (ST.has16BitInsts()) { 860 CmpBuilder.legalFor({{S1, S16}}); 861 } 862 863 CmpBuilder 864 .widenScalarToNextPow2(1) 865 .clampScalar(1, S32, S64) 866 .scalarize(0) 867 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 868 869 getActionDefinitionsBuilder(G_FCMP) 870 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 871 .widenScalarToNextPow2(1) 872 .clampScalar(1, S32, S64) 873 .scalarize(0); 874 875 // FIXME: fpow has a selection pattern that should move to custom lowering. 876 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 877 if (ST.has16BitInsts()) 878 Exp2Ops.legalFor({S32, S16}); 879 else 880 Exp2Ops.legalFor({S32}); 881 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 882 Exp2Ops.scalarize(0); 883 884 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 885 if (ST.has16BitInsts()) 886 ExpOps.customFor({{S32}, {S16}}); 887 else 888 ExpOps.customFor({S32}); 889 ExpOps.clampScalar(0, MinScalarFPTy, S32) 890 .scalarize(0); 891 892 getActionDefinitionsBuilder(G_FPOWI) 893 .clampScalar(0, MinScalarFPTy, S32) 894 .lower(); 895 896 // The 64-bit versions produce 32-bit results, but only on the SALU. 897 getActionDefinitionsBuilder(G_CTPOP) 898 .legalFor({{S32, S32}, {S32, S64}}) 899 .clampScalar(0, S32, S32) 900 .clampScalar(1, S32, S64) 901 .scalarize(0) 902 .widenScalarToNextPow2(0, 32) 903 .widenScalarToNextPow2(1, 32); 904 905 // The hardware instructions return a different result on 0 than the generic 906 // instructions expect. The hardware produces -1, but these produce the 907 // bitwidth. 908 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 909 .scalarize(0) 910 .clampScalar(0, S32, S32) 911 .clampScalar(1, S32, S64) 912 .widenScalarToNextPow2(0, 32) 913 .widenScalarToNextPow2(1, 32) 914 .lower(); 915 916 // The 64-bit versions produce 32-bit results, but only on the SALU. 917 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 918 .legalFor({{S32, S32}, {S32, S64}}) 919 .clampScalar(0, S32, S32) 920 .clampScalar(1, S32, S64) 921 .scalarize(0) 922 .widenScalarToNextPow2(0, 32) 923 .widenScalarToNextPow2(1, 32); 924 925 getActionDefinitionsBuilder(G_BITREVERSE) 926 .legalFor({S32}) 927 .clampScalar(0, S32, S32) 928 .scalarize(0); 929 930 if (ST.has16BitInsts()) { 931 getActionDefinitionsBuilder(G_BSWAP) 932 .legalFor({S16, S32, V2S16}) 933 .clampMaxNumElements(0, S16, 2) 934 // FIXME: Fixing non-power-of-2 before clamp is workaround for 935 // narrowScalar limitation. 
936 .widenScalarToNextPow2(0) 937 .clampScalar(0, S16, S32) 938 .scalarize(0); 939 940 if (ST.hasVOP3PInsts()) { 941 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 942 .legalFor({S32, S16, V2S16}) 943 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 944 .clampMaxNumElements(0, S16, 2) 945 .minScalar(0, S16) 946 .widenScalarToNextPow2(0) 947 .scalarize(0) 948 .lower(); 949 } else { 950 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 951 .legalFor({S32, S16}) 952 .widenScalarToNextPow2(0) 953 .minScalar(0, S16) 954 .scalarize(0) 955 .lower(); 956 } 957 } else { 958 // TODO: Should have same legality without v_perm_b32 959 getActionDefinitionsBuilder(G_BSWAP) 960 .legalFor({S32}) 961 .lowerIf(scalarNarrowerThan(0, 32)) 962 // FIXME: Fixing non-power-of-2 before clamp is workaround for 963 // narrowScalar limitation. 964 .widenScalarToNextPow2(0) 965 .maxScalar(0, S32) 966 .scalarize(0) 967 .lower(); 968 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 970 .legalFor({S32}) 971 .minScalar(0, S32) 972 .widenScalarToNextPow2(0) 973 .scalarize(0) 974 .lower(); 975 } 976 977 getActionDefinitionsBuilder(G_INTTOPTR) 978 // List the common cases 979 .legalForCartesianProduct(AddrSpaces64, {S64}) 980 .legalForCartesianProduct(AddrSpaces32, {S32}) 981 .scalarize(0) 982 // Accept any address space as long as the size matches 983 .legalIf(sameSize(0, 1)) 984 .widenScalarIf(smallerThan(1, 0), 985 [](const LegalityQuery &Query) { 986 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 987 }) 988 .narrowScalarIf(largerThan(1, 0), 989 [](const LegalityQuery &Query) { 990 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 991 }); 992 993 getActionDefinitionsBuilder(G_PTRTOINT) 994 // List the common cases 995 .legalForCartesianProduct(AddrSpaces64, {S64}) 996 .legalForCartesianProduct(AddrSpaces32, {S32}) 997 .scalarize(0) 998 // Accept any address space as long as the size matches 999 .legalIf(sameSize(0, 1)) 1000 .widenScalarIf(smallerThan(0, 1), 1001 [](const LegalityQuery &Query) { 1002 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1003 }) 1004 .narrowScalarIf( 1005 largerThan(0, 1), 1006 [](const LegalityQuery &Query) { 1007 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1008 }); 1009 1010 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1011 .scalarize(0) 1012 .custom(); 1013 1014 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1015 bool IsLoad) -> bool { 1016 const LLT DstTy = Query.Types[0]; 1017 1018 // Split vector extloads. 1019 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1020 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1021 1022 if (MemSize < DstTy.getSizeInBits()) 1023 MemSize = std::max(MemSize, AlignBits); 1024 1025 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1026 return true; 1027 1028 const LLT PtrTy = Query.Types[1]; 1029 unsigned AS = PtrTy.getAddressSpace(); 1030 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1031 return true; 1032 1033 // Catch weird sized loads that don't evenly divide into the access sizes 1034 // TODO: May be able to widen depending on alignment etc. 1035 unsigned NumRegs = (MemSize + 31) / 32; 1036 if (NumRegs == 3) { 1037 if (!ST.hasDwordx3LoadStores()) 1038 return true; 1039 } else { 1040 // If the alignment allows, these should have been widened. 
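      // For example, a 5-dword (160-bit) access does not map to a
      // power-of-two register count and is split here.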
1041 if (!isPowerOf2_32(NumRegs)) 1042 return true; 1043 } 1044 1045 if (AlignBits < MemSize) { 1046 const SITargetLowering *TLI = ST.getTargetLowering(); 1047 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1048 Align(AlignBits / 8)); 1049 } 1050 1051 return false; 1052 }; 1053 1054 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 1055 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 1056 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 1057 1058 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1059 // LDS 1060 // TODO: Unsupported flat for SI. 1061 1062 for (unsigned Op : {G_LOAD, G_STORE}) { 1063 const bool IsStore = Op == G_STORE; 1064 1065 auto &Actions = getActionDefinitionsBuilder(Op); 1066 // Explicitly list some common cases. 1067 // TODO: Does this help compile time at all? 1068 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1069 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1070 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1071 {S64, GlobalPtr, 64, GlobalAlign32}, 1072 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1073 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1074 {S32, GlobalPtr, 8, GlobalAlign8}, 1075 {S32, GlobalPtr, 16, GlobalAlign16}, 1076 1077 {S32, LocalPtr, 32, 32}, 1078 {S64, LocalPtr, 64, 32}, 1079 {V2S32, LocalPtr, 64, 32}, 1080 {S32, LocalPtr, 8, 8}, 1081 {S32, LocalPtr, 16, 16}, 1082 {V2S16, LocalPtr, 32, 32}, 1083 1084 {S32, PrivatePtr, 32, 32}, 1085 {S32, PrivatePtr, 8, 8}, 1086 {S32, PrivatePtr, 16, 16}, 1087 {V2S16, PrivatePtr, 32, 32}, 1088 1089 {S32, ConstantPtr, 32, GlobalAlign32}, 1090 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1091 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1092 {S64, ConstantPtr, 64, GlobalAlign32}, 1093 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1094 Actions.legalIf( 1095 [=](const LegalityQuery &Query) -> bool { 1096 return isLoadStoreLegal(ST, Query, Op); 1097 }); 1098 1099 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1100 // 64-bits. 1101 // 1102 // TODO: Should generalize bitcast action into coerce, which will also cover 1103 // inserting addrspacecasts. 1104 Actions.customIf(typeIs(1, Constant32Ptr)); 1105 1106 // Turn any illegal element vectors into something easier to deal 1107 // with. These will ultimately produce 32-bit scalar shifts to extract the 1108 // parts anyway. 1109 // 1110 // For odd 16-bit element vectors, prefer to split those into pieces with 1111 // 16-bit vector parts. 1112 Actions.bitcastIf( 1113 [=](const LegalityQuery &Query) -> bool { 1114 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1115 Query.MMODescrs[0].SizeInBits); 1116 }, bitcastToRegisterType(0)); 1117 1118 if (!IsStore) { 1119 // Widen suitably aligned loads by loading extra bytes. The standard 1120 // legalization actions can't properly express widening memory operands. 1121 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1122 return shouldWidenLoad(ST, Query, G_LOAD); 1123 }); 1124 } 1125 1126 // FIXME: load/store narrowing should be moved to lower action 1127 Actions 1128 .narrowScalarIf( 1129 [=](const LegalityQuery &Query) -> bool { 1130 return !Query.Types[0].isVector() && 1131 needToSplitMemOp(Query, Op == G_LOAD); 1132 }, 1133 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1134 const LLT DstTy = Query.Types[0]; 1135 const LLT PtrTy = Query.Types[1]; 1136 1137 const unsigned DstSize = DstTy.getSizeInBits(); 1138 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1139 1140 // Split extloads. 
1141 if (DstSize > MemSize) 1142 return std::make_pair(0, LLT::scalar(MemSize)); 1143 1144 if (!isPowerOf2_32(DstSize)) { 1145 // We're probably decomposing an odd sized store. Try to split 1146 // to the widest type. TODO: Account for alignment. As-is it 1147 // should be OK, since the new parts will be further legalized. 1148 unsigned FloorSize = PowerOf2Floor(DstSize); 1149 return std::make_pair(0, LLT::scalar(FloorSize)); 1150 } 1151 1152 if (DstSize > 32 && (DstSize % 32 != 0)) { 1153 // FIXME: Need a way to specify non-extload of larger size if 1154 // suitably aligned. 1155 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1156 } 1157 1158 unsigned MaxSize = maxSizeForAddrSpace(ST, 1159 PtrTy.getAddressSpace(), 1160 Op == G_LOAD); 1161 if (MemSize > MaxSize) 1162 return std::make_pair(0, LLT::scalar(MaxSize)); 1163 1164 unsigned Align = Query.MMODescrs[0].AlignInBits; 1165 return std::make_pair(0, LLT::scalar(Align)); 1166 }) 1167 .fewerElementsIf( 1168 [=](const LegalityQuery &Query) -> bool { 1169 return Query.Types[0].isVector() && 1170 needToSplitMemOp(Query, Op == G_LOAD); 1171 }, 1172 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1173 const LLT DstTy = Query.Types[0]; 1174 const LLT PtrTy = Query.Types[1]; 1175 1176 LLT EltTy = DstTy.getElementType(); 1177 unsigned MaxSize = maxSizeForAddrSpace(ST, 1178 PtrTy.getAddressSpace(), 1179 Op == G_LOAD); 1180 1181 // FIXME: Handle widened to power of 2 results better. This ends 1182 // up scalarizing. 1183 // FIXME: 3 element stores scalarized on SI 1184 1185 // Split if it's too large for the address space. 1186 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1187 unsigned NumElts = DstTy.getNumElements(); 1188 unsigned EltSize = EltTy.getSizeInBits(); 1189 1190 if (MaxSize % EltSize == 0) { 1191 return std::make_pair( 1192 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1193 } 1194 1195 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1196 1197 // FIXME: Refine when odd breakdowns handled 1198 // The scalars will need to be re-legalized. 1199 if (NumPieces == 1 || NumPieces >= NumElts || 1200 NumElts % NumPieces != 0) 1201 return std::make_pair(0, EltTy); 1202 1203 return std::make_pair(0, 1204 LLT::vector(NumElts / NumPieces, EltTy)); 1205 } 1206 1207 // FIXME: We could probably handle weird extending loads better. 1208 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1209 if (DstTy.getSizeInBits() > MemSize) 1210 return std::make_pair(0, EltTy); 1211 1212 unsigned EltSize = EltTy.getSizeInBits(); 1213 unsigned DstSize = DstTy.getSizeInBits(); 1214 if (!isPowerOf2_32(DstSize)) { 1215 // We're probably decomposing an odd sized store. Try to split 1216 // to the widest type. TODO: Account for alignment. As-is it 1217 // should be OK, since the new parts will be further legalized. 1218 unsigned FloorSize = PowerOf2Floor(DstSize); 1219 return std::make_pair( 1220 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1221 } 1222 1223 // Need to split because of alignment. 1224 unsigned Align = Query.MMODescrs[0].AlignInBits; 1225 if (EltSize > Align && 1226 (EltSize / Align < DstTy.getNumElements())) { 1227 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1228 } 1229 1230 // May need relegalization for the scalars. 
1231 return std::make_pair(0, EltTy); 1232 }) 1233 .lowerIfMemSizeNotPow2() 1234 .minScalar(0, S32); 1235 1236 if (IsStore) 1237 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1238 1239 Actions 1240 .widenScalarToNextPow2(0) 1241 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1242 .lower(); 1243 } 1244 1245 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1246 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1247 {S32, GlobalPtr, 16, 2 * 8}, 1248 {S32, LocalPtr, 8, 8}, 1249 {S32, LocalPtr, 16, 16}, 1250 {S32, PrivatePtr, 8, 8}, 1251 {S32, PrivatePtr, 16, 16}, 1252 {S32, ConstantPtr, 8, 8}, 1253 {S32, ConstantPtr, 16, 2 * 8}}); 1254 if (ST.hasFlatAddressSpace()) { 1255 ExtLoads.legalForTypesWithMemDesc( 1256 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1257 } 1258 1259 ExtLoads.clampScalar(0, S32, S32) 1260 .widenScalarToNextPow2(0) 1261 .unsupportedIfMemSizeNotPow2() 1262 .lower(); 1263 1264 auto &Atomics = getActionDefinitionsBuilder( 1265 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1266 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1267 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1268 G_ATOMICRMW_UMIN}) 1269 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1270 {S64, GlobalPtr}, {S64, LocalPtr}, 1271 {S32, RegionPtr}, {S64, RegionPtr}}); 1272 if (ST.hasFlatAddressSpace()) { 1273 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1274 } 1275 1276 if (ST.hasLDSFPAtomics()) { 1277 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1278 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1279 } 1280 1281 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1282 // demarshalling 1283 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1284 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1285 {S32, FlatPtr}, {S64, FlatPtr}}) 1286 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1287 {S32, RegionPtr}, {S64, RegionPtr}}); 1288 // TODO: Pointer types, any 32-bit or 64-bit vector 1289 1290 // Condition should be s32 for scalar, s1 for vector. 1291 getActionDefinitionsBuilder(G_SELECT) 1292 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1293 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1294 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1295 .clampScalar(0, S16, S64) 1296 .scalarize(1) 1297 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1298 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1299 .clampMaxNumElements(0, S32, 2) 1300 .clampMaxNumElements(0, LocalPtr, 2) 1301 .clampMaxNumElements(0, PrivatePtr, 2) 1302 .scalarize(0) 1303 .widenScalarToNextPow2(0) 1304 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1305 1306 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1307 // be more flexible with the shift amount type. 1308 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1309 .legalFor({{S32, S32}, {S64, S32}}); 1310 if (ST.has16BitInsts()) { 1311 if (ST.hasVOP3PInsts()) { 1312 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1313 .clampMaxNumElements(0, S16, 2); 1314 } else 1315 Shifts.legalFor({{S16, S16}}); 1316 1317 // TODO: Support 16-bit shift amounts for all types 1318 Shifts.widenScalarIf( 1319 [=](const LegalityQuery &Query) { 1320 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1321 // 32-bit amount. 
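        // For example, an s16 shift with an s8 amount has the amount widened
        // to s16 here; wider shifts get a 32-bit amount via the clamp below.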
1322 const LLT ValTy = Query.Types[0]; 1323 const LLT AmountTy = Query.Types[1]; 1324 return ValTy.getSizeInBits() <= 16 && 1325 AmountTy.getSizeInBits() < 16; 1326 }, changeTo(1, S16)); 1327 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1328 Shifts.clampScalar(1, S32, S32); 1329 Shifts.clampScalar(0, S16, S64); 1330 Shifts.widenScalarToNextPow2(0, 16); 1331 1332 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1333 .minScalar(0, S16) 1334 .scalarize(0) 1335 .lower(); 1336 } else { 1337 // Make sure we legalize the shift amount type first, as the general 1338 // expansion for the shifted type will produce much worse code if it hasn't 1339 // been truncated already. 1340 Shifts.clampScalar(1, S32, S32); 1341 Shifts.clampScalar(0, S32, S64); 1342 Shifts.widenScalarToNextPow2(0, 32); 1343 1344 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1345 .minScalar(0, S32) 1346 .scalarize(0) 1347 .lower(); 1348 } 1349 Shifts.scalarize(0); 1350 1351 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1352 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1353 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1354 unsigned IdxTypeIdx = 2; 1355 1356 getActionDefinitionsBuilder(Op) 1357 .customIf([=](const LegalityQuery &Query) { 1358 const LLT EltTy = Query.Types[EltTypeIdx]; 1359 const LLT VecTy = Query.Types[VecTypeIdx]; 1360 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1361 const unsigned EltSize = EltTy.getSizeInBits(); 1362 return (EltSize == 32 || EltSize == 64) && 1363 VecTy.getSizeInBits() % 32 == 0 && 1364 VecTy.getSizeInBits() <= MaxRegisterSize && 1365 IdxTy.getSizeInBits() == 32; 1366 }) 1367 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1368 bitcastToVectorElement32(VecTypeIdx)) 1369 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1370 .bitcastIf( 1371 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1372 [=](const LegalityQuery &Query) { 1373 // For > 64-bit element types, try to turn this into a 64-bit 1374 // element vector since we may be able to do better indexing 1375 // if this is scalar. If not, fall back to 32. 1376 const LLT EltTy = Query.Types[EltTypeIdx]; 1377 const LLT VecTy = Query.Types[VecTypeIdx]; 1378 const unsigned DstEltSize = EltTy.getSizeInBits(); 1379 const unsigned VecSize = VecTy.getSizeInBits(); 1380 1381 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1382 return std::make_pair( 1383 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1384 }) 1385 .clampScalar(EltTypeIdx, S32, S64) 1386 .clampScalar(VecTypeIdx, S32, S64) 1387 .clampScalar(IdxTypeIdx, S32, S32) 1388 .clampMaxNumElements(VecTypeIdx, S32, 32) 1389 // TODO: Clamp elements for 64-bit vectors? 1390 // It should only be necessary with variable indexes. 1391 // As a last resort, lower to the stack 1392 .lower(); 1393 } 1394 1395 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1396 .unsupportedIf([=](const LegalityQuery &Query) { 1397 const LLT &EltTy = Query.Types[1].getElementType(); 1398 return Query.Types[0] != EltTy; 1399 }); 1400 1401 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1402 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1403 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1404 1405 // FIXME: Doesn't handle extract of illegal sizes. 1406 getActionDefinitionsBuilder(Op) 1407 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1408 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];
        return (BigTy.getSizeInBits() % 32 == 0) &&
               (LitTy.getSizeInBits() % 16 == 0);
      })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        return BigTy.getSizeInBits() == 32;
      })
      // Try to widen to s16 first for small types.
1501 // TODO: Only do this on targets with legal s16 shifts 1502 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1503 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1504 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1505 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1506 elementTypeIs(1, S16)), 1507 changeTo(1, V2S16)) 1508 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1509 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1510 // valid. 1511 .clampScalar(LitTyIdx, S32, S512) 1512 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1513 // Break up vectors with weird elements into scalars 1514 .fewerElementsIf( 1515 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1516 scalarize(0)) 1517 .fewerElementsIf( 1518 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1519 scalarize(1)) 1520 .clampScalar(BigTyIdx, S32, MaxScalar); 1521 1522 if (Op == G_MERGE_VALUES) { 1523 Builder.widenScalarIf( 1524 // TODO: Use 16-bit shifts if legal for 8-bit values? 1525 [=](const LegalityQuery &Query) { 1526 const LLT Ty = Query.Types[LitTyIdx]; 1527 return Ty.getSizeInBits() < 32; 1528 }, 1529 changeTo(LitTyIdx, S32)); 1530 } 1531 1532 Builder.widenScalarIf( 1533 [=](const LegalityQuery &Query) { 1534 const LLT Ty = Query.Types[BigTyIdx]; 1535 return !isPowerOf2_32(Ty.getSizeInBits()) && 1536 Ty.getSizeInBits() % 16 != 0; 1537 }, 1538 [=](const LegalityQuery &Query) { 1539 // Pick the next power of 2, or a multiple of 64 over 128. 1540 // Whichever is smaller. 1541 const LLT &Ty = Query.Types[BigTyIdx]; 1542 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1543 if (NewSizeInBits >= 256) { 1544 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1545 if (RoundedTo < NewSizeInBits) 1546 NewSizeInBits = RoundedTo; 1547 } 1548 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1549 }) 1550 .legalIf([=](const LegalityQuery &Query) { 1551 const LLT &BigTy = Query.Types[BigTyIdx]; 1552 const LLT &LitTy = Query.Types[LitTyIdx]; 1553 1554 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1555 return false; 1556 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1557 return false; 1558 1559 return BigTy.getSizeInBits() % 16 == 0 && 1560 LitTy.getSizeInBits() % 16 == 0 && 1561 BigTy.getSizeInBits() <= MaxRegisterSize; 1562 }) 1563 // Any vectors left are the wrong size. Scalarize them. 1564 .scalarize(0) 1565 .scalarize(1); 1566 } 1567 1568 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1569 // RegBankSelect. 1570 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1571 .legalFor({{S32}, {S64}}); 1572 1573 if (ST.hasVOP3PInsts()) { 1574 SextInReg.lowerFor({{V2S16}}) 1575 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1576 // get more vector shift opportunities, since we'll get those when 1577 // expanded. 1578 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1579 } else if (ST.has16BitInsts()) { 1580 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1581 } else { 1582 // Prefer to promote to s32 before lowering if we don't have 16-bit 1583 // shifts. This avoid a lot of intermediate truncate and extend operations. 
1584 SextInReg.lowerFor({{S32}, {S64}}); 1585 } 1586 1587 SextInReg 1588 .scalarize(0) 1589 .clampScalar(0, S32, S64) 1590 .lower(); 1591 1592 getActionDefinitionsBuilder(G_FSHR) 1593 .legalFor({{S32, S32}}) 1594 .scalarize(0) 1595 .lower(); 1596 1597 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1598 .legalFor({S64}); 1599 1600 getActionDefinitionsBuilder(G_FENCE) 1601 .alwaysLegal(); 1602 1603 getActionDefinitionsBuilder({ 1604 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1605 G_FCOPYSIGN, 1606 1607 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1608 G_ATOMICRMW_NAND, 1609 G_ATOMICRMW_FSUB, 1610 G_READ_REGISTER, 1611 G_WRITE_REGISTER, 1612 1613 G_SADDO, G_SSUBO, 1614 1615 // TODO: Implement 1616 G_FMINIMUM, G_FMAXIMUM, 1617 G_FSHL 1618 }).lower(); 1619 1620 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1621 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1622 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1623 .unsupported(); 1624 1625 computeTables(); 1626 verify(*ST.getInstrInfo()); 1627 } 1628 1629 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1630 MachineInstr &MI) const { 1631 MachineIRBuilder &B = Helper.MIRBuilder; 1632 MachineRegisterInfo &MRI = *B.getMRI(); 1633 1634 switch (MI.getOpcode()) { 1635 case TargetOpcode::G_ADDRSPACE_CAST: 1636 return legalizeAddrSpaceCast(MI, MRI, B); 1637 case TargetOpcode::G_FRINT: 1638 return legalizeFrint(MI, MRI, B); 1639 case TargetOpcode::G_FCEIL: 1640 return legalizeFceil(MI, MRI, B); 1641 case TargetOpcode::G_FREM: 1642 return legalizeFrem(MI, MRI, B); 1643 case TargetOpcode::G_INTRINSIC_TRUNC: 1644 return legalizeIntrinsicTrunc(MI, MRI, B); 1645 case TargetOpcode::G_SITOFP: 1646 return legalizeITOFP(MI, MRI, B, true); 1647 case TargetOpcode::G_UITOFP: 1648 return legalizeITOFP(MI, MRI, B, false); 1649 case TargetOpcode::G_FPTOSI: 1650 return legalizeFPTOI(MI, MRI, B, true); 1651 case TargetOpcode::G_FPTOUI: 1652 return legalizeFPTOI(MI, MRI, B, false); 1653 case TargetOpcode::G_FMINNUM: 1654 case TargetOpcode::G_FMAXNUM: 1655 case TargetOpcode::G_FMINNUM_IEEE: 1656 case TargetOpcode::G_FMAXNUM_IEEE: 1657 return legalizeMinNumMaxNum(Helper, MI); 1658 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1659 return legalizeExtractVectorElt(MI, MRI, B); 1660 case TargetOpcode::G_INSERT_VECTOR_ELT: 1661 return legalizeInsertVectorElt(MI, MRI, B); 1662 case TargetOpcode::G_SHUFFLE_VECTOR: 1663 return legalizeShuffleVector(MI, MRI, B); 1664 case TargetOpcode::G_FSIN: 1665 case TargetOpcode::G_FCOS: 1666 return legalizeSinCos(MI, MRI, B); 1667 case TargetOpcode::G_GLOBAL_VALUE: 1668 return legalizeGlobalValue(MI, MRI, B); 1669 case TargetOpcode::G_LOAD: 1670 return legalizeLoad(Helper, MI); 1671 case TargetOpcode::G_FMAD: 1672 return legalizeFMad(MI, MRI, B); 1673 case TargetOpcode::G_FDIV: 1674 return legalizeFDIV(MI, MRI, B); 1675 case TargetOpcode::G_UDIV: 1676 case TargetOpcode::G_UREM: 1677 return legalizeUDIV_UREM(MI, MRI, B); 1678 case TargetOpcode::G_SDIV: 1679 case TargetOpcode::G_SREM: 1680 return legalizeSDIV_SREM(MI, MRI, B); 1681 case TargetOpcode::G_ATOMIC_CMPXCHG: 1682 return legalizeAtomicCmpXChg(MI, MRI, B); 1683 case TargetOpcode::G_FLOG: 1684 return legalizeFlog(MI, B, numbers::ln2f); 1685 case TargetOpcode::G_FLOG10: 1686 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1687 case TargetOpcode::G_FEXP: 1688 return legalizeFExp(MI, B); 1689 case TargetOpcode::G_FPOW: 1690 return legalizeFPow(MI, B); 1691 case TargetOpcode::G_FFLOOR: 1692 return legalizeFFloor(MI, MRI, B); 1693 case 
TargetOpcode::G_BUILD_VECTOR: 1694 return legalizeBuildVector(MI, MRI, B); 1695 default: 1696 return false; 1697 } 1698 1699 llvm_unreachable("expected switch to return"); 1700 } 1701 1702 Register AMDGPULegalizerInfo::getSegmentAperture( 1703 unsigned AS, 1704 MachineRegisterInfo &MRI, 1705 MachineIRBuilder &B) const { 1706 MachineFunction &MF = B.getMF(); 1707 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1708 const LLT S32 = LLT::scalar(32); 1709 1710 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1711 1712 if (ST.hasApertureRegs()) { 1713 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1714 // getreg. 1715 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1716 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1717 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1718 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1719 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1720 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1721 unsigned Encoding = 1722 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1723 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1724 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1725 1726 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1727 1728 B.buildInstr(AMDGPU::S_GETREG_B32) 1729 .addDef(GetReg) 1730 .addImm(Encoding); 1731 MRI.setType(GetReg, S32); 1732 1733 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1734 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1735 } 1736 1737 Register QueuePtr = MRI.createGenericVirtualRegister( 1738 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1739 1740 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1741 return Register(); 1742 1743 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1744 // private_segment_aperture_base_hi. 1745 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1746 1747 // TODO: can we be smarter about machine pointer info? 1748 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1749 MachineMemOperand *MMO = MF.getMachineMemOperand( 1750 PtrInfo, 1751 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1752 MachineMemOperand::MOInvariant, 1753 4, commonAlignment(Align(64), StructOffset)); 1754 1755 Register LoadAddr; 1756 1757 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1758 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1759 } 1760 1761 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1762 MachineInstr &MI, MachineRegisterInfo &MRI, 1763 MachineIRBuilder &B) const { 1764 MachineFunction &MF = B.getMF(); 1765 1766 const LLT S32 = LLT::scalar(32); 1767 Register Dst = MI.getOperand(0).getReg(); 1768 Register Src = MI.getOperand(1).getReg(); 1769 1770 LLT DstTy = MRI.getType(Dst); 1771 LLT SrcTy = MRI.getType(Src); 1772 unsigned DestAS = DstTy.getAddressSpace(); 1773 unsigned SrcAS = SrcTy.getAddressSpace(); 1774 1775 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1776 // vector element. 1777 assert(!DstTy.isVector()); 1778 1779 const AMDGPUTargetMachine &TM 1780 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1781 1782 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1783 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1784 return true; 1785 } 1786 1787 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1788 // Truncate. 
1789 B.buildExtract(Dst, Src, 0); 1790 MI.eraseFromParent(); 1791 return true; 1792 } 1793 1794 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1795 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1796 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1797 1798 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1799 // another. Merge operands are required to be the same type, but creating an 1800 // extra ptrtoint would be kind of pointless. 1801 auto HighAddr = B.buildConstant( 1802 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1803 B.buildMerge(Dst, {Src, HighAddr}); 1804 MI.eraseFromParent(); 1805 return true; 1806 } 1807 1808 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1809 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1810 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1811 unsigned NullVal = TM.getNullPointerValue(DestAS); 1812 1813 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1814 auto FlatNull = B.buildConstant(SrcTy, 0); 1815 1816 // Extract low 32-bits of the pointer. 1817 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1818 1819 auto CmpRes = 1820 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1821 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1822 1823 MI.eraseFromParent(); 1824 return true; 1825 } 1826 1827 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1828 return false; 1829 1830 if (!ST.hasFlatAddressSpace()) 1831 return false; 1832 1833 auto SegmentNull = 1834 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1835 auto FlatNull = 1836 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1837 1838 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1839 if (!ApertureReg.isValid()) 1840 return false; 1841 1842 auto CmpRes = 1843 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1844 1845 // Coerce the type of the low half of the result so we can use merge_values. 1846 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1847 1848 // TODO: Should we allow mismatched types but matching sizes in merges to 1849 // avoid the ptrtoint? 1850 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1851 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1852 1853 MI.eraseFromParent(); 1854 return true; 1855 } 1856 1857 bool AMDGPULegalizerInfo::legalizeFrint( 1858 MachineInstr &MI, MachineRegisterInfo &MRI, 1859 MachineIRBuilder &B) const { 1860 Register Src = MI.getOperand(1).getReg(); 1861 LLT Ty = MRI.getType(Src); 1862 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1863 1864 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1865 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1866 1867 auto C1 = B.buildFConstant(Ty, C1Val); 1868 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1869 1870 // TODO: Should this propagate fast-math-flags? 
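  // Worked example (illustrative, assuming the default round-to-nearest-even
  // mode): for Src = 2.5, CopySign is +2^52, so Tmp1 = 2.5 + 2^52 rounds to
  // 2^52 + 2.0 (the 0.5 cannot be represented at that magnitude) and
  // Tmp2 = Tmp1 - 2^52 = 2.0, i.e. rint(2.5). Inputs with |Src| > C2 already
  // have no fractional bits and are passed through by the select below.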
1871 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1872 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1873 1874 auto C2 = B.buildFConstant(Ty, C2Val); 1875 auto Fabs = B.buildFAbs(Ty, Src); 1876 1877 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1878 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 bool AMDGPULegalizerInfo::legalizeFceil( 1884 MachineInstr &MI, MachineRegisterInfo &MRI, 1885 MachineIRBuilder &B) const { 1886 1887 const LLT S1 = LLT::scalar(1); 1888 const LLT S64 = LLT::scalar(64); 1889 1890 Register Src = MI.getOperand(1).getReg(); 1891 assert(MRI.getType(Src) == S64); 1892 1893 // result = trunc(src) 1894 // if (src > 0.0 && src != result) 1895 // result += 1.0 1896 1897 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1898 1899 const auto Zero = B.buildFConstant(S64, 0.0); 1900 const auto One = B.buildFConstant(S64, 1.0); 1901 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1902 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1903 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1904 auto Add = B.buildSelect(S64, And, One, Zero); 1905 1906 // TODO: Should this propagate fast-math-flags? 1907 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeFrem( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 Register DstReg = MI.getOperand(0).getReg(); 1915 Register Src0Reg = MI.getOperand(1).getReg(); 1916 Register Src1Reg = MI.getOperand(2).getReg(); 1917 auto Flags = MI.getFlags(); 1918 LLT Ty = MRI.getType(DstReg); 1919 1920 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1921 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1922 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1923 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1924 MI.eraseFromParent(); 1925 return true; 1926 } 1927 1928 static MachineInstrBuilder extractF64Exponent(Register Hi, 1929 MachineIRBuilder &B) { 1930 const unsigned FractBits = 52; 1931 const unsigned ExpBits = 11; 1932 LLT S32 = LLT::scalar(32); 1933 1934 auto Const0 = B.buildConstant(S32, FractBits - 32); 1935 auto Const1 = B.buildConstant(S32, ExpBits); 1936 1937 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1938 .addUse(Hi) 1939 .addUse(Const0.getReg(0)) 1940 .addUse(Const1.getReg(0)); 1941 1942 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1943 } 1944 1945 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1946 MachineInstr &MI, MachineRegisterInfo &MRI, 1947 MachineIRBuilder &B) const { 1948 const LLT S1 = LLT::scalar(1); 1949 const LLT S32 = LLT::scalar(32); 1950 const LLT S64 = LLT::scalar(64); 1951 1952 Register Src = MI.getOperand(1).getReg(); 1953 assert(MRI.getType(Src) == S64); 1954 1955 // TODO: Should this use extract since the low half is unused? 1956 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1957 Register Hi = Unmerge.getReg(1); 1958 1959 // Extract the upper half, since this is where we will find the sign and 1960 // exponent. 1961 auto Exp = extractF64Exponent(Hi, B); 1962 1963 const unsigned FractBits = 52; 1964 1965 // Extract the sign bit. 1966 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1967 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1968 1969 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1970 1971 const auto Zero32 = B.buildConstant(S32, 0); 1972 1973 // Extend back to 64-bits. 
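  // The merge places Zero32 in the low half and SignBit in the high half, so
  // for a negative input this is 0x8000'0000'0000'0000 (illustrative value):
  // the signed zero selected below when the exponent is negative.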
1974 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1975 1976 auto Shr = B.buildAShr(S64, FractMask, Exp); 1977 auto Not = B.buildNot(S64, Shr); 1978 auto Tmp0 = B.buildAnd(S64, Src, Not); 1979 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1980 1981 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1982 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1983 1984 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1985 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1986 MI.eraseFromParent(); 1987 return true; 1988 } 1989 1990 bool AMDGPULegalizerInfo::legalizeITOFP( 1991 MachineInstr &MI, MachineRegisterInfo &MRI, 1992 MachineIRBuilder &B, bool Signed) const { 1993 1994 Register Dst = MI.getOperand(0).getReg(); 1995 Register Src = MI.getOperand(1).getReg(); 1996 1997 const LLT S64 = LLT::scalar(64); 1998 const LLT S32 = LLT::scalar(32); 1999 2000 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2001 2002 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2003 2004 auto CvtHi = Signed ? 2005 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2006 B.buildUITOFP(S64, Unmerge.getReg(1)); 2007 2008 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2009 2010 auto ThirtyTwo = B.buildConstant(S32, 32); 2011 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2012 .addUse(CvtHi.getReg(0)) 2013 .addUse(ThirtyTwo.getReg(0)); 2014 2015 // TODO: Should this propagate fast-math-flags? 2016 B.buildFAdd(Dst, LdExp, CvtLo); 2017 MI.eraseFromParent(); 2018 return true; 2019 } 2020 2021 // TODO: Copied from DAG implementation. Verify logic and document how this 2022 // actually works. 2023 bool AMDGPULegalizerInfo::legalizeFPTOI( 2024 MachineInstr &MI, MachineRegisterInfo &MRI, 2025 MachineIRBuilder &B, bool Signed) const { 2026 2027 Register Dst = MI.getOperand(0).getReg(); 2028 Register Src = MI.getOperand(1).getReg(); 2029 2030 const LLT S64 = LLT::scalar(64); 2031 const LLT S32 = LLT::scalar(32); 2032 2033 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2034 2035 unsigned Flags = MI.getFlags(); 2036 2037 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2038 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2039 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2040 2041 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2042 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2043 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2044 2045 auto Hi = Signed ? 
2046 B.buildFPTOSI(S32, FloorMul) : 2047 B.buildFPTOUI(S32, FloorMul); 2048 auto Lo = B.buildFPTOUI(S32, Fma); 2049 2050 B.buildMerge(Dst, { Lo, Hi }); 2051 MI.eraseFromParent(); 2052 2053 return true; 2054 } 2055 2056 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2057 MachineInstr &MI) const { 2058 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2059 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2060 2061 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2062 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2063 2064 // With ieee_mode disabled, the instructions have the correct behavior 2065 // already for G_FMINNUM/G_FMAXNUM 2066 if (!MFI->getMode().IEEE) 2067 return !IsIEEEOp; 2068 2069 if (IsIEEEOp) 2070 return true; 2071 2072 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2073 } 2074 2075 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2076 MachineInstr &MI, MachineRegisterInfo &MRI, 2077 MachineIRBuilder &B) const { 2078 // TODO: Should move some of this into LegalizerHelper. 2079 2080 // TODO: Promote dynamic indexing of s16 to s32 2081 2082 // FIXME: Artifact combiner probably should have replaced the truncated 2083 // constant before this, so we shouldn't need 2084 // getConstantVRegValWithLookThrough. 2085 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2086 MI.getOperand(2).getReg(), MRI); 2087 if (!IdxVal) // Dynamic case will be selected to register indexing. 2088 return true; 2089 2090 Register Dst = MI.getOperand(0).getReg(); 2091 Register Vec = MI.getOperand(1).getReg(); 2092 2093 LLT VecTy = MRI.getType(Vec); 2094 LLT EltTy = VecTy.getElementType(); 2095 assert(EltTy == MRI.getType(Dst)); 2096 2097 if (IdxVal->Value < VecTy.getNumElements()) 2098 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2099 else 2100 B.buildUndef(Dst); 2101 2102 MI.eraseFromParent(); 2103 return true; 2104 } 2105 2106 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2107 MachineInstr &MI, MachineRegisterInfo &MRI, 2108 MachineIRBuilder &B) const { 2109 // TODO: Should move some of this into LegalizerHelper. 2110 2111 // TODO: Promote dynamic indexing of s16 to s32 2112 2113 // FIXME: Artifact combiner probably should have replaced the truncated 2114 // constant before this, so we shouldn't need 2115 // getConstantVRegValWithLookThrough. 2116 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2117 MI.getOperand(3).getReg(), MRI); 2118 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2119 return true; 2120 2121 Register Dst = MI.getOperand(0).getReg(); 2122 Register Vec = MI.getOperand(1).getReg(); 2123 Register Ins = MI.getOperand(2).getReg(); 2124 2125 LLT VecTy = MRI.getType(Vec); 2126 LLT EltTy = VecTy.getElementType(); 2127 assert(EltTy == MRI.getType(Ins)); 2128 2129 if (IdxVal->Value < VecTy.getNumElements()) 2130 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2131 else 2132 B.buildUndef(Dst); 2133 2134 MI.eraseFromParent(); 2135 return true; 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2139 MachineInstr &MI, MachineRegisterInfo &MRI, 2140 MachineIRBuilder &B) const { 2141 const LLT V2S16 = LLT::vector(2, 16); 2142 2143 Register Dst = MI.getOperand(0).getReg(); 2144 Register Src0 = MI.getOperand(1).getReg(); 2145 LLT DstTy = MRI.getType(Dst); 2146 LLT SrcTy = MRI.getType(Src0); 2147 2148 if (SrcTy == V2S16 && DstTy == V2S16 && 2149 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2150 return true; 2151 2152 MachineIRBuilder HelperBuilder(MI); 2153 GISelObserverWrapper DummyObserver; 2154 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2155 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeSinCos( 2159 MachineInstr &MI, MachineRegisterInfo &MRI, 2160 MachineIRBuilder &B) const { 2161 2162 Register DstReg = MI.getOperand(0).getReg(); 2163 Register SrcReg = MI.getOperand(1).getReg(); 2164 LLT Ty = MRI.getType(DstReg); 2165 unsigned Flags = MI.getFlags(); 2166 2167 Register TrigVal; 2168 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2169 if (ST.hasTrigReducedRange()) { 2170 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2171 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2172 .addUse(MulVal.getReg(0)) 2173 .setMIFlags(Flags).getReg(0); 2174 } else 2175 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2176 2177 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2178 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2179 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2180 .addUse(TrigVal) 2181 .setMIFlags(Flags); 2182 MI.eraseFromParent(); 2183 return true; 2184 } 2185 2186 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2187 MachineIRBuilder &B, 2188 const GlobalValue *GV, 2189 int64_t Offset, 2190 unsigned GAFlags) const { 2191 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2192 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2193 // to the following code sequence: 2194 // 2195 // For constant address space: 2196 // s_getpc_b64 s[0:1] 2197 // s_add_u32 s0, s0, $symbol 2198 // s_addc_u32 s1, s1, 0 2199 // 2200 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2201 // a fixup or relocation is emitted to replace $symbol with a literal 2202 // constant, which is a pc-relative offset from the encoding of the $symbol 2203 // operand to the global variable. 
2204 // 2205 // For global address space: 2206 // s_getpc_b64 s[0:1] 2207 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2208 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2209 // 2210 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2211 // fixups or relocations are emitted to replace $symbol@*@lo and 2212 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2213 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2214 // operand to the global variable. 2215 // 2216 // What we want here is an offset from the value returned by s_getpc 2217 // (which is the address of the s_add_u32 instruction) to the global 2218 // variable, but since the encoding of $symbol starts 4 bytes after the start 2219 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2220 // small. This requires us to add 4 to the global variable offset in order to 2221 // compute the correct address. 2222 2223 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2224 2225 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2226 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2227 2228 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2229 .addDef(PCReg); 2230 2231 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2232 if (GAFlags == SIInstrInfo::MO_NONE) 2233 MIB.addImm(0); 2234 else 2235 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2236 2237 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2238 2239 if (PtrTy.getSizeInBits() == 32) 2240 B.buildExtract(DstReg, PCReg, 0); 2241 return true; 2242 } 2243 2244 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2245 MachineInstr &MI, MachineRegisterInfo &MRI, 2246 MachineIRBuilder &B) const { 2247 Register DstReg = MI.getOperand(0).getReg(); 2248 LLT Ty = MRI.getType(DstReg); 2249 unsigned AS = Ty.getAddressSpace(); 2250 2251 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2252 MachineFunction &MF = B.getMF(); 2253 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2254 2255 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2256 if (!MFI->isEntryFunction()) { 2257 const Function &Fn = MF.getFunction(); 2258 DiagnosticInfoUnsupported BadLDSDecl( 2259 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2260 DS_Warning); 2261 Fn.getContext().diagnose(BadLDSDecl); 2262 2263 // We currently don't have a way to correctly allocate LDS objects that 2264 // aren't directly associated with a kernel. We do force inlining of 2265 // functions that use local objects. However, if these dead functions are 2266 // not eliminated, we don't want a compile time error. Just emit a warning 2267 // and a trap, since there should be no callable path here. 2268 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2269 B.buildUndef(DstReg); 2270 MI.eraseFromParent(); 2271 return true; 2272 } 2273 2274 // TODO: We could emit code to handle the initialization somewhere. 
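    // In the supported case the global simply becomes a constant byte offset
    // into the kernel's LDS block, e.g. (illustrative MIR):
    //   %dst:_(p3) = G_CONSTANT i32 <offset returned by allocateLDSGlobal>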
2275 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2276 const SITargetLowering *TLI = ST.getTargetLowering();
2277 if (!TLI->shouldUseLDSConstAddress(GV)) {
2278 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2279 return true; // Leave in place.
2280 }
2281
2282 B.buildConstant(
2283 DstReg,
2284 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2285 MI.eraseFromParent();
2286 return true;
2287 }
2288
2289 const Function &Fn = MF.getFunction();
2290 DiagnosticInfoUnsupported BadInit(
2291 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2292 Fn.getContext().diagnose(BadInit);
2293 return true;
2294 }
2295
2296 const SITargetLowering *TLI = ST.getTargetLowering();
2297
2298 if (TLI->shouldEmitFixup(GV)) {
2299 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2300 MI.eraseFromParent();
2301 return true;
2302 }
2303
2304 if (TLI->shouldEmitPCReloc(GV)) {
2305 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2306 MI.eraseFromParent();
2307 return true;
2308 }
2309
2310 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2311 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2312
2313 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2314 MachinePointerInfo::getGOT(MF),
2315 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2316 MachineMemOperand::MOInvariant,
2317 8 /*Size*/, Align(8));
2318
2319 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2320
2321 if (Ty.getSizeInBits() == 32) {
2322 // Truncate if this is a 32-bit constant address.
2323 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2324 B.buildExtract(DstReg, Load, 0);
2325 } else
2326 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2327
2328 MI.eraseFromParent();
2329 return true;
2330 }
2331
2332 static LLT widenToNextPowerOf2(LLT Ty) {
2333 if (Ty.isVector())
2334 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2335 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2336 }
2337
2338 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2339 MachineInstr &MI) const {
2340 MachineIRBuilder &B = Helper.MIRBuilder;
2341 MachineRegisterInfo &MRI = *B.getMRI();
2342 GISelChangeObserver &Observer = Helper.Observer;
2343
2344 Register PtrReg = MI.getOperand(1).getReg();
2345 LLT PtrTy = MRI.getType(PtrReg);
2346 unsigned AddrSpace = PtrTy.getAddressSpace();
2347
2348 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2349 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2350 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2351 Observer.changingInstr(MI);
2352 MI.getOperand(1).setReg(Cast.getReg(0));
2353 Observer.changedInstr(MI);
2354 return true;
2355 }
2356
2357 Register ValReg = MI.getOperand(0).getReg();
2358 LLT ValTy = MRI.getType(ValReg);
2359
2360 MachineMemOperand *MMO = *MI.memoperands_begin();
2361 const unsigned ValSize = ValTy.getSizeInBits();
2362 const unsigned MemSize = 8 * MMO->getSize();
2363 const Align MemAlign = MMO->getAlign();
2364 const unsigned AlignInBits = 8 * MemAlign.value();
2365
2366 // Widen non-power-of-2 loads to the alignment if needed
2367 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2368 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2369
2370 // This was already the correct extending load result type, so just adjust
2371 // the memory type.
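    // For example (illustrative): a <3 x s32> load has ValSize == MemSize == 96,
    // so WideMemSize becomes 128; the result is widened to <4 x s32> further
    // below and the original 96 bits are extracted. When the widened memory
    // size already matches the (extending) result size, only the MMO changes.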
2372 if (WideMemSize == ValSize) { 2373 MachineFunction &MF = B.getMF(); 2374 2375 MachineMemOperand *WideMMO = 2376 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2377 Observer.changingInstr(MI); 2378 MI.setMemRefs(MF, {WideMMO}); 2379 Observer.changedInstr(MI); 2380 return true; 2381 } 2382 2383 // Don't bother handling edge case that should probably never be produced. 2384 if (ValSize > WideMemSize) 2385 return false; 2386 2387 LLT WideTy = widenToNextPowerOf2(ValTy); 2388 2389 Register WideLoad; 2390 if (!WideTy.isVector()) { 2391 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2392 B.buildTrunc(ValReg, WideLoad).getReg(0); 2393 } else { 2394 // Extract the subvector. 2395 2396 if (isRegisterType(ValTy)) { 2397 // If this a case where G_EXTRACT is legal, use it. 2398 // (e.g. <3 x s32> -> <4 x s32>) 2399 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2400 B.buildExtract(ValReg, WideLoad, 0); 2401 } else { 2402 // For cases where the widened type isn't a nice register value, unmerge 2403 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2404 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2405 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2406 B.setInsertPt(B.getMBB(), MI.getIterator()); 2407 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2408 } 2409 } 2410 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 return false; 2416 } 2417 2418 bool AMDGPULegalizerInfo::legalizeFMad( 2419 MachineInstr &MI, MachineRegisterInfo &MRI, 2420 MachineIRBuilder &B) const { 2421 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2422 assert(Ty.isScalar()); 2423 2424 MachineFunction &MF = B.getMF(); 2425 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2426 2427 // TODO: Always legal with future ftz flag. 2428 // FIXME: Do we need just output? 
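  // If it is not kept legal here, lowerFMad() below expands the fused
  // multiply-add into separate operations, roughly (illustrative MIR):
  //   %t:_(sN) = G_FMUL %a, %b
  //   %d:_(sN) = G_FADD %t, %c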
2429 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2430 return true; 2431 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2432 return true; 2433 2434 MachineIRBuilder HelperBuilder(MI); 2435 GISelObserverWrapper DummyObserver; 2436 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2437 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2438 } 2439 2440 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2441 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2442 Register DstReg = MI.getOperand(0).getReg(); 2443 Register PtrReg = MI.getOperand(1).getReg(); 2444 Register CmpVal = MI.getOperand(2).getReg(); 2445 Register NewVal = MI.getOperand(3).getReg(); 2446 2447 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2448 "this should not have been custom lowered"); 2449 2450 LLT ValTy = MRI.getType(CmpVal); 2451 LLT VecTy = LLT::vector(2, ValTy); 2452 2453 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2454 2455 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2456 .addDef(DstReg) 2457 .addUse(PtrReg) 2458 .addUse(PackedVal) 2459 .setMemRefs(MI.memoperands()); 2460 2461 MI.eraseFromParent(); 2462 return true; 2463 } 2464 2465 bool AMDGPULegalizerInfo::legalizeFlog( 2466 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2467 Register Dst = MI.getOperand(0).getReg(); 2468 Register Src = MI.getOperand(1).getReg(); 2469 LLT Ty = B.getMRI()->getType(Dst); 2470 unsigned Flags = MI.getFlags(); 2471 2472 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2473 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2474 2475 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2476 MI.eraseFromParent(); 2477 return true; 2478 } 2479 2480 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2481 MachineIRBuilder &B) const { 2482 Register Dst = MI.getOperand(0).getReg(); 2483 Register Src = MI.getOperand(1).getReg(); 2484 unsigned Flags = MI.getFlags(); 2485 LLT Ty = B.getMRI()->getType(Dst); 2486 2487 auto K = B.buildFConstant(Ty, numbers::log2e); 2488 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2489 B.buildFExp2(Dst, Mul, Flags); 2490 MI.eraseFromParent(); 2491 return true; 2492 } 2493 2494 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2495 MachineIRBuilder &B) const { 2496 Register Dst = MI.getOperand(0).getReg(); 2497 Register Src0 = MI.getOperand(1).getReg(); 2498 Register Src1 = MI.getOperand(2).getReg(); 2499 unsigned Flags = MI.getFlags(); 2500 LLT Ty = B.getMRI()->getType(Dst); 2501 const LLT S16 = LLT::scalar(16); 2502 const LLT S32 = LLT::scalar(32); 2503 2504 if (Ty == S32) { 2505 auto Log = B.buildFLog2(S32, Src0, Flags); 2506 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2507 .addUse(Log.getReg(0)) 2508 .addUse(Src1) 2509 .setMIFlags(Flags); 2510 B.buildFExp2(Dst, Mul, Flags); 2511 } else if (Ty == S16) { 2512 // There's no f16 fmul_legacy, so we need to convert for it. 
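    // The f16 path still computes pow(x, y) as exp2(y * log2(x)); only the
    // multiply is done in f32 via fmul_legacy and truncated back to f16,
    // roughly: log2(x) -> fpext -> fmul_legacy -> fptrunc -> exp2.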
2513 auto Log = B.buildFLog2(S16, Src0, Flags); 2514 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2515 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2516 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2517 .addUse(Ext0.getReg(0)) 2518 .addUse(Ext1.getReg(0)) 2519 .setMIFlags(Flags); 2520 2521 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2522 } else 2523 return false; 2524 2525 MI.eraseFromParent(); 2526 return true; 2527 } 2528 2529 // Find a source register, ignoring any possible source modifiers. 2530 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2531 Register ModSrc = OrigSrc; 2532 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2533 ModSrc = SrcFNeg->getOperand(1).getReg(); 2534 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2535 ModSrc = SrcFAbs->getOperand(1).getReg(); 2536 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2537 ModSrc = SrcFAbs->getOperand(1).getReg(); 2538 return ModSrc; 2539 } 2540 2541 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2542 MachineRegisterInfo &MRI, 2543 MachineIRBuilder &B) const { 2544 2545 const LLT S1 = LLT::scalar(1); 2546 const LLT S64 = LLT::scalar(64); 2547 Register Dst = MI.getOperand(0).getReg(); 2548 Register OrigSrc = MI.getOperand(1).getReg(); 2549 unsigned Flags = MI.getFlags(); 2550 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2551 "this should not have been custom lowered"); 2552 2553 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2554 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2555 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2556 // V_FRACT bug is: 2557 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2558 // 2559 // Convert floor(x) to (x - fract(x)) 2560 2561 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2562 .addUse(OrigSrc) 2563 .setMIFlags(Flags); 2564 2565 // Give source modifier matching some assistance before obscuring a foldable 2566 // pattern. 2567 2568 // TODO: We can avoid the neg on the fract? The input sign to fract 2569 // shouldn't matter? 2570 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2571 2572 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2573 2574 Register Min = MRI.createGenericVirtualRegister(S64); 2575 2576 // We don't need to concern ourselves with the snan handling difference, so 2577 // use the one which will directly select. 2578 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2579 if (MFI->getMode().IEEE) 2580 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2581 else 2582 B.buildFMinNum(Min, Fract, Const, Flags); 2583 2584 Register CorrectedFract = Min; 2585 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2586 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2587 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2588 } 2589 2590 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2591 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2592 2593 MI.eraseFromParent(); 2594 return true; 2595 } 2596 2597 // Turn an illegal packed v2s16 build vector into bit operations. 2598 // TODO: This should probably be a bitcast action in LegalizerHelper. 
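// Illustrative MIR for the expansion performed below:
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)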
2599 bool AMDGPULegalizerInfo::legalizeBuildVector( 2600 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2601 Register Dst = MI.getOperand(0).getReg(); 2602 const LLT S32 = LLT::scalar(32); 2603 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2604 2605 Register Src0 = MI.getOperand(1).getReg(); 2606 Register Src1 = MI.getOperand(2).getReg(); 2607 assert(MRI.getType(Src0) == LLT::scalar(16)); 2608 2609 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2610 B.buildBitcast(Dst, Merge); 2611 2612 MI.eraseFromParent(); 2613 return true; 2614 } 2615 2616 // Return the use branch instruction, otherwise null if the usage is invalid. 2617 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2618 MachineRegisterInfo &MRI, 2619 MachineInstr *&Br, 2620 MachineBasicBlock *&UncondBrTarget) { 2621 Register CondDef = MI.getOperand(0).getReg(); 2622 if (!MRI.hasOneNonDBGUse(CondDef)) 2623 return nullptr; 2624 2625 MachineBasicBlock *Parent = MI.getParent(); 2626 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2627 if (UseMI.getParent() != Parent || 2628 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2629 return nullptr; 2630 2631 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2632 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2633 if (Next == Parent->end()) { 2634 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2635 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2636 return nullptr; 2637 UncondBrTarget = &*NextMBB; 2638 } else { 2639 if (Next->getOpcode() != AMDGPU::G_BR) 2640 return nullptr; 2641 Br = &*Next; 2642 UncondBrTarget = Br->getOperand(0).getMBB(); 2643 } 2644 2645 return &UseMI; 2646 } 2647 2648 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2649 const ArgDescriptor *Arg, 2650 const TargetRegisterClass *ArgRC, 2651 LLT ArgTy) const { 2652 MCRegister SrcReg = Arg->getRegister(); 2653 assert(SrcReg.isPhysical() && "Physical register expected"); 2654 assert(DstReg.isVirtual() && "Virtual register expected"); 2655 2656 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2657 ArgTy); 2658 if (Arg->isMasked()) { 2659 // TODO: Should we try to emit this once in the entry block? 
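    // For example (illustrative values): an ID packed at bits [10..19] has
    // Mask = 0xffc00, so Shift = 10 and the code below emits
    //   %s:_(s32) = G_LSHR %livein, 10
    //   %id:_(s32) = G_AND %s, 0x3ff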
2660 const LLT S32 = LLT::scalar(32); 2661 const unsigned Mask = Arg->getMask(); 2662 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2663 2664 Register AndMaskSrc = LiveIn; 2665 2666 if (Shift != 0) { 2667 auto ShiftAmt = B.buildConstant(S32, Shift); 2668 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2669 } 2670 2671 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2672 } else { 2673 B.buildCopy(DstReg, LiveIn); 2674 } 2675 2676 return true; 2677 } 2678 2679 bool AMDGPULegalizerInfo::loadInputValue( 2680 Register DstReg, MachineIRBuilder &B, 2681 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2682 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2683 const ArgDescriptor *Arg; 2684 const TargetRegisterClass *ArgRC; 2685 LLT ArgTy; 2686 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2687 2688 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2689 return false; // TODO: Handle these 2690 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2691 } 2692 2693 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2694 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2695 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2696 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2697 return false; 2698 2699 MI.eraseFromParent(); 2700 return true; 2701 } 2702 2703 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2704 MachineRegisterInfo &MRI, 2705 MachineIRBuilder &B) const { 2706 Register Dst = MI.getOperand(0).getReg(); 2707 LLT DstTy = MRI.getType(Dst); 2708 LLT S16 = LLT::scalar(16); 2709 LLT S32 = LLT::scalar(32); 2710 LLT S64 = LLT::scalar(64); 2711 2712 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2713 return true; 2714 2715 if (DstTy == S16) 2716 return legalizeFDIV16(MI, MRI, B); 2717 if (DstTy == S32) 2718 return legalizeFDIV32(MI, MRI, B); 2719 if (DstTy == S64) 2720 return legalizeFDIV64(MI, MRI, B); 2721 2722 return false; 2723 } 2724 2725 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2726 Register DstReg, 2727 Register X, 2728 Register Y, 2729 bool IsDiv) const { 2730 const LLT S1 = LLT::scalar(1); 2731 const LLT S32 = LLT::scalar(32); 2732 2733 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2734 // algorithm used here. 2735 2736 // Initial estimate of inv(y). 2737 auto FloatY = B.buildUITOFP(S32, Y); 2738 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2739 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2740 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2741 auto Z = B.buildFPTOUI(S32, ScaledY); 2742 2743 // One round of UNR. 2744 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2745 auto NegYZ = B.buildMul(S32, NegY, Z); 2746 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2747 2748 // Quotient/remainder estimate. 2749 auto Q = B.buildUMulH(S32, X, Z); 2750 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2751 2752 // First quotient/remainder refinement. 2753 auto One = B.buildConstant(S32, 1); 2754 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2755 if (IsDiv) 2756 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2757 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2758 2759 // Second quotient/remainder refinement. 
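  // After the single UNR (Newton-Raphson) round above, the quotient estimate
  // is assumed to be short by at most 2, which is why exactly two conditional
  // corrections suffice (see AMDGPUCodeGenPrepare::expandDivRem32; not
  // re-derived here).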
2760 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2761 if (IsDiv) 2762 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2763 else 2764 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2765 } 2766 2767 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2768 MachineRegisterInfo &MRI, 2769 MachineIRBuilder &B) const { 2770 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2771 Register DstReg = MI.getOperand(0).getReg(); 2772 Register Num = MI.getOperand(1).getReg(); 2773 Register Den = MI.getOperand(2).getReg(); 2774 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2775 MI.eraseFromParent(); 2776 return true; 2777 } 2778 2779 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2780 // 2781 // Return lo, hi of result 2782 // 2783 // %cvt.lo = G_UITOFP Val.lo 2784 // %cvt.hi = G_UITOFP Val.hi 2785 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2786 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2787 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2788 // %mul2 = G_FMUL %mul1, 2**(-32) 2789 // %trunc = G_INTRINSIC_TRUNC %mul2 2790 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2791 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2792 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2793 Register Val) { 2794 const LLT S32 = LLT::scalar(32); 2795 auto Unmerge = B.buildUnmerge(S32, Val); 2796 2797 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2798 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2799 2800 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2801 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2802 2803 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2804 auto Mul1 = 2805 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2806 2807 // 2**(-32) 2808 auto Mul2 = 2809 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2810 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2811 2812 // -(2**32) 2813 auto Mad2 = B.buildFMAD(S32, Trunc, 2814 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2815 2816 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2817 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2818 2819 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2820 } 2821 2822 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2823 Register DstReg, 2824 Register Numer, 2825 Register Denom, 2826 bool IsDiv) const { 2827 const LLT S32 = LLT::scalar(32); 2828 const LLT S64 = LLT::scalar(64); 2829 const LLT S1 = LLT::scalar(1); 2830 Register RcpLo, RcpHi; 2831 2832 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2833 2834 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2835 2836 auto Zero64 = B.buildConstant(S64, 0); 2837 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2838 2839 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2840 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2841 2842 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2843 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2844 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2845 2846 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2847 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2848 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2849 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2850 2851 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2852 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2853 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2854 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2855 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2856 2857 auto Zero32 = 
B.buildConstant(S32, 0); 2858 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2859 auto Add2_HiC = 2860 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2861 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2862 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2863 2864 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2865 Register NumerLo = UnmergeNumer.getReg(0); 2866 Register NumerHi = UnmergeNumer.getReg(1); 2867 2868 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2869 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2870 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2871 Register Mul3_Lo = UnmergeMul3.getReg(0); 2872 Register Mul3_Hi = UnmergeMul3.getReg(1); 2873 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2874 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2875 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2876 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2877 2878 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2879 Register DenomLo = UnmergeDenom.getReg(0); 2880 Register DenomHi = UnmergeDenom.getReg(1); 2881 2882 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2883 auto C1 = B.buildSExt(S32, CmpHi); 2884 2885 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2886 auto C2 = B.buildSExt(S32, CmpLo); 2887 2888 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2889 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2890 2891 // TODO: Here and below portions of the code can be enclosed into if/endif. 2892 // Currently control flow is unconditional and we have 4 selects after 2893 // potential endif to substitute PHIs. 2894 2895 // if C3 != 0 ... 2896 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2897 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2898 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2899 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2900 2901 auto One64 = B.buildConstant(S64, 1); 2902 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2903 2904 auto C4 = 2905 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2906 auto C5 = 2907 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2908 auto C6 = B.buildSelect( 2909 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2910 2911 // if (C6 != 0) 2912 auto Add4 = B.buildAdd(S64, Add3, One64); 2913 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2914 2915 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2916 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2917 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2918 2919 // endif C6 2920 // endif C3 2921 2922 if (IsDiv) { 2923 auto Sel1 = B.buildSelect( 2924 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2925 B.buildSelect(DstReg, 2926 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2927 } else { 2928 auto Sel2 = B.buildSelect( 2929 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2930 B.buildSelect(DstReg, 2931 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2932 } 2933 } 2934 2935 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2936 MachineRegisterInfo &MRI, 2937 MachineIRBuilder &B) const { 2938 const LLT S64 = LLT::scalar(64); 2939 const LLT S32 = LLT::scalar(32); 2940 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2941 Register DstReg = MI.getOperand(0).getReg(); 2942 Register Num 
= MI.getOperand(1).getReg(); 2943 Register Den = MI.getOperand(2).getReg(); 2944 LLT Ty = MRI.getType(DstReg); 2945 2946 if (Ty == S32) 2947 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2948 else if (Ty == S64) 2949 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2950 else 2951 return false; 2952 2953 MI.eraseFromParent(); 2954 return true; 2955 2956 } 2957 2958 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2959 MachineRegisterInfo &MRI, 2960 MachineIRBuilder &B) const { 2961 const LLT S64 = LLT::scalar(64); 2962 const LLT S32 = LLT::scalar(32); 2963 2964 Register DstReg = MI.getOperand(0).getReg(); 2965 const LLT Ty = MRI.getType(DstReg); 2966 if (Ty != S32 && Ty != S64) 2967 return false; 2968 2969 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2970 2971 Register LHS = MI.getOperand(1).getReg(); 2972 Register RHS = MI.getOperand(2).getReg(); 2973 2974 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2975 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2976 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2977 2978 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2979 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2980 2981 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2982 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2983 2984 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2985 if (Ty == S32) 2986 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2987 else 2988 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2989 2990 Register Sign; 2991 if (IsDiv) 2992 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2993 else 2994 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2995 2996 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2997 B.buildSub(DstReg, UDivRem, Sign); 2998 2999 MI.eraseFromParent(); 3000 return true; 3001 } 3002 3003 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3004 MachineRegisterInfo &MRI, 3005 MachineIRBuilder &B) const { 3006 Register Res = MI.getOperand(0).getReg(); 3007 Register LHS = MI.getOperand(1).getReg(); 3008 Register RHS = MI.getOperand(2).getReg(); 3009 3010 uint16_t Flags = MI.getFlags(); 3011 3012 LLT ResTy = MRI.getType(Res); 3013 LLT S32 = LLT::scalar(32); 3014 LLT S64 = LLT::scalar(64); 3015 3016 const MachineFunction &MF = B.getMF(); 3017 bool Unsafe = 3018 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 3019 3020 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 3021 return false; 3022 3023 if (!Unsafe && ResTy == S32 && 3024 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 3025 return false; 3026 3027 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3028 // 1 / x -> RCP(x) 3029 if (CLHS->isExactlyValue(1.0)) { 3030 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3031 .addUse(RHS) 3032 .setMIFlags(Flags); 3033 3034 MI.eraseFromParent(); 3035 return true; 3036 } 3037 3038 // -1 / x -> RCP( FNEG(x) ) 3039 if (CLHS->isExactlyValue(-1.0)) { 3040 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3041 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3042 .addUse(FNeg.getReg(0)) 3043 .setMIFlags(Flags); 3044 3045 MI.eraseFromParent(); 3046 return true; 3047 } 3048 } 3049 3050 // x / y -> x * (1.0 / y) 3051 if (Unsafe) { 3052 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3053 .addUse(RHS) 3054 .setMIFlags(Flags); 3055 B.buildFMul(Res, LHS, RCP, Flags); 3056 3057 MI.eraseFromParent(); 3058 return true; 3059 } 3060 3061 return false; 3062 } 3063 3064 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3065 MachineRegisterInfo &MRI, 3066 MachineIRBuilder &B) const { 3067 Register Res = MI.getOperand(0).getReg(); 3068 Register LHS = MI.getOperand(1).getReg(); 3069 Register RHS = MI.getOperand(2).getReg(); 3070 3071 uint16_t Flags = MI.getFlags(); 3072 3073 LLT S16 = LLT::scalar(16); 3074 LLT S32 = LLT::scalar(32); 3075 3076 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3077 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3078 3079 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3080 .addUse(RHSExt.getReg(0)) 3081 .setMIFlags(Flags); 3082 3083 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3084 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3085 3086 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3087 .addUse(RDst.getReg(0)) 3088 .addUse(RHS) 3089 .addUse(LHS) 3090 .setMIFlags(Flags); 3091 3092 MI.eraseFromParent(); 3093 return true; 3094 } 3095 3096 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3097 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3098 static void toggleSPDenormMode(bool Enable, 3099 MachineIRBuilder &B, 3100 const GCNSubtarget &ST, 3101 AMDGPU::SIModeRegisterDefaults Mode) { 3102 // Set SP denorm mode to this value. 3103 unsigned SPDenormMode = 3104 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3105 3106 if (ST.hasDenormModeInst()) { 3107 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3108 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3109 3110 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3111 B.buildInstr(AMDGPU::S_DENORM_MODE) 3112 .addImm(NewDenormModeValue); 3113 3114 } else { 3115 // Select FP32 bit field in mode register. 3116 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3117 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3118 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3119 3120 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3121 .addImm(SPDenormMode) 3122 .addImm(SPDenormModeBitField); 3123 } 3124 } 3125 3126 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3127 MachineRegisterInfo &MRI, 3128 MachineIRBuilder &B) const { 3129 Register Res = MI.getOperand(0).getReg(); 3130 Register LHS = MI.getOperand(1).getReg(); 3131 Register RHS = MI.getOperand(2).getReg(); 3132 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3133 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3134 3135 uint16_t Flags = MI.getFlags(); 3136 3137 LLT S32 = LLT::scalar(32); 3138 LLT S1 = LLT::scalar(1); 3139 3140 auto One = B.buildFConstant(S32, 1.0f); 3141 3142 auto DenominatorScaled = 3143 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3144 .addUse(LHS) 3145 .addUse(RHS) 3146 .addImm(0) 3147 .setMIFlags(Flags); 3148 auto NumeratorScaled = 3149 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3150 .addUse(LHS) 3151 .addUse(RHS) 3152 .addImm(1) 3153 .setMIFlags(Flags); 3154 3155 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3156 .addUse(DenominatorScaled.getReg(0)) 3157 .setMIFlags(Flags); 3158 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3159 3160 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3161 // aren't modeled as reading it. 
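  // Sketch of what the toggles around the FMA chain expand to (see
  // toggleSPDenormMode above): a single s_denorm_mode <imm> on subtargets with
  // that instruction, otherwise an s_setreg of the FP32 denorm bits of the
  // MODE register. Denormals are enabled here and restored after Fma4.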
3162 if (!Mode.allFP32Denormals()) 3163 toggleSPDenormMode(true, B, ST, Mode); 3164 3165 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3166 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3167 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3168 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3169 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3170 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3171 3172 if (!Mode.allFP32Denormals()) 3173 toggleSPDenormMode(false, B, ST, Mode); 3174 3175 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3176 .addUse(Fma4.getReg(0)) 3177 .addUse(Fma1.getReg(0)) 3178 .addUse(Fma3.getReg(0)) 3179 .addUse(NumeratorScaled.getReg(1)) 3180 .setMIFlags(Flags); 3181 3182 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3183 .addUse(Fmas.getReg(0)) 3184 .addUse(RHS) 3185 .addUse(LHS) 3186 .setMIFlags(Flags); 3187 3188 MI.eraseFromParent(); 3189 return true; 3190 } 3191 3192 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3193 MachineRegisterInfo &MRI, 3194 MachineIRBuilder &B) const { 3195 Register Res = MI.getOperand(0).getReg(); 3196 Register LHS = MI.getOperand(1).getReg(); 3197 Register RHS = MI.getOperand(2).getReg(); 3198 3199 uint16_t Flags = MI.getFlags(); 3200 3201 LLT S64 = LLT::scalar(64); 3202 LLT S1 = LLT::scalar(1); 3203 3204 auto One = B.buildFConstant(S64, 1.0); 3205 3206 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3207 .addUse(LHS) 3208 .addUse(RHS) 3209 .addImm(0) 3210 .setMIFlags(Flags); 3211 3212 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3213 3214 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3215 .addUse(DivScale0.getReg(0)) 3216 .setMIFlags(Flags); 3217 3218 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3219 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3220 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3221 3222 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3223 .addUse(LHS) 3224 .addUse(RHS) 3225 .addImm(1) 3226 .setMIFlags(Flags); 3227 3228 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3229 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3230 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3231 3232 Register Scale; 3233 if (!ST.hasUsableDivScaleConditionOutput()) { 3234 // Workaround a hardware bug on SI where the condition output from div_scale 3235 // is not usable. 
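    // Instead, reconstruct a usable condition for div_fmas below: compare the
    // high 32 bits of the operands with the high halves of the two div_scale
    // results and XOR the comparisons.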
3236 3237 LLT S32 = LLT::scalar(32); 3238 3239 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3240 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3241 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3242 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3243 3244 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3245 Scale1Unmerge.getReg(1)); 3246 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3247 Scale0Unmerge.getReg(1)); 3248 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3249 } else { 3250 Scale = DivScale1.getReg(1); 3251 } 3252 3253 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3254 .addUse(Fma4.getReg(0)) 3255 .addUse(Fma3.getReg(0)) 3256 .addUse(Mul.getReg(0)) 3257 .addUse(Scale) 3258 .setMIFlags(Flags); 3259 3260 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3261 .addUse(Fmas.getReg(0)) 3262 .addUse(RHS) 3263 .addUse(LHS) 3264 .setMIFlags(Flags); 3265 3266 MI.eraseFromParent(); 3267 return true; 3268 } 3269 3270 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3271 MachineRegisterInfo &MRI, 3272 MachineIRBuilder &B) const { 3273 Register Res = MI.getOperand(0).getReg(); 3274 Register LHS = MI.getOperand(2).getReg(); 3275 Register RHS = MI.getOperand(3).getReg(); 3276 uint16_t Flags = MI.getFlags(); 3277 3278 LLT S32 = LLT::scalar(32); 3279 LLT S1 = LLT::scalar(1); 3280 3281 auto Abs = B.buildFAbs(S32, RHS, Flags); 3282 const APFloat C0Val(1.0f); 3283 3284 auto C0 = B.buildConstant(S32, 0x6f800000); 3285 auto C1 = B.buildConstant(S32, 0x2f800000); 3286 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3287 3288 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3289 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3290 3291 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3292 3293 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3294 .addUse(Mul0.getReg(0)) 3295 .setMIFlags(Flags); 3296 3297 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3298 3299 B.buildFMul(Res, Sel, Mul1, Flags); 3300 3301 MI.eraseFromParent(); 3302 return true; 3303 } 3304 3305 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3306 // FIXME: Why do we handle this one but not other removed instructions? 3307 // 3308 // Reciprocal square root. The clamp prevents infinite results, clamping 3309 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3310 // +-max_float. 3311 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3312 MachineRegisterInfo &MRI, 3313 MachineIRBuilder &B) const { 3314 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3315 return true; 3316 3317 Register Dst = MI.getOperand(0).getReg(); 3318 Register Src = MI.getOperand(2).getReg(); 3319 auto Flags = MI.getFlags(); 3320 3321 LLT Ty = MRI.getType(Dst); 3322 3323 const fltSemantics *FltSemantics; 3324 if (Ty == LLT::scalar(32)) 3325 FltSemantics = &APFloat::IEEEsingle(); 3326 else if (Ty == LLT::scalar(64)) 3327 FltSemantics = &APFloat::IEEEdouble(); 3328 else 3329 return false; 3330 3331 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3332 .addUse(Src) 3333 .setMIFlags(Flags); 3334 3335 // We don't need to concern ourselves with the snan handling difference, since 3336 // the rsq quieted (or not) so use the one which will directly select. 
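  // The rest of the expansion is effectively (illustrative):
  //   Dst = fmax(fmin(rsq(Src), +max_float), -max_float)
  // using the IEEE or non-IEEE min/max forms to match the function's FP mode.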
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
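//
// Illustrative example: a total constant offset of 5000 is split into an
// immoffset of 904 and a voffset add of 4096 (5000 & ~4095), so the voffset
// part stays a multiple of 4096 and stands a better chance of being CSEd with
// the voffset of another nearby buffer access.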
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

Register AMDGPULegalizerInfo::fixStoreSourceType(
  MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
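  // Sub-dword (s8/s16) store sources live in 32-bit VGPRs, so any-extend them
  // to s32 here; on unpacked-d16 subtargets, 16-bit format-store vectors are
  // additionally widened to one dword per element via handleD16VData above.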
3512 if (Ty == LLT::scalar(8) || Ty == S16) { 3513 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3514 return AnyExt; 3515 } 3516 3517 if (Ty.isVector()) { 3518 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3519 if (IsFormat) 3520 return handleD16VData(B, *MRI, VData); 3521 } 3522 } 3523 3524 return VData; 3525 } 3526 3527 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3528 MachineRegisterInfo &MRI, 3529 MachineIRBuilder &B, 3530 bool IsTyped, 3531 bool IsFormat) const { 3532 Register VData = MI.getOperand(1).getReg(); 3533 LLT Ty = MRI.getType(VData); 3534 LLT EltTy = Ty.getScalarType(); 3535 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3536 const LLT S32 = LLT::scalar(32); 3537 3538 VData = fixStoreSourceType(B, VData, IsFormat); 3539 Register RSrc = MI.getOperand(2).getReg(); 3540 3541 MachineMemOperand *MMO = *MI.memoperands_begin(); 3542 const int MemSize = MMO->getSize(); 3543 3544 unsigned ImmOffset; 3545 unsigned TotalOffset; 3546 3547 // The typed intrinsics add an immediate after the registers. 3548 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3549 3550 // The struct intrinsic variants add one additional operand over raw. 3551 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3552 Register VIndex; 3553 int OpOffset = 0; 3554 if (HasVIndex) { 3555 VIndex = MI.getOperand(3).getReg(); 3556 OpOffset = 1; 3557 } 3558 3559 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3560 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3561 3562 unsigned Format = 0; 3563 if (IsTyped) { 3564 Format = MI.getOperand(5 + OpOffset).getImm(); 3565 ++OpOffset; 3566 } 3567 3568 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3569 3570 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3571 if (TotalOffset != 0) 3572 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3573 3574 unsigned Opc; 3575 if (IsTyped) { 3576 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3577 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3578 } else if (IsFormat) { 3579 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3580 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3581 } else { 3582 switch (MemSize) { 3583 case 1: 3584 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3585 break; 3586 case 2: 3587 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3588 break; 3589 default: 3590 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3591 break; 3592 } 3593 } 3594 3595 if (!VIndex) 3596 VIndex = B.buildConstant(S32, 0).getReg(0); 3597 3598 auto MIB = B.buildInstr(Opc) 3599 .addUse(VData) // vdata 3600 .addUse(RSrc) // rsrc 3601 .addUse(VIndex) // vindex 3602 .addUse(VOffset) // voffset 3603 .addUse(SOffset) // soffset 3604 .addImm(ImmOffset); // offset(imm) 3605 3606 if (IsTyped) 3607 MIB.addImm(Format); 3608 3609 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3610 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3611 .addMemOperand(MMO); 3612 3613 MI.eraseFromParent(); 3614 return true; 3615 } 3616 3617 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3618 MachineRegisterInfo &MRI, 3619 MachineIRBuilder &B, 3620 bool IsFormat, 3621 bool IsTyped) const { 3622 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
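  // The operand layout mirrors the buffer store case above: rsrc, an optional
  // vindex for the struct variants, voffset, soffset, an optional format
  // immediate for the typed variants, and the auxiliary cachepolicy bits.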
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The load result was widened; narrow it back down to the expected result
    // type.
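    // Extending loads are simply truncated back to the original scalar type;
    // unpacked d16 vector results are unmerged into s32 pieces, truncated to
    // s16, and re-merged into the requested vector type below.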
3720 if (IsExtLoad) 3721 B.buildTrunc(Dst, LoadDstReg); 3722 else { 3723 // Repack to original 16-bit vector result 3724 // FIXME: G_TRUNC should work, but legalization currently fails 3725 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3726 SmallVector<Register, 4> Repack; 3727 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3728 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3729 B.buildMerge(Dst, Repack); 3730 } 3731 } 3732 3733 MI.eraseFromParent(); 3734 return true; 3735 } 3736 3737 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3738 MachineIRBuilder &B, 3739 bool IsInc) const { 3740 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3741 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3742 B.buildInstr(Opc) 3743 .addDef(MI.getOperand(0).getReg()) 3744 .addUse(MI.getOperand(2).getReg()) 3745 .addUse(MI.getOperand(3).getReg()) 3746 .cloneMemRefs(MI); 3747 MI.eraseFromParent(); 3748 return true; 3749 } 3750 3751 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3752 switch (IntrID) { 3753 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3754 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3755 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3756 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3757 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3758 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3759 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3760 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3761 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3762 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3763 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3764 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3765 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3766 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3767 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3768 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3769 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3770 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3771 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3772 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3773 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3774 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3775 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3776 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3777 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3778 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3779 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3780 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3781 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3782 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3783 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3784 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3785 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3786 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3787 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3788 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3789 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3790 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3791 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3792 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3793 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3794 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3795 default: 3796 llvm_unreachable("unhandled atomic opcode"); 3797 } 3798 } 3799 3800 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3801 MachineIRBuilder &B, 3802 Intrinsic::ID IID) const { 3803 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3804 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3805 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3806 3807 Register Dst; 3808 3809 int OpOffset = 0; 3810 if (HasReturn) { 3811 // A few FP atomics do not support return values. 3812 Dst = MI.getOperand(0).getReg(); 3813 } else { 3814 OpOffset = -1; 3815 } 3816 3817 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3818 Register CmpVal; 3819 3820 if (IsCmpSwap) { 3821 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3822 ++OpOffset; 3823 } 3824 3825 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3826 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3827 3828 // The struct intrinsic variants add one additional operand over raw. 3829 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3830 Register VIndex; 3831 if (HasVIndex) { 3832 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3833 ++OpOffset; 3834 } 3835 3836 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3837 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3838 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3839 3840 MachineMemOperand *MMO = *MI.memoperands_begin(); 3841 3842 unsigned ImmOffset; 3843 unsigned TotalOffset; 3844 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3845 if (TotalOffset != 0) 3846 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3847 3848 if (!VIndex) 3849 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3850 3851 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3852 3853 if (HasReturn) 3854 MIB.addDef(Dst); 3855 3856 MIB.addUse(VData); // vdata 3857 3858 if (IsCmpSwap) 3859 MIB.addReg(CmpVal); 3860 3861 MIB.addUse(RSrc) // rsrc 3862 .addUse(VIndex) // vindex 3863 .addUse(VOffset) // voffset 3864 .addUse(SOffset) // soffset 3865 .addImm(ImmOffset) // offset(imm) 3866 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3867 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3868 .addMemOperand(MMO); 3869 3870 MI.eraseFromParent(); 3871 return true; 3872 } 3873 3874 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3875 /// vector with s16 typed elements. 3876 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3877 SmallVectorImpl<Register> &PackedAddrs, 3878 int AddrIdx, int DimIdx, int EndIdx, 3879 int NumGradients) { 3880 const LLT S16 = LLT::scalar(16); 3881 const LLT V2S16 = LLT::vector(2, 16); 3882 3883 for (int I = AddrIdx; I < EndIdx; ++I) { 3884 MachineOperand &SrcOp = MI.getOperand(I); 3885 if (!SrcOp.isReg()) 3886 continue; // _L to _LZ may have eliminated this. 3887 3888 Register AddrReg = SrcOp.getReg(); 3889 3890 if (I < DimIdx) { 3891 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3892 PackedAddrs.push_back(AddrReg); 3893 } else { 3894 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3895 // derivatives dx/dh and dx/dv are packed with undef. 
      if (((I + 1) >= EndIdx) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
            .getReg(0));
      } else {
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
            .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
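  //
  // Informally, in the MIMG terms used below: d16 refers to 16-bit data
  // elements, a16 to 16-bit address components, and g16 to 16-bit gradients;
  // whether these occupy one dword each ("unpacked") or two per dword
  // ("packed") depends on the subtarget.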
3976 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3977 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3978 3979 MachineRegisterInfo *MRI = B.getMRI(); 3980 const LLT S32 = LLT::scalar(32); 3981 const LLT S16 = LLT::scalar(16); 3982 const LLT V2S16 = LLT::vector(2, 16); 3983 3984 // Index of first address argument 3985 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3986 3987 int NumVAddrs, NumGradients; 3988 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3989 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3990 getDMaskIdx(BaseOpcode, NumDefs); 3991 unsigned DMask = 0; 3992 3993 // Check for 16 bit addresses and pack if true. 3994 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3995 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3996 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3997 const bool IsG16 = GradTy == S16; 3998 const bool IsA16 = AddrTy == S16; 3999 4000 int DMaskLanes = 0; 4001 if (!BaseOpcode->Atomic) { 4002 DMask = MI.getOperand(DMaskIdx).getImm(); 4003 if (BaseOpcode->Gather4) { 4004 DMaskLanes = 4; 4005 } else if (DMask != 0) { 4006 DMaskLanes = countPopulation(DMask); 4007 } else if (!IsTFE && !BaseOpcode->Store) { 4008 // If dmask is 0, this is a no-op load. This can be eliminated. 4009 B.buildUndef(MI.getOperand(0)); 4010 MI.eraseFromParent(); 4011 return true; 4012 } 4013 } 4014 4015 Observer.changingInstr(MI); 4016 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4017 4018 unsigned NewOpcode = NumDefs == 0 ? 4019 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4020 4021 // Track that we legalized this 4022 MI.setDesc(B.getTII().get(NewOpcode)); 4023 4024 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4025 // dmask to be at least 1 otherwise the instruction will fail 4026 if (IsTFE && DMask == 0) { 4027 DMask = 0x1; 4028 DMaskLanes = 1; 4029 MI.getOperand(DMaskIdx).setImm(DMask); 4030 } 4031 4032 if (BaseOpcode->Atomic) { 4033 Register VData0 = MI.getOperand(2).getReg(); 4034 LLT Ty = MRI->getType(VData0); 4035 4036 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4037 if (Ty.isVector()) 4038 return false; 4039 4040 if (BaseOpcode->AtomicX2) { 4041 Register VData1 = MI.getOperand(3).getReg(); 4042 // The two values are packed in one register. 4043 LLT PackedTy = LLT::vector(2, Ty); 4044 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4045 MI.getOperand(2).setReg(Concat.getReg(0)); 4046 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4047 } 4048 } 4049 4050 int CorrectedNumVAddrs = NumVAddrs; 4051 4052 // Optimize _L to _LZ when _L is zero 4053 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4054 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 4055 const ConstantFP *ConstantLod; 4056 const int LodIdx = AddrIdx + NumVAddrs - 1; 4057 4058 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 4059 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4060 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4061 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 4062 LZMappingInfo->LZ, ImageDimIntr->Dim); 4063 4064 // The starting indexes should remain in the same place. 
4065 --NumVAddrs; 4066 --CorrectedNumVAddrs; 4067 4068 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 4069 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 4070 MI.RemoveOperand(LodIdx); 4071 } 4072 } 4073 } 4074 4075 // Optimize _mip away, when 'lod' is zero 4076 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 4077 int64_t ConstantLod; 4078 const int LodIdx = AddrIdx + NumVAddrs - 1; 4079 4080 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 4081 if (ConstantLod == 0) { 4082 // TODO: Change intrinsic opcode and remove operand instead or replacing 4083 // it with 0, as the _L to _LZ handling is done above. 4084 MI.getOperand(LodIdx).ChangeToImmediate(0); 4085 --CorrectedNumVAddrs; 4086 } 4087 } 4088 } 4089 4090 // Rewrite the addressing register layout before doing anything else. 4091 if (IsA16 || IsG16) { 4092 if (IsA16) { 4093 // Target must support the feature and gradients need to be 16 bit too 4094 if (!ST.hasA16() || !IsG16) 4095 return false; 4096 } else if (!ST.hasG16()) 4097 return false; 4098 4099 if (NumVAddrs > 1) { 4100 SmallVector<Register, 4> PackedRegs; 4101 // Don't compress addresses for G16 4102 const int PackEndIdx = 4103 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4104 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4105 PackEndIdx, NumGradients); 4106 4107 if (!IsA16) { 4108 // Add uncompressed address 4109 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4110 int AddrReg = MI.getOperand(I).getReg(); 4111 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4112 PackedRegs.push_back(AddrReg); 4113 } 4114 } 4115 4116 // See also below in the non-a16 branch 4117 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4118 4119 if (!UseNSA && PackedRegs.size() > 1) { 4120 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4121 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4122 PackedRegs[0] = Concat.getReg(0); 4123 PackedRegs.resize(1); 4124 } 4125 4126 const int NumPacked = PackedRegs.size(); 4127 for (int I = 0; I != NumVAddrs; ++I) { 4128 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4129 if (!SrcOp.isReg()) { 4130 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4131 continue; 4132 } 4133 4134 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4135 4136 if (I < NumPacked) 4137 SrcOp.setReg(PackedRegs[I]); 4138 else 4139 SrcOp.setReg(AMDGPU::NoRegister); 4140 } 4141 } 4142 } else { 4143 // If the register allocator cannot place the address registers contiguously 4144 // without introducing moves, then using the non-sequential address encoding 4145 // is always preferable, since it saves VALU instructions and is usually a 4146 // wash in terms of code size or even better. 4147 // 4148 // However, we currently have no way of hinting to the register allocator 4149 // that MIMG addresses should be placed contiguously when it is possible to 4150 // do so, so force non-NSA for the common 2-address case as a heuristic. 4151 // 4152 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4153 // allocation when possible. 4154 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4155 4156 if (!UseNSA && NumVAddrs > 1) 4157 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4158 } 4159 4160 int Flags = 0; 4161 if (IsA16) 4162 Flags |= 1; 4163 if (IsG16) 4164 Flags |= 2; 4165 MI.addOperand(MachineOperand::CreateImm(Flags)); 4166 4167 if (BaseOpcode->Store) { // No TFE for stores? 
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4266 4267 if (ResultNumRegs == 1) { 4268 assert(!IsTFE); 4269 ResultRegs[0] = NewResultReg; 4270 } else { 4271 // We have to repack into a new vector of some kind. 4272 for (int I = 0; I != NumDataRegs; ++I) 4273 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4274 B.buildUnmerge(ResultRegs, NewResultReg); 4275 4276 // Drop the final TFE element to get the data part. The TFE result is 4277 // directly written to the right place already. 4278 if (IsTFE) 4279 ResultRegs.resize(NumDataRegs); 4280 } 4281 4282 // For an s16 scalar result, we form an s32 result with a truncate regardless 4283 // of packed vs. unpacked. 4284 if (IsD16 && !Ty.isVector()) { 4285 B.buildTrunc(DstReg, ResultRegs[0]); 4286 return true; 4287 } 4288 4289 // Avoid a build/concat_vector of 1 entry. 4290 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4291 B.buildBitcast(DstReg, ResultRegs[0]); 4292 return true; 4293 } 4294 4295 assert(Ty.isVector()); 4296 4297 if (IsD16) { 4298 // For packed D16 results with TFE enabled, all the data components are 4299 // S32. Cast back to the expected type. 4300 // 4301 // TODO: We don't really need to use load s32 elements. We would only need one 4302 // cast for the TFE result if a multiple of v2s16 was used. 4303 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4304 for (Register &Reg : ResultRegs) 4305 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4306 } else if (ST.hasUnpackedD16VMem()) { 4307 for (Register &Reg : ResultRegs) 4308 Reg = B.buildTrunc(S16, Reg).getReg(0); 4309 } 4310 } 4311 4312 auto padWithUndef = [&](LLT Ty, int NumElts) { 4313 if (NumElts == 0) 4314 return; 4315 Register Undef = B.buildUndef(Ty).getReg(0); 4316 for (int I = 0; I != NumElts; ++I) 4317 ResultRegs.push_back(Undef); 4318 }; 4319 4320 // Pad out any elements eliminated due to the dmask. 4321 LLT ResTy = MRI->getType(ResultRegs[0]); 4322 if (!ResTy.isVector()) { 4323 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4324 B.buildBuildVector(DstReg, ResultRegs); 4325 return true; 4326 } 4327 4328 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4329 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4330 4331 // Deal with the one annoying legal case. 4332 const LLT V3S16 = LLT::vector(3, 16); 4333 if (Ty == V3S16) { 4334 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4335 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4336 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4337 return true; 4338 } 4339 4340 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4341 B.buildConcatVectors(DstReg, ResultRegs); 4342 return true; 4343 } 4344 4345 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4346 LegalizerHelper &Helper, MachineInstr &MI) const { 4347 MachineIRBuilder &B = Helper.MIRBuilder; 4348 GISelChangeObserver &Observer = Helper.Observer; 4349 4350 Register Dst = MI.getOperand(0).getReg(); 4351 LLT Ty = B.getMRI()->getType(Dst); 4352 unsigned Size = Ty.getSizeInBits(); 4353 MachineFunction &MF = B.getMF(); 4354 4355 Observer.changingInstr(MI); 4356 4357 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4358 Ty = getBitcastRegisterType(Ty); 4359 Helper.bitcastDst(MI, Ty, 0); 4360 Dst = MI.getOperand(0).getReg(); 4361 B.setInsertPt(B.getMBB(), MI); 4362 } 4363 4364 // FIXME: We don't really need this intermediate instruction. The intrinsic 4365 // should be fixed to have a memory operand. Since it's readnone, we're not 4366 // allowed to add one. 
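  // Rewrite the intrinsic into the target S_BUFFER_LOAD pseudo and attach an
  // explicit invariant, dereferenceable MMO so downstream passes can reason
  // about the load.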
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    MachineRegisterInfo &MRI = *B.getMRI();

    Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
      return false;

    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
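  //
  // The amdgcn_if/else/loop intrinsics are expected to feed a G_BRCOND;
  // verifyCFIntrinsic locates that branch so it can be rewritten into
  // SI_IF/SI_ELSE/SI_LOOP with the branch targets swapped.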
4450 auto IntrID = MI.getIntrinsicID(); 4451 switch (IntrID) { 4452 case Intrinsic::amdgcn_if: 4453 case Intrinsic::amdgcn_else: { 4454 MachineInstr *Br = nullptr; 4455 MachineBasicBlock *UncondBrTarget = nullptr; 4456 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4457 const SIRegisterInfo *TRI 4458 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4459 4460 Register Def = MI.getOperand(1).getReg(); 4461 Register Use = MI.getOperand(3).getReg(); 4462 4463 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4464 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4465 if (IntrID == Intrinsic::amdgcn_if) { 4466 B.buildInstr(AMDGPU::SI_IF) 4467 .addDef(Def) 4468 .addUse(Use) 4469 .addMBB(UncondBrTarget); 4470 } else { 4471 B.buildInstr(AMDGPU::SI_ELSE) 4472 .addDef(Def) 4473 .addUse(Use) 4474 .addMBB(UncondBrTarget) 4475 .addImm(0); 4476 } 4477 4478 if (Br) { 4479 Br->getOperand(0).setMBB(CondBrTarget); 4480 } else { 4481 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4482 // since we're swapping branch targets it needs to be reinserted. 4483 // FIXME: IRTranslator should probably not do this 4484 B.buildBr(*CondBrTarget); 4485 } 4486 4487 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4488 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4489 MI.eraseFromParent(); 4490 BrCond->eraseFromParent(); 4491 return true; 4492 } 4493 4494 return false; 4495 } 4496 case Intrinsic::amdgcn_loop: { 4497 MachineInstr *Br = nullptr; 4498 MachineBasicBlock *UncondBrTarget = nullptr; 4499 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4500 const SIRegisterInfo *TRI 4501 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4502 4503 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4504 Register Reg = MI.getOperand(2).getReg(); 4505 4506 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4507 B.buildInstr(AMDGPU::SI_LOOP) 4508 .addUse(Reg) 4509 .addMBB(UncondBrTarget); 4510 4511 if (Br) 4512 Br->getOperand(0).setMBB(CondBrTarget); 4513 else 4514 B.buildBr(*CondBrTarget); 4515 4516 MI.eraseFromParent(); 4517 BrCond->eraseFromParent(); 4518 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4519 return true; 4520 } 4521 4522 return false; 4523 } 4524 case Intrinsic::amdgcn_kernarg_segment_ptr: 4525 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4526 // This only makes sense to call in a kernel, so just lower to null. 
4527 B.buildConstant(MI.getOperand(0).getReg(), 0); 4528 MI.eraseFromParent(); 4529 return true; 4530 } 4531 4532 return legalizePreloadedArgIntrin( 4533 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4534 case Intrinsic::amdgcn_implicitarg_ptr: 4535 return legalizeImplicitArgPtr(MI, MRI, B); 4536 case Intrinsic::amdgcn_workitem_id_x: 4537 return legalizePreloadedArgIntrin(MI, MRI, B, 4538 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4539 case Intrinsic::amdgcn_workitem_id_y: 4540 return legalizePreloadedArgIntrin(MI, MRI, B, 4541 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4542 case Intrinsic::amdgcn_workitem_id_z: 4543 return legalizePreloadedArgIntrin(MI, MRI, B, 4544 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4545 case Intrinsic::amdgcn_workgroup_id_x: 4546 return legalizePreloadedArgIntrin(MI, MRI, B, 4547 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4548 case Intrinsic::amdgcn_workgroup_id_y: 4549 return legalizePreloadedArgIntrin(MI, MRI, B, 4550 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4551 case Intrinsic::amdgcn_workgroup_id_z: 4552 return legalizePreloadedArgIntrin(MI, MRI, B, 4553 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4554 case Intrinsic::amdgcn_dispatch_ptr: 4555 return legalizePreloadedArgIntrin(MI, MRI, B, 4556 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4557 case Intrinsic::amdgcn_queue_ptr: 4558 return legalizePreloadedArgIntrin(MI, MRI, B, 4559 AMDGPUFunctionArgInfo::QUEUE_PTR); 4560 case Intrinsic::amdgcn_implicit_buffer_ptr: 4561 return legalizePreloadedArgIntrin( 4562 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4563 case Intrinsic::amdgcn_dispatch_id: 4564 return legalizePreloadedArgIntrin(MI, MRI, B, 4565 AMDGPUFunctionArgInfo::DISPATCH_ID); 4566 case Intrinsic::amdgcn_fdiv_fast: 4567 return legalizeFDIVFastIntrin(MI, MRI, B); 4568 case Intrinsic::amdgcn_is_shared: 4569 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4570 case Intrinsic::amdgcn_is_private: 4571 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4572 case Intrinsic::amdgcn_wavefrontsize: { 4573 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4574 MI.eraseFromParent(); 4575 return true; 4576 } 4577 case Intrinsic::amdgcn_s_buffer_load: 4578 return legalizeSBufferLoad(Helper, MI); 4579 case Intrinsic::amdgcn_raw_buffer_store: 4580 case Intrinsic::amdgcn_struct_buffer_store: 4581 return legalizeBufferStore(MI, MRI, B, false, false); 4582 case Intrinsic::amdgcn_raw_buffer_store_format: 4583 case Intrinsic::amdgcn_struct_buffer_store_format: 4584 return legalizeBufferStore(MI, MRI, B, false, true); 4585 case Intrinsic::amdgcn_raw_tbuffer_store: 4586 case Intrinsic::amdgcn_struct_tbuffer_store: 4587 return legalizeBufferStore(MI, MRI, B, true, true); 4588 case Intrinsic::amdgcn_raw_buffer_load: 4589 case Intrinsic::amdgcn_struct_buffer_load: 4590 return legalizeBufferLoad(MI, MRI, B, false, false); 4591 case Intrinsic::amdgcn_raw_buffer_load_format: 4592 case Intrinsic::amdgcn_struct_buffer_load_format: 4593 return legalizeBufferLoad(MI, MRI, B, true, false); 4594 case Intrinsic::amdgcn_raw_tbuffer_load: 4595 case Intrinsic::amdgcn_struct_tbuffer_load: 4596 return legalizeBufferLoad(MI, MRI, B, true, true); 4597 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4598 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4599 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4600 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4601 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4602 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4603 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4604 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4605 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4606 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4607 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4608 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4609 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4610 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4611 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4612 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4613 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4614 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4615 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4616 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4617 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4618 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4619 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4620 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4621 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4622 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4623 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4624 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4625 return legalizeBufferAtomic(MI, B, IntrID); 4626 case Intrinsic::amdgcn_atomic_inc: 4627 return legalizeAtomicIncDec(MI, B, true); 4628 case Intrinsic::amdgcn_atomic_dec: 4629 return legalizeAtomicIncDec(MI, B, false); 4630 case Intrinsic::trap: 4631 return legalizeTrapIntrinsic(MI, MRI, B); 4632 case Intrinsic::debugtrap: 4633 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4634 case Intrinsic::amdgcn_rsq_clamp: 4635 return legalizeRsqClampIntrinsic(MI, MRI, B); 4636 case Intrinsic::amdgcn_ds_fadd: 4637 case Intrinsic::amdgcn_ds_fmin: 4638 case Intrinsic::amdgcn_ds_fmax: 4639 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4640 default: { 4641 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4642 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4643 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4644 return true; 4645 } 4646 } 4647 4648 return true; 4649 } 4650