//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
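
// For example, getPow2VectorType(<3 x s32>) yields <4 x s32>, and
// getPow2ScalarType(s48) yields s64.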

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}
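
// For example, getBitcastRegisterType coerces <6 x s16> (96 bits) to
// <3 x s32> and <4 x s8> (32 bits) to s32; the load/store rules below use it
// to bitcast vector types the selector cannot handle directly.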

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
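
// For example, a 256-bit load from the constant or global address space may
// stay whole (512-bit limit), while the same load from LDS has to be broken
// into 64-bit (or 128-bit, with DS128) pieces.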

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
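
// For example, an <8 x s16> global load is flagged by
// loadStoreBitcastWorkaround(), so shouldBitcastLoadStoreType() returns true
// and the access is legalized as a <4 x s32> load plus a bitcast.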

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}
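
// For example, a 96-bit global load that is 128-bit aligned is widened to a
// 128-bit load on subtargets without dwordx3 load/stores: the alignment
// guarantees the extra bytes are dereferenceable and the wider access is
// still fast.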

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);
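
  // Note that within one getActionDefinitionsBuilder() chain the rules are
  // tried in order and the first matching rule decides the action, so the
  // clamps and mutations below a legalFor only fire for types the earlier
  // rules rejected.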

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);
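
  // For both G_PTR_ADD and G_PTRMASK the offset/mask operand must match the
  // pointer width, e.g. s64 for a 64-bit global pointer and s32 for a 32-bit
  // LDS pointer; scalarSameSizeAs(1, 0) rewrites mismatched widths.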

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
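
  // Example of the splitting logic below: a <3 x s32> (96-bit) global load
  // must be split by needToSplitMemOp() unless the subtarget has dwordx3
  // load/stores and the alignment checks pass.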

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));
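
  // Note: on subtargets with scalar pack instructions, the G_BUILD_VECTOR
  // rules above widen v2s16 sources to s32 and form a G_BUILD_VECTOR_TRUNC,
  // which the packing instructions can select from; otherwise the v2s16 case
  // is custom lowered.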

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B); 1686 case TargetOpcode::G_BUILD_VECTOR: 1687 return legalizeBuildVector(MI, MRI, B); 1688 default: 1689 return false; 1690 } 1691 1692 llvm_unreachable("expected switch to return"); 1693 } 1694 1695 Register AMDGPULegalizerInfo::getSegmentAperture( 1696 unsigned AS, 1697 MachineRegisterInfo &MRI, 1698 MachineIRBuilder &B) const { 1699 MachineFunction &MF = B.getMF(); 1700 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1701 const LLT S32 = LLT::scalar(32); 1702 1703 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1704 1705 if (ST.hasApertureRegs()) { 1706 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1707 // getreg. 1708 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1709 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1710 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1711 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1712 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1713 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1714 unsigned Encoding = 1715 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1716 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1717 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1718 1719 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1720 1721 B.buildInstr(AMDGPU::S_GETREG_B32) 1722 .addDef(GetReg) 1723 .addImm(Encoding); 1724 MRI.setType(GetReg, S32); 1725 1726 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1727 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1728 } 1729 1730 Register QueuePtr = MRI.createGenericVirtualRegister( 1731 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1732 1733 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1734 return Register(); 1735 1736 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1737 // private_segment_aperture_base_hi. 1738 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1739 1740 // TODO: can we be smarter about machine pointer info? 1741 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1742 MachineMemOperand *MMO = MF.getMachineMemOperand( 1743 PtrInfo, 1744 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1745 MachineMemOperand::MOInvariant, 1746 4, commonAlignment(Align(64), StructOffset)); 1747 1748 Register LoadAddr; 1749 1750 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1751 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1752 } 1753 1754 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1755 MachineInstr &MI, MachineRegisterInfo &MRI, 1756 MachineIRBuilder &B) const { 1757 MachineFunction &MF = B.getMF(); 1758 1759 const LLT S32 = LLT::scalar(32); 1760 Register Dst = MI.getOperand(0).getReg(); 1761 Register Src = MI.getOperand(1).getReg(); 1762 1763 LLT DstTy = MRI.getType(Dst); 1764 LLT SrcTy = MRI.getType(Src); 1765 unsigned DestAS = DstTy.getAddressSpace(); 1766 unsigned SrcAS = SrcTy.getAddressSpace(); 1767 1768 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1769 // vector element. 1770 assert(!DstTy.isVector()); 1771 1772 const AMDGPUTargetMachine &TM 1773 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1774 1775 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1776 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1777 return true; 1778 } 1779 1780 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1781 // Truncate. 
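    // Casting a 64-bit constant/flat pointer to the 32-bit constant address
    // space just keeps the low 32 bits of the pointer, so a single G_EXTRACT
    // at bit offset 0 is all that is needed here.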
1782 B.buildExtract(Dst, Src, 0); 1783 MI.eraseFromParent(); 1784 return true; 1785 } 1786 1787 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1788 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1789 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1790 1791 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1792 // another. Merge operands are required to be the same type, but creating an 1793 // extra ptrtoint would be kind of pointless. 1794 auto HighAddr = B.buildConstant( 1795 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1796 B.buildMerge(Dst, {Src, HighAddr}); 1797 MI.eraseFromParent(); 1798 return true; 1799 } 1800 1801 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1802 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1803 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1804 unsigned NullVal = TM.getNullPointerValue(DestAS); 1805 1806 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1807 auto FlatNull = B.buildConstant(SrcTy, 0); 1808 1809 // Extract low 32-bits of the pointer. 1810 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1811 1812 auto CmpRes = 1813 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1814 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1815 1816 MI.eraseFromParent(); 1817 return true; 1818 } 1819 1820 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1821 return false; 1822 1823 if (!ST.hasFlatAddressSpace()) 1824 return false; 1825 1826 auto SegmentNull = 1827 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1828 auto FlatNull = 1829 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1830 1831 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1832 if (!ApertureReg.isValid()) 1833 return false; 1834 1835 auto CmpRes = 1836 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1837 1838 // Coerce the type of the low half of the result so we can use merge_values. 1839 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1840 1841 // TODO: Should we allow mismatched types but matching sizes in merges to 1842 // avoid the ptrtoint? 1843 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1844 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1845 1846 MI.eraseFromParent(); 1847 return true; 1848 } 1849 1850 bool AMDGPULegalizerInfo::legalizeFrint( 1851 MachineInstr &MI, MachineRegisterInfo &MRI, 1852 MachineIRBuilder &B) const { 1853 Register Src = MI.getOperand(1).getReg(); 1854 LLT Ty = MRI.getType(Src); 1855 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1856 1857 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1858 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1859 1860 auto C1 = B.buildFConstant(Ty, C1Val); 1861 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1862 1863 // TODO: Should this propagate fast-math-flags? 
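  // Adding and then subtracting 2^52 (carrying the sign of the input) forces
  // the FPU to round away the fractional bits: any double with magnitude of
  // at least 2^52 cannot hold a fraction. For example, 3.7 + 2^52 rounds to
  // 4 + 2^52, and subtracting 2^52 again leaves 4.0. Inputs with
  // |x| > 0x1.fffffffffffffp+51 are already integers and are passed through
  // unchanged by the select below.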
1864 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1865 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1866 1867 auto C2 = B.buildFConstant(Ty, C2Val); 1868 auto Fabs = B.buildFAbs(Ty, Src); 1869 1870 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1871 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1872 MI.eraseFromParent(); 1873 return true; 1874 } 1875 1876 bool AMDGPULegalizerInfo::legalizeFceil( 1877 MachineInstr &MI, MachineRegisterInfo &MRI, 1878 MachineIRBuilder &B) const { 1879 1880 const LLT S1 = LLT::scalar(1); 1881 const LLT S64 = LLT::scalar(64); 1882 1883 Register Src = MI.getOperand(1).getReg(); 1884 assert(MRI.getType(Src) == S64); 1885 1886 // result = trunc(src) 1887 // if (src > 0.0 && src != result) 1888 // result += 1.0 1889 1890 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1891 1892 const auto Zero = B.buildFConstant(S64, 0.0); 1893 const auto One = B.buildFConstant(S64, 1.0); 1894 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1895 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1896 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1897 auto Add = B.buildSelect(S64, And, One, Zero); 1898 1899 // TODO: Should this propagate fast-math-flags? 1900 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1901 return true; 1902 } 1903 1904 bool AMDGPULegalizerInfo::legalizeFrem( 1905 MachineInstr &MI, MachineRegisterInfo &MRI, 1906 MachineIRBuilder &B) const { 1907 Register DstReg = MI.getOperand(0).getReg(); 1908 Register Src0Reg = MI.getOperand(1).getReg(); 1909 Register Src1Reg = MI.getOperand(2).getReg(); 1910 auto Flags = MI.getFlags(); 1911 LLT Ty = MRI.getType(DstReg); 1912 1913 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1914 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1915 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1916 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1917 MI.eraseFromParent(); 1918 return true; 1919 } 1920 1921 static MachineInstrBuilder extractF64Exponent(Register Hi, 1922 MachineIRBuilder &B) { 1923 const unsigned FractBits = 52; 1924 const unsigned ExpBits = 11; 1925 LLT S32 = LLT::scalar(32); 1926 1927 auto Const0 = B.buildConstant(S32, FractBits - 32); 1928 auto Const1 = B.buildConstant(S32, ExpBits); 1929 1930 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1931 .addUse(Hi) 1932 .addUse(Const0.getReg(0)) 1933 .addUse(Const1.getReg(0)); 1934 1935 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1936 } 1937 1938 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1939 MachineInstr &MI, MachineRegisterInfo &MRI, 1940 MachineIRBuilder &B) const { 1941 const LLT S1 = LLT::scalar(1); 1942 const LLT S32 = LLT::scalar(32); 1943 const LLT S64 = LLT::scalar(64); 1944 1945 Register Src = MI.getOperand(1).getReg(); 1946 assert(MRI.getType(Src) == S64); 1947 1948 // TODO: Should this use extract since the low half is unused? 1949 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1950 Register Hi = Unmerge.getReg(1); 1951 1952 // Extract the upper half, since this is where we will find the sign and 1953 // exponent. 1954 auto Exp = extractF64Exponent(Hi, B); 1955 1956 const unsigned FractBits = 52; 1957 1958 // Extract the sign bit. 1959 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1960 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1961 1962 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1963 1964 const auto Zero32 = B.buildConstant(S32, 0); 1965 1966 // Extend back to 64-bits. 
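  // Merging {0, SignBit} yields a 64-bit pattern that is either +0.0 or -0.0,
  // matching the input's sign. It becomes the result when the unbiased
  // exponent is negative, i.e. when |Src| < 1.0.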
1967 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1968 1969 auto Shr = B.buildAShr(S64, FractMask, Exp); 1970 auto Not = B.buildNot(S64, Shr); 1971 auto Tmp0 = B.buildAnd(S64, Src, Not); 1972 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1973 1974 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1975 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1976 1977 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1978 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 bool AMDGPULegalizerInfo::legalizeITOFP( 1984 MachineInstr &MI, MachineRegisterInfo &MRI, 1985 MachineIRBuilder &B, bool Signed) const { 1986 1987 Register Dst = MI.getOperand(0).getReg(); 1988 Register Src = MI.getOperand(1).getReg(); 1989 1990 const LLT S64 = LLT::scalar(64); 1991 const LLT S32 = LLT::scalar(32); 1992 1993 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1994 1995 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1996 1997 auto CvtHi = Signed ? 1998 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1999 B.buildUITOFP(S64, Unmerge.getReg(1)); 2000 2001 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2002 2003 auto ThirtyTwo = B.buildConstant(S32, 32); 2004 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2005 .addUse(CvtHi.getReg(0)) 2006 .addUse(ThirtyTwo.getReg(0)); 2007 2008 // TODO: Should this propagate fast-math-flags? 2009 B.buildFAdd(Dst, LdExp, CvtLo); 2010 MI.eraseFromParent(); 2011 return true; 2012 } 2013 2014 // TODO: Copied from DAG implementation. Verify logic and document how this 2015 // actually works. 2016 bool AMDGPULegalizerInfo::legalizeFPTOI( 2017 MachineInstr &MI, MachineRegisterInfo &MRI, 2018 MachineIRBuilder &B, bool Signed) const { 2019 2020 Register Dst = MI.getOperand(0).getReg(); 2021 Register Src = MI.getOperand(1).getReg(); 2022 2023 const LLT S64 = LLT::scalar(64); 2024 const LLT S32 = LLT::scalar(32); 2025 2026 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2027 2028 unsigned Flags = MI.getFlags(); 2029 2030 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2031 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2032 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2033 2034 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2035 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2036 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2037 2038 auto Hi = Signed ? 
2039 B.buildFPTOSI(S32, FloorMul) : 2040 B.buildFPTOUI(S32, FloorMul); 2041 auto Lo = B.buildFPTOUI(S32, Fma); 2042 2043 B.buildMerge(Dst, { Lo, Hi }); 2044 MI.eraseFromParent(); 2045 2046 return true; 2047 } 2048 2049 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2050 MachineInstr &MI) const { 2051 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2052 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2053 2054 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2055 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2056 2057 // With ieee_mode disabled, the instructions have the correct behavior 2058 // already for G_FMINNUM/G_FMAXNUM 2059 if (!MFI->getMode().IEEE) 2060 return !IsIEEEOp; 2061 2062 if (IsIEEEOp) 2063 return true; 2064 2065 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2066 } 2067 2068 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2069 MachineInstr &MI, MachineRegisterInfo &MRI, 2070 MachineIRBuilder &B) const { 2071 // TODO: Should move some of this into LegalizerHelper. 2072 2073 // TODO: Promote dynamic indexing of s16 to s32 2074 2075 // FIXME: Artifact combiner probably should have replaced the truncated 2076 // constant before this, so we shouldn't need 2077 // getConstantVRegValWithLookThrough. 2078 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2079 MI.getOperand(2).getReg(), MRI); 2080 if (!IdxVal) // Dynamic case will be selected to register indexing. 2081 return true; 2082 2083 Register Dst = MI.getOperand(0).getReg(); 2084 Register Vec = MI.getOperand(1).getReg(); 2085 2086 LLT VecTy = MRI.getType(Vec); 2087 LLT EltTy = VecTy.getElementType(); 2088 assert(EltTy == MRI.getType(Dst)); 2089 2090 if (IdxVal->Value < VecTy.getNumElements()) 2091 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2092 else 2093 B.buildUndef(Dst); 2094 2095 MI.eraseFromParent(); 2096 return true; 2097 } 2098 2099 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2100 MachineInstr &MI, MachineRegisterInfo &MRI, 2101 MachineIRBuilder &B) const { 2102 // TODO: Should move some of this into LegalizerHelper. 2103 2104 // TODO: Promote dynamic indexing of s16 to s32 2105 2106 // FIXME: Artifact combiner probably should have replaced the truncated 2107 // constant before this, so we shouldn't need 2108 // getConstantVRegValWithLookThrough. 2109 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2110 MI.getOperand(3).getReg(), MRI); 2111 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2112 return true; 2113 2114 Register Dst = MI.getOperand(0).getReg(); 2115 Register Vec = MI.getOperand(1).getReg(); 2116 Register Ins = MI.getOperand(2).getReg(); 2117 2118 LLT VecTy = MRI.getType(Vec); 2119 LLT EltTy = VecTy.getElementType(); 2120 assert(EltTy == MRI.getType(Ins)); 2121 2122 if (IdxVal->Value < VecTy.getNumElements()) 2123 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2124 else 2125 B.buildUndef(Dst); 2126 2127 MI.eraseFromParent(); 2128 return true; 2129 } 2130 2131 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2132 MachineInstr &MI, MachineRegisterInfo &MRI, 2133 MachineIRBuilder &B) const { 2134 const LLT V2S16 = LLT::vector(2, 16); 2135 2136 Register Dst = MI.getOperand(0).getReg(); 2137 Register Src0 = MI.getOperand(1).getReg(); 2138 LLT DstTy = MRI.getType(Dst); 2139 LLT SrcTy = MRI.getType(Src0); 2140 2141 if (SrcTy == V2S16 && DstTy == V2S16 && 2142 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2143 return true; 2144 2145 MachineIRBuilder HelperBuilder(MI); 2146 GISelObserverWrapper DummyObserver; 2147 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2148 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2149 } 2150 2151 bool AMDGPULegalizerInfo::legalizeSinCos( 2152 MachineInstr &MI, MachineRegisterInfo &MRI, 2153 MachineIRBuilder &B) const { 2154 2155 Register DstReg = MI.getOperand(0).getReg(); 2156 Register SrcReg = MI.getOperand(1).getReg(); 2157 LLT Ty = MRI.getType(DstReg); 2158 unsigned Flags = MI.getFlags(); 2159 2160 Register TrigVal; 2161 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2162 if (ST.hasTrigReducedRange()) { 2163 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2164 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2165 .addUse(MulVal.getReg(0)) 2166 .setMIFlags(Flags).getReg(0); 2167 } else 2168 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2169 2170 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2171 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2172 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2173 .addUse(TrigVal) 2174 .setMIFlags(Flags); 2175 MI.eraseFromParent(); 2176 return true; 2177 } 2178 2179 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2180 MachineIRBuilder &B, 2181 const GlobalValue *GV, 2182 int64_t Offset, 2183 unsigned GAFlags) const { 2184 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2185 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2186 // to the following code sequence: 2187 // 2188 // For constant address space: 2189 // s_getpc_b64 s[0:1] 2190 // s_add_u32 s0, s0, $symbol 2191 // s_addc_u32 s1, s1, 0 2192 // 2193 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2194 // a fixup or relocation is emitted to replace $symbol with a literal 2195 // constant, which is a pc-relative offset from the encoding of the $symbol 2196 // operand to the global variable. 
2197 // 2198 // For global address space: 2199 // s_getpc_b64 s[0:1] 2200 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2201 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2202 // 2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2204 // fixups or relocations are emitted to replace $symbol@*@lo and 2205 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2206 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2207 // operand to the global variable. 2208 // 2209 // What we want here is an offset from the value returned by s_getpc 2210 // (which is the address of the s_add_u32 instruction) to the global 2211 // variable, but since the encoding of $symbol starts 4 bytes after the start 2212 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2213 // small. This requires us to add 4 to the global variable offset in order to 2214 // compute the correct address. 2215 2216 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2217 2218 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2219 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2220 2221 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2222 .addDef(PCReg); 2223 2224 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2225 if (GAFlags == SIInstrInfo::MO_NONE) 2226 MIB.addImm(0); 2227 else 2228 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2229 2230 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2231 2232 if (PtrTy.getSizeInBits() == 32) 2233 B.buildExtract(DstReg, PCReg, 0); 2234 return true; 2235 } 2236 2237 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2238 MachineInstr &MI, MachineRegisterInfo &MRI, 2239 MachineIRBuilder &B) const { 2240 Register DstReg = MI.getOperand(0).getReg(); 2241 LLT Ty = MRI.getType(DstReg); 2242 unsigned AS = Ty.getAddressSpace(); 2243 2244 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2245 MachineFunction &MF = B.getMF(); 2246 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2247 2248 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2249 if (!MFI->isEntryFunction()) { 2250 const Function &Fn = MF.getFunction(); 2251 DiagnosticInfoUnsupported BadLDSDecl( 2252 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2253 DS_Warning); 2254 Fn.getContext().diagnose(BadLDSDecl); 2255 2256 // We currently don't have a way to correctly allocate LDS objects that 2257 // aren't directly associated with a kernel. We do force inlining of 2258 // functions that use local objects. However, if these dead functions are 2259 // not eliminated, we don't want a compile time error. Just emit a warning 2260 // and a trap, since there should be no callable path here. 2261 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2262 B.buildUndef(DstReg); 2263 MI.eraseFromParent(); 2264 return true; 2265 } 2266 2267 // TODO: We could emit code to handle the initialization somewhere. 
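    // Uninitialized LDS globals are lowered below to their assigned offset
    // within the kernel's LDS allocation (or, for the external zero-sized
    // dynamic-LDS case, to the total static LDS size). Globals with a real
    // initializer are diagnosed as unsupported, since LDS is not preloaded
    // with data when a kernel launches.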
2268 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2269 const SITargetLowering *TLI = ST.getTargetLowering(); 2270 if (!TLI->shouldUseLDSConstAddress(GV)) { 2271 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2272 return true; // Leave in place; 2273 } 2274 2275 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2276 Type *Ty = GV->getValueType(); 2277 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2278 // zero-sized type in other languages to declare the dynamic shared 2279 // memory which size is not known at the compile time. They will be 2280 // allocated by the runtime and placed directly after the static 2281 // allocated ones. They all share the same offset. 2282 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2283 // Adjust alignment for that dynamic shared memory array. 2284 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2285 LLT S32 = LLT::scalar(32); 2286 auto Sz = 2287 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2288 B.buildIntToPtr(DstReg, Sz); 2289 MI.eraseFromParent(); 2290 return true; 2291 } 2292 } 2293 2294 B.buildConstant( 2295 DstReg, 2296 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2297 MI.eraseFromParent(); 2298 return true; 2299 } 2300 2301 const Function &Fn = MF.getFunction(); 2302 DiagnosticInfoUnsupported BadInit( 2303 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2304 Fn.getContext().diagnose(BadInit); 2305 return true; 2306 } 2307 2308 const SITargetLowering *TLI = ST.getTargetLowering(); 2309 2310 if (TLI->shouldEmitFixup(GV)) { 2311 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2312 MI.eraseFromParent(); 2313 return true; 2314 } 2315 2316 if (TLI->shouldEmitPCReloc(GV)) { 2317 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2318 MI.eraseFromParent(); 2319 return true; 2320 } 2321 2322 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2323 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2324 2325 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2326 MachinePointerInfo::getGOT(MF), 2327 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2328 MachineMemOperand::MOInvariant, 2329 8 /*Size*/, Align(8)); 2330 2331 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2332 2333 if (Ty.getSizeInBits() == 32) { 2334 // Truncate if this is a 32-bit constant adrdess. 
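    // The GOT entry is always a 64-bit constant-address pointer, so for a
    // 32-bit result type the low half of the loaded value is extracted below.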
2335 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2336 B.buildExtract(DstReg, Load, 0); 2337 } else 2338 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2339 2340 MI.eraseFromParent(); 2341 return true; 2342 } 2343 2344 static LLT widenToNextPowerOf2(LLT Ty) { 2345 if (Ty.isVector()) 2346 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2347 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2348 } 2349 2350 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2351 MachineInstr &MI) const { 2352 MachineIRBuilder &B = Helper.MIRBuilder; 2353 MachineRegisterInfo &MRI = *B.getMRI(); 2354 GISelChangeObserver &Observer = Helper.Observer; 2355 2356 Register PtrReg = MI.getOperand(1).getReg(); 2357 LLT PtrTy = MRI.getType(PtrReg); 2358 unsigned AddrSpace = PtrTy.getAddressSpace(); 2359 2360 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2361 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2362 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2363 Observer.changingInstr(MI); 2364 MI.getOperand(1).setReg(Cast.getReg(0)); 2365 Observer.changedInstr(MI); 2366 return true; 2367 } 2368 2369 Register ValReg = MI.getOperand(0).getReg(); 2370 LLT ValTy = MRI.getType(ValReg); 2371 2372 MachineMemOperand *MMO = *MI.memoperands_begin(); 2373 const unsigned ValSize = ValTy.getSizeInBits(); 2374 const unsigned MemSize = 8 * MMO->getSize(); 2375 const Align MemAlign = MMO->getAlign(); 2376 const unsigned AlignInBits = 8 * MemAlign.value(); 2377 2378 // Widen non-power-of-2 loads to the alignment if needed 2379 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2380 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2381 2382 // This was already the correct extending load result type, so just adjust 2383 // the memory type. 2384 if (WideMemSize == ValSize) { 2385 MachineFunction &MF = B.getMF(); 2386 2387 MachineMemOperand *WideMMO = 2388 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2389 Observer.changingInstr(MI); 2390 MI.setMemRefs(MF, {WideMMO}); 2391 Observer.changedInstr(MI); 2392 return true; 2393 } 2394 2395 // Don't bother handling edge case that should probably never be produced. 2396 if (ValSize > WideMemSize) 2397 return false; 2398 2399 LLT WideTy = widenToNextPowerOf2(ValTy); 2400 2401 Register WideLoad; 2402 if (!WideTy.isVector()) { 2403 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2404 B.buildTrunc(ValReg, WideLoad).getReg(0); 2405 } else { 2406 // Extract the subvector. 2407 2408 if (isRegisterType(ValTy)) { 2409 // If this a case where G_EXTRACT is legal, use it. 2410 // (e.g. <3 x s32> -> <4 x s32>) 2411 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2412 B.buildExtract(ValReg, WideLoad, 0); 2413 } else { 2414 // For cases where the widened type isn't a nice register value, unmerge 2415 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2416 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2417 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2418 B.setInsertPt(B.getMBB(), MI.getIterator()); 2419 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2420 } 2421 } 2422 2423 MI.eraseFromParent(); 2424 return true; 2425 } 2426 2427 return false; 2428 } 2429 2430 bool AMDGPULegalizerInfo::legalizeFMad( 2431 MachineInstr &MI, MachineRegisterInfo &MRI, 2432 MachineIRBuilder &B) const { 2433 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2434 assert(Ty.isScalar()); 2435 2436 MachineFunction &MF = B.getMF(); 2437 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2438 2439 // TODO: Always legal with future ftz flag. 2440 // FIXME: Do we need just output? 2441 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2442 return true; 2443 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2444 return true; 2445 2446 MachineIRBuilder HelperBuilder(MI); 2447 GISelObserverWrapper DummyObserver; 2448 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2449 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2450 } 2451 2452 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2453 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2454 Register DstReg = MI.getOperand(0).getReg(); 2455 Register PtrReg = MI.getOperand(1).getReg(); 2456 Register CmpVal = MI.getOperand(2).getReg(); 2457 Register NewVal = MI.getOperand(3).getReg(); 2458 2459 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2460 "this should not have been custom lowered"); 2461 2462 LLT ValTy = MRI.getType(CmpVal); 2463 LLT VecTy = LLT::vector(2, ValTy); 2464 2465 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2466 2467 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2468 .addDef(DstReg) 2469 .addUse(PtrReg) 2470 .addUse(PackedVal) 2471 .setMemRefs(MI.memoperands()); 2472 2473 MI.eraseFromParent(); 2474 return true; 2475 } 2476 2477 bool AMDGPULegalizerInfo::legalizeFlog( 2478 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2479 Register Dst = MI.getOperand(0).getReg(); 2480 Register Src = MI.getOperand(1).getReg(); 2481 LLT Ty = B.getMRI()->getType(Dst); 2482 unsigned Flags = MI.getFlags(); 2483 2484 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2485 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2486 2487 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2488 MI.eraseFromParent(); 2489 return true; 2490 } 2491 2492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2493 MachineIRBuilder &B) const { 2494 Register Dst = MI.getOperand(0).getReg(); 2495 Register Src = MI.getOperand(1).getReg(); 2496 unsigned Flags = MI.getFlags(); 2497 LLT Ty = B.getMRI()->getType(Dst); 2498 2499 auto K = B.buildFConstant(Ty, numbers::log2e); 2500 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2501 B.buildFExp2(Dst, Mul, Flags); 2502 MI.eraseFromParent(); 2503 return true; 2504 } 2505 2506 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2507 MachineIRBuilder &B) const { 2508 Register Dst = MI.getOperand(0).getReg(); 2509 Register Src0 = MI.getOperand(1).getReg(); 2510 Register Src1 = MI.getOperand(2).getReg(); 2511 unsigned Flags = MI.getFlags(); 2512 LLT Ty = B.getMRI()->getType(Dst); 2513 const LLT S16 = LLT::scalar(16); 2514 const LLT S32 = LLT::scalar(32); 2515 2516 if (Ty == S32) { 2517 auto Log = B.buildFLog2(S32, Src0, Flags); 2518 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2519 .addUse(Log.getReg(0)) 2520 .addUse(Src1) 2521 .setMIFlags(Flags); 2522 B.buildFExp2(Dst, Mul, Flags); 2523 } else if (Ty == S16) { 2524 // There's no f16 fmul_legacy, so we need to convert for it. 2525 auto Log = B.buildFLog2(S16, Src0, Flags); 2526 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2527 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2528 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2529 .addUse(Ext0.getReg(0)) 2530 .addUse(Ext1.getReg(0)) 2531 .setMIFlags(Flags); 2532 2533 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2534 } else 2535 return false; 2536 2537 MI.eraseFromParent(); 2538 return true; 2539 } 2540 2541 // Find a source register, ignoring any possible source modifiers. 2542 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2543 Register ModSrc = OrigSrc; 2544 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2545 ModSrc = SrcFNeg->getOperand(1).getReg(); 2546 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2547 ModSrc = SrcFAbs->getOperand(1).getReg(); 2548 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2549 ModSrc = SrcFAbs->getOperand(1).getReg(); 2550 return ModSrc; 2551 } 2552 2553 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2554 MachineRegisterInfo &MRI, 2555 MachineIRBuilder &B) const { 2556 2557 const LLT S1 = LLT::scalar(1); 2558 const LLT S64 = LLT::scalar(64); 2559 Register Dst = MI.getOperand(0).getReg(); 2560 Register OrigSrc = MI.getOperand(1).getReg(); 2561 unsigned Flags = MI.getFlags(); 2562 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2563 "this should not have been custom lowered"); 2564 2565 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2566 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2567 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2568 // V_FRACT bug is: 2569 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2570 // 2571 // Convert floor(x) to (x - fract(x)) 2572 2573 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2574 .addUse(OrigSrc) 2575 .setMIFlags(Flags); 2576 2577 // Give source modifier matching some assistance before obscuring a foldable 2578 // pattern. 2579 2580 // TODO: We can avoid the neg on the fract? The input sign to fract 2581 // shouldn't matter? 2582 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2583 2584 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2585 2586 Register Min = MRI.createGenericVirtualRegister(S64); 2587 2588 // We don't need to concern ourselves with the snan handling difference, so 2589 // use the one which will directly select. 
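  // 0x3fefffffffffffff is the largest double below 1.0, so the min below caps
  // V_FRACT's result just under 1.0 as the workaround requires. The IEEE and
  // non-IEEE fminnum forms differ only in how signaling NaNs are quieted, so
  // pick the one matching the function's FP mode.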
2590 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2591 if (MFI->getMode().IEEE) 2592 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2593 else 2594 B.buildFMinNum(Min, Fract, Const, Flags); 2595 2596 Register CorrectedFract = Min; 2597 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2598 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2599 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2600 } 2601 2602 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2603 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2604 2605 MI.eraseFromParent(); 2606 return true; 2607 } 2608 2609 // Turn an illegal packed v2s16 build vector into bit operations. 2610 // TODO: This should probably be a bitcast action in LegalizerHelper. 2611 bool AMDGPULegalizerInfo::legalizeBuildVector( 2612 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2613 Register Dst = MI.getOperand(0).getReg(); 2614 const LLT S32 = LLT::scalar(32); 2615 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2616 2617 Register Src0 = MI.getOperand(1).getReg(); 2618 Register Src1 = MI.getOperand(2).getReg(); 2619 assert(MRI.getType(Src0) == LLT::scalar(16)); 2620 2621 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2622 B.buildBitcast(Dst, Merge); 2623 2624 MI.eraseFromParent(); 2625 return true; 2626 } 2627 2628 // Check that this is a G_XOR x, -1 2629 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2630 if (MI.getOpcode() != TargetOpcode::G_XOR) 2631 return false; 2632 auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 2633 return ConstVal && *ConstVal == -1; 2634 } 2635 2636 // Return the use branch instruction, otherwise null if the usage is invalid. 2637 static MachineInstr * 2638 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2639 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2640 Register CondDef = MI.getOperand(0).getReg(); 2641 if (!MRI.hasOneNonDBGUse(CondDef)) 2642 return nullptr; 2643 2644 MachineBasicBlock *Parent = MI.getParent(); 2645 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2646 2647 if (isNot(MRI, *UseMI)) { 2648 Register NegatedCond = UseMI->getOperand(0).getReg(); 2649 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2650 return nullptr; 2651 2652 // We're deleting the def of this value, so we need to remove it. 2653 UseMI->eraseFromParent(); 2654 2655 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2656 Negated = true; 2657 } 2658 2659 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2660 return nullptr; 2661 2662 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2663 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2664 if (Next == Parent->end()) { 2665 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2666 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2667 return nullptr; 2668 UncondBrTarget = &*NextMBB; 2669 } else { 2670 if (Next->getOpcode() != AMDGPU::G_BR) 2671 return nullptr; 2672 Br = &*Next; 2673 UncondBrTarget = Br->getOperand(0).getMBB(); 2674 } 2675 2676 return UseMI; 2677 } 2678 2679 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2680 const ArgDescriptor *Arg, 2681 const TargetRegisterClass *ArgRC, 2682 LLT ArgTy) const { 2683 MCRegister SrcReg = Arg->getRegister(); 2684 assert(SrcReg.isPhysical() && "Physical register expected"); 2685 assert(DstReg.isVirtual() && "Virtual register expected"); 2686 2687 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2688 ArgTy); 2689 if (Arg->isMasked()) { 2690 // TODO: Should we try to emit this once in the entry block? 2691 const LLT S32 = LLT::scalar(32); 2692 const unsigned Mask = Arg->getMask(); 2693 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2694 2695 Register AndMaskSrc = LiveIn; 2696 2697 if (Shift != 0) { 2698 auto ShiftAmt = B.buildConstant(S32, Shift); 2699 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2700 } 2701 2702 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2703 } else { 2704 B.buildCopy(DstReg, LiveIn); 2705 } 2706 2707 return true; 2708 } 2709 2710 bool AMDGPULegalizerInfo::loadInputValue( 2711 Register DstReg, MachineIRBuilder &B, 2712 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2713 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2714 const ArgDescriptor *Arg; 2715 const TargetRegisterClass *ArgRC; 2716 LLT ArgTy; 2717 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2718 2719 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2720 return false; // TODO: Handle these 2721 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2722 } 2723 2724 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2725 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2726 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2727 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2728 return false; 2729 2730 MI.eraseFromParent(); 2731 return true; 2732 } 2733 2734 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2735 MachineRegisterInfo &MRI, 2736 MachineIRBuilder &B) const { 2737 Register Dst = MI.getOperand(0).getReg(); 2738 LLT DstTy = MRI.getType(Dst); 2739 LLT S16 = LLT::scalar(16); 2740 LLT S32 = LLT::scalar(32); 2741 LLT S64 = LLT::scalar(64); 2742 2743 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2744 return true; 2745 2746 if (DstTy == S16) 2747 return legalizeFDIV16(MI, MRI, B); 2748 if (DstTy == S32) 2749 return legalizeFDIV32(MI, MRI, B); 2750 if (DstTy == S64) 2751 return legalizeFDIV64(MI, MRI, B); 2752 2753 return false; 2754 } 2755 2756 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2757 Register DstReg, 2758 Register X, 2759 Register Y, 2760 bool IsDiv) const { 2761 const LLT S1 = LLT::scalar(1); 2762 const LLT S32 = LLT::scalar(32); 2763 2764 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2765 // algorithm used here. 2766 2767 // Initial estimate of inv(y). 2768 auto FloatY = B.buildUITOFP(S32, Y); 2769 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2770 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2771 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2772 auto Z = B.buildFPTOUI(S32, ScaledY); 2773 2774 // One round of UNR. 
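  // Refine the reciprocal estimate with one Newton-Raphson style step: with z
  // roughly 2^32 / y, the low 32 bits of -y * z give the scaled error
  // 2^32 - y * z, and z + umulh(z, error) is the improved estimate.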
2775 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2776 auto NegYZ = B.buildMul(S32, NegY, Z); 2777 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2778 2779 // Quotient/remainder estimate. 2780 auto Q = B.buildUMulH(S32, X, Z); 2781 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2782 2783 // First quotient/remainder refinement. 2784 auto One = B.buildConstant(S32, 1); 2785 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2786 if (IsDiv) 2787 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2788 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2789 2790 // Second quotient/remainder refinement. 2791 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2792 if (IsDiv) 2793 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2794 else 2795 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2796 } 2797 2798 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2799 MachineRegisterInfo &MRI, 2800 MachineIRBuilder &B) const { 2801 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2802 Register DstReg = MI.getOperand(0).getReg(); 2803 Register Num = MI.getOperand(1).getReg(); 2804 Register Den = MI.getOperand(2).getReg(); 2805 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2806 MI.eraseFromParent(); 2807 return true; 2808 } 2809 2810 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2811 // 2812 // Return lo, hi of result 2813 // 2814 // %cvt.lo = G_UITOFP Val.lo 2815 // %cvt.hi = G_UITOFP Val.hi 2816 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2817 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2818 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2819 // %mul2 = G_FMUL %mul1, 2**(-32) 2820 // %trunc = G_INTRINSIC_TRUNC %mul2 2821 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2822 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2823 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2824 Register Val) { 2825 const LLT S32 = LLT::scalar(32); 2826 auto Unmerge = B.buildUnmerge(S32, Val); 2827 2828 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2829 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2830 2831 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2832 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2833 2834 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2835 auto Mul1 = 2836 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2837 2838 // 2**(-32) 2839 auto Mul2 = 2840 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2841 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2842 2843 // -(2**32) 2844 auto Mad2 = B.buildFMAD(S32, Trunc, 2845 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2846 2847 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2848 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2849 2850 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2851 } 2852 2853 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2854 Register DstReg, 2855 Register Numer, 2856 Register Denom, 2857 bool IsDiv) const { 2858 const LLT S32 = LLT::scalar(32); 2859 const LLT S64 = LLT::scalar(64); 2860 const LLT S1 = LLT::scalar(1); 2861 Register RcpLo, RcpHi; 2862 2863 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2864 2865 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2866 2867 auto Zero64 = B.buildConstant(S64, 0); 2868 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2869 2870 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2871 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2872 2873 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2874 Register 
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2875 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2876 2877 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2878 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2879 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2880 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2881 2882 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2883 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2884 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2885 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2886 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2887 2888 auto Zero32 = B.buildConstant(S32, 0); 2889 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2890 auto Add2_HiC = 2891 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2892 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2893 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2894 2895 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2896 Register NumerLo = UnmergeNumer.getReg(0); 2897 Register NumerHi = UnmergeNumer.getReg(1); 2898 2899 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2900 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2901 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2902 Register Mul3_Lo = UnmergeMul3.getReg(0); 2903 Register Mul3_Hi = UnmergeMul3.getReg(1); 2904 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2905 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2906 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2907 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2908 2909 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2910 Register DenomLo = UnmergeDenom.getReg(0); 2911 Register DenomHi = UnmergeDenom.getReg(1); 2912 2913 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2914 auto C1 = B.buildSExt(S32, CmpHi); 2915 2916 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2917 auto C2 = B.buildSExt(S32, CmpLo); 2918 2919 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2920 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2921 2922 // TODO: Here and below portions of the code can be enclosed into if/endif. 2923 // Currently control flow is unconditional and we have 4 selects after 2924 // potential endif to substitute PHIs. 2925 2926 // if C3 != 0 ... 
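  // The quotient estimate MulHi3 may still be slightly too small, so both
  // possible corrections (Sub2/Add3, then Sub3/Add4) are computed
  // unconditionally; the selects at the end stand in for the PHIs an
  // if/endif structure would otherwise produce.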
2927 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2928 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2929 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2930 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2931 2932 auto One64 = B.buildConstant(S64, 1); 2933 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2934 2935 auto C4 = 2936 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2937 auto C5 = 2938 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2939 auto C6 = B.buildSelect( 2940 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2941 2942 // if (C6 != 0) 2943 auto Add4 = B.buildAdd(S64, Add3, One64); 2944 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2945 2946 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2947 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2948 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2949 2950 // endif C6 2951 // endif C3 2952 2953 if (IsDiv) { 2954 auto Sel1 = B.buildSelect( 2955 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2956 B.buildSelect(DstReg, 2957 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2958 } else { 2959 auto Sel2 = B.buildSelect( 2960 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2961 B.buildSelect(DstReg, 2962 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2963 } 2964 } 2965 2966 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2967 MachineRegisterInfo &MRI, 2968 MachineIRBuilder &B) const { 2969 const LLT S64 = LLT::scalar(64); 2970 const LLT S32 = LLT::scalar(32); 2971 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2972 Register DstReg = MI.getOperand(0).getReg(); 2973 Register Num = MI.getOperand(1).getReg(); 2974 Register Den = MI.getOperand(2).getReg(); 2975 LLT Ty = MRI.getType(DstReg); 2976 2977 if (Ty == S32) 2978 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2979 else if (Ty == S64) 2980 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2981 else 2982 return false; 2983 2984 MI.eraseFromParent(); 2985 return true; 2986 2987 } 2988 2989 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2990 MachineRegisterInfo &MRI, 2991 MachineIRBuilder &B) const { 2992 const LLT S64 = LLT::scalar(64); 2993 const LLT S32 = LLT::scalar(32); 2994 2995 Register DstReg = MI.getOperand(0).getReg(); 2996 const LLT Ty = MRI.getType(DstReg); 2997 if (Ty != S32 && Ty != S64) 2998 return false; 2999 3000 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3001 3002 Register LHS = MI.getOperand(1).getReg(); 3003 Register RHS = MI.getOperand(2).getReg(); 3004 3005 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3006 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3007 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3008 3009 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3010 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3011 3012 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3013 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3014 3015 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3016 if (Ty == S32) 3017 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3018 else 3019 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3020 3021 Register Sign; 3022 if (IsDiv) 3023 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3024 else 3025 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3026 3027 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3028 B.buildSub(DstReg, UDivRem, Sign); 3029 3030 MI.eraseFromParent(); 3031 return true; 3032 } 3033 3034 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3035 MachineRegisterInfo &MRI, 3036 MachineIRBuilder &B) const { 3037 Register Res = MI.getOperand(0).getReg(); 3038 Register LHS = MI.getOperand(1).getReg(); 3039 Register RHS = MI.getOperand(2).getReg(); 3040 3041 uint16_t Flags = MI.getFlags(); 3042 3043 LLT ResTy = MRI.getType(Res); 3044 LLT S32 = LLT::scalar(32); 3045 LLT S64 = LLT::scalar(64); 3046 3047 const MachineFunction &MF = B.getMF(); 3048 bool Unsafe = 3049 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 3050 3051 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 3052 return false; 3053 3054 if (!Unsafe && ResTy == S32 && 3055 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 3056 return false; 3057 3058 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3059 // 1 / x -> RCP(x) 3060 if (CLHS->isExactlyValue(1.0)) { 3061 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3062 .addUse(RHS) 3063 .setMIFlags(Flags); 3064 3065 MI.eraseFromParent(); 3066 return true; 3067 } 3068 3069 // -1 / x -> RCP( FNEG(x) ) 3070 if (CLHS->isExactlyValue(-1.0)) { 3071 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3072 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3073 .addUse(FNeg.getReg(0)) 3074 .setMIFlags(Flags); 3075 3076 MI.eraseFromParent(); 3077 return true; 3078 } 3079 } 3080 3081 // x / y -> x * (1.0 / y) 3082 if (Unsafe) { 3083 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3084 .addUse(RHS) 3085 .setMIFlags(Flags); 3086 B.buildFMul(Res, LHS, RCP, Flags); 3087 3088 MI.eraseFromParent(); 3089 return true; 3090 } 3091 3092 return false; 3093 } 3094 3095 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3096 MachineRegisterInfo &MRI, 3097 MachineIRBuilder &B) const { 3098 Register Res = MI.getOperand(0).getReg(); 3099 Register LHS = MI.getOperand(1).getReg(); 3100 Register RHS = MI.getOperand(2).getReg(); 3101 3102 uint16_t Flags = MI.getFlags(); 3103 3104 LLT S16 = LLT::scalar(16); 3105 LLT S32 = LLT::scalar(32); 3106 3107 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3108 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3109 3110 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3111 .addUse(RHSExt.getReg(0)) 3112 .setMIFlags(Flags); 3113 3114 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3115 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3116 3117 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3118 .addUse(RDst.getReg(0)) 3119 .addUse(RHS) 3120 .addUse(LHS) 3121 .setMIFlags(Flags); 3122 3123 MI.eraseFromParent(); 3124 return true; 3125 } 3126 3127 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3128 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3129 static void toggleSPDenormMode(bool Enable, 3130 MachineIRBuilder &B, 3131 const GCNSubtarget &ST, 3132 AMDGPU::SIModeRegisterDefaults Mode) { 3133 // Set SP denorm mode to this value. 3134 unsigned SPDenormMode = 3135 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3136 3137 if (ST.hasDenormModeInst()) { 3138 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
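    // S_DENORM_MODE rewrites the whole 4-bit DENORM_MODE field, so place the
    // new FP32 value in bits [1:0] and keep the existing FP64/FP16 value in
    // bits [3:2].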
3139 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3140 3141 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3142 B.buildInstr(AMDGPU::S_DENORM_MODE) 3143 .addImm(NewDenormModeValue); 3144 3145 } else { 3146 // Select FP32 bit field in mode register. 3147 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3148 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3149 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3150 3151 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3152 .addImm(SPDenormMode) 3153 .addImm(SPDenormModeBitField); 3154 } 3155 } 3156 3157 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3158 MachineRegisterInfo &MRI, 3159 MachineIRBuilder &B) const { 3160 Register Res = MI.getOperand(0).getReg(); 3161 Register LHS = MI.getOperand(1).getReg(); 3162 Register RHS = MI.getOperand(2).getReg(); 3163 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3164 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3165 3166 uint16_t Flags = MI.getFlags(); 3167 3168 LLT S32 = LLT::scalar(32); 3169 LLT S1 = LLT::scalar(1); 3170 3171 auto One = B.buildFConstant(S32, 1.0f); 3172 3173 auto DenominatorScaled = 3174 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3175 .addUse(LHS) 3176 .addUse(RHS) 3177 .addImm(0) 3178 .setMIFlags(Flags); 3179 auto NumeratorScaled = 3180 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3181 .addUse(LHS) 3182 .addUse(RHS) 3183 .addImm(1) 3184 .setMIFlags(Flags); 3185 3186 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3187 .addUse(DenominatorScaled.getReg(0)) 3188 .setMIFlags(Flags); 3189 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3190 3191 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3192 // aren't modeled as reading it. 
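  // If the function's mode flushes FP32 denormals, temporarily enable them
  // around the FMA refinement sequence below and restore the original mode
  // afterwards.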
3193 if (!Mode.allFP32Denormals()) 3194 toggleSPDenormMode(true, B, ST, Mode); 3195 3196 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3197 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3198 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3199 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3200 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3201 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3202 3203 if (!Mode.allFP32Denormals()) 3204 toggleSPDenormMode(false, B, ST, Mode); 3205 3206 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3207 .addUse(Fma4.getReg(0)) 3208 .addUse(Fma1.getReg(0)) 3209 .addUse(Fma3.getReg(0)) 3210 .addUse(NumeratorScaled.getReg(1)) 3211 .setMIFlags(Flags); 3212 3213 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3214 .addUse(Fmas.getReg(0)) 3215 .addUse(RHS) 3216 .addUse(LHS) 3217 .setMIFlags(Flags); 3218 3219 MI.eraseFromParent(); 3220 return true; 3221 } 3222 3223 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3224 MachineRegisterInfo &MRI, 3225 MachineIRBuilder &B) const { 3226 Register Res = MI.getOperand(0).getReg(); 3227 Register LHS = MI.getOperand(1).getReg(); 3228 Register RHS = MI.getOperand(2).getReg(); 3229 3230 uint16_t Flags = MI.getFlags(); 3231 3232 LLT S64 = LLT::scalar(64); 3233 LLT S1 = LLT::scalar(1); 3234 3235 auto One = B.buildFConstant(S64, 1.0); 3236 3237 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3238 .addUse(LHS) 3239 .addUse(RHS) 3240 .addImm(0) 3241 .setMIFlags(Flags); 3242 3243 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3244 3245 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3246 .addUse(DivScale0.getReg(0)) 3247 .setMIFlags(Flags); 3248 3249 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3250 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3251 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3252 3253 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3254 .addUse(LHS) 3255 .addUse(RHS) 3256 .addImm(1) 3257 .setMIFlags(Flags); 3258 3259 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3260 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3261 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3262 3263 Register Scale; 3264 if (!ST.hasUsableDivScaleConditionOutput()) { 3265 // Workaround a hardware bug on SI where the condition output from div_scale 3266 // is not usable. 
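    // Recompute the missing condition manually: compare the high halves of
    // the original operands with the corresponding div_scale results, and xor
    // the two tests together to recover the flag that div_fmas expects.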
3267 3268 LLT S32 = LLT::scalar(32); 3269 3270 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3271 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3272 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3273 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3274 3275 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3276 Scale1Unmerge.getReg(1)); 3277 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3278 Scale0Unmerge.getReg(1)); 3279 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3280 } else { 3281 Scale = DivScale1.getReg(1); 3282 } 3283 3284 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3285 .addUse(Fma4.getReg(0)) 3286 .addUse(Fma3.getReg(0)) 3287 .addUse(Mul.getReg(0)) 3288 .addUse(Scale) 3289 .setMIFlags(Flags); 3290 3291 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3292 .addUse(Fmas.getReg(0)) 3293 .addUse(RHS) 3294 .addUse(LHS) 3295 .setMIFlags(Flags); 3296 3297 MI.eraseFromParent(); 3298 return true; 3299 } 3300 3301 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3302 MachineRegisterInfo &MRI, 3303 MachineIRBuilder &B) const { 3304 Register Res = MI.getOperand(0).getReg(); 3305 Register LHS = MI.getOperand(2).getReg(); 3306 Register RHS = MI.getOperand(3).getReg(); 3307 uint16_t Flags = MI.getFlags(); 3308 3309 LLT S32 = LLT::scalar(32); 3310 LLT S1 = LLT::scalar(1); 3311 3312 auto Abs = B.buildFAbs(S32, RHS, Flags); 3313 const APFloat C0Val(1.0f); 3314 3315 auto C0 = B.buildConstant(S32, 0x6f800000); 3316 auto C1 = B.buildConstant(S32, 0x2f800000); 3317 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3318 3319 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3320 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3321 3322 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3323 3324 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3325 .addUse(Mul0.getReg(0)) 3326 .setMIFlags(Flags); 3327 3328 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3329 3330 B.buildFMul(Res, Sel, Mul1, Flags); 3331 3332 MI.eraseFromParent(); 3333 return true; 3334 } 3335 3336 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3337 // FIXME: Why do we handle this one but not other removed instructions? 3338 // 3339 // Reciprocal square root. The clamp prevents infinite results, clamping 3340 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3341 // +-max_float. 3342 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3343 MachineRegisterInfo &MRI, 3344 MachineIRBuilder &B) const { 3345 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3346 return true; 3347 3348 Register Dst = MI.getOperand(0).getReg(); 3349 Register Src = MI.getOperand(2).getReg(); 3350 auto Flags = MI.getFlags(); 3351 3352 LLT Ty = MRI.getType(Dst); 3353 3354 const fltSemantics *FltSemantics; 3355 if (Ty == LLT::scalar(32)) 3356 FltSemantics = &APFloat::IEEEsingle(); 3357 else if (Ty == LLT::scalar(64)) 3358 FltSemantics = &APFloat::IEEEdouble(); 3359 else 3360 return false; 3361 3362 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3363 .addUse(Src) 3364 .setMIFlags(Flags); 3365 3366 // We don't need to concern ourselves with the snan handling difference, since 3367 // the rsq quieted (or not) so use the one which will directly select. 
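  // Clamp the result into [-max_float, +max_float] with a min against the
  // largest finite value followed by a max against its negation, using the
  // IEEE or non-IEEE forms to match the function's FP mode.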
3368 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3369 const bool UseIEEE = MFI->getMode().IEEE; 3370 3371 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3372 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3373 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3374 3375 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3376 3377 if (UseIEEE) 3378 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3379 else 3380 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3381 MI.eraseFromParent(); 3382 return true; 3383 } 3384 3385 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3386 switch (IID) { 3387 case Intrinsic::amdgcn_ds_fadd: 3388 return AMDGPU::G_ATOMICRMW_FADD; 3389 case Intrinsic::amdgcn_ds_fmin: 3390 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3391 case Intrinsic::amdgcn_ds_fmax: 3392 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3393 default: 3394 llvm_unreachable("not a DS FP intrinsic"); 3395 } 3396 } 3397 3398 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3399 MachineInstr &MI, 3400 Intrinsic::ID IID) const { 3401 GISelChangeObserver &Observer = Helper.Observer; 3402 Observer.changingInstr(MI); 3403 3404 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3405 3406 // The remaining operands were used to set fields in the MemOperand on 3407 // construction. 3408 for (int I = 6; I > 3; --I) 3409 MI.RemoveOperand(I); 3410 3411 MI.RemoveOperand(1); // Remove the intrinsic ID. 3412 Observer.changedInstr(MI); 3413 return true; 3414 } 3415 3416 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3417 MachineRegisterInfo &MRI, 3418 MachineIRBuilder &B) const { 3419 uint64_t Offset = 3420 ST.getTargetLowering()->getImplicitParameterOffset( 3421 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3422 LLT DstTy = MRI.getType(DstReg); 3423 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3424 3425 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3426 if (!loadInputValue(KernargPtrReg, B, 3427 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3428 return false; 3429 3430 // FIXME: This should be nuw 3431 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3432 return true; 3433 } 3434 3435 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3436 MachineRegisterInfo &MRI, 3437 MachineIRBuilder &B) const { 3438 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3439 if (!MFI->isEntryFunction()) { 3440 return legalizePreloadedArgIntrin(MI, MRI, B, 3441 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3442 } 3443 3444 Register DstReg = MI.getOperand(0).getReg(); 3445 if (!getImplicitArgPtr(DstReg, MRI, B)) 3446 return false; 3447 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3453 MachineRegisterInfo &MRI, 3454 MachineIRBuilder &B, 3455 unsigned AddrSpace) const { 3456 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3457 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3458 Register Hi32 = Unmerge.getReg(1); 3459 3460 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3461 MI.eraseFromParent(); 3462 return true; 3463 } 3464 3465 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3466 // offset (the offset that is included in bounds checking and swizzling, to be 3467 // split between the instruction's voffset and immoffset fields) 
and soffset 3468 // (the offset that is excluded from bounds checking and swizzling, to go in 3469 // the instruction's soffset field). This function takes the first kind of 3470 // offset and figures out how to split it between voffset and immoffset. 3471 std::tuple<Register, unsigned, unsigned> 3472 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3473 Register OrigOffset) const { 3474 const unsigned MaxImm = 4095; 3475 Register BaseReg; 3476 unsigned TotalConstOffset; 3477 MachineInstr *OffsetDef; 3478 const LLT S32 = LLT::scalar(32); 3479 3480 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3481 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3482 3483 unsigned ImmOffset = TotalConstOffset; 3484 3485 // If the immediate value is too big for the immoffset field, put the value 3486 // and -4096 into the immoffset field so that the value that is copied/added 3487 // for the voffset field is a multiple of 4096, and it stands more chance 3488 // of being CSEd with the copy/add for another similar load/store. 3489 // However, do not do that rounding down to a multiple of 4096 if that is a 3490 // negative number, as it appears to be illegal to have a negative offset 3491 // in the vgpr, even if adding the immediate offset makes it positive. 3492 unsigned Overflow = ImmOffset & ~MaxImm; 3493 ImmOffset -= Overflow; 3494 if ((int32_t)Overflow < 0) { 3495 Overflow += ImmOffset; 3496 ImmOffset = 0; 3497 } 3498 3499 if (Overflow != 0) { 3500 if (!BaseReg) { 3501 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3502 } else { 3503 auto OverflowVal = B.buildConstant(S32, Overflow); 3504 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3505 } 3506 } 3507 3508 if (!BaseReg) 3509 BaseReg = B.buildConstant(S32, 0).getReg(0); 3510 3511 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3512 } 3513 3514 /// Handle register layout difference for f16 images for some subtargets. 3515 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3516 MachineRegisterInfo &MRI, 3517 Register Reg) const { 3518 if (!ST.hasUnpackedD16VMem()) 3519 return Reg; 3520 3521 const LLT S16 = LLT::scalar(16); 3522 const LLT S32 = LLT::scalar(32); 3523 LLT StoreVT = MRI.getType(Reg); 3524 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3525 3526 auto Unmerge = B.buildUnmerge(S16, Reg); 3527 3528 SmallVector<Register, 4> WideRegs; 3529 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3530 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3531 3532 int NumElts = StoreVT.getNumElements(); 3533 3534 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3535 } 3536 3537 Register AMDGPULegalizerInfo::fixStoreSourceType( 3538 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3539 MachineRegisterInfo *MRI = B.getMRI(); 3540 LLT Ty = MRI->getType(VData); 3541 3542 const LLT S16 = LLT::scalar(16); 3543 3544 // Fixup illegal register types for i8 stores. 
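// Sub-dword scalar sources (s8 and s16) are stored from a 32-bit VGPR, so
// widen them with an any-extend; the byte/short store only uses the low bits.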
3545 if (Ty == LLT::scalar(8) || Ty == S16) { 3546 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3547 return AnyExt; 3548 } 3549 3550 if (Ty.isVector()) { 3551 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3552 if (IsFormat) 3553 return handleD16VData(B, *MRI, VData); 3554 } 3555 } 3556 3557 return VData; 3558 } 3559 3560 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3561 MachineRegisterInfo &MRI, 3562 MachineIRBuilder &B, 3563 bool IsTyped, 3564 bool IsFormat) const { 3565 Register VData = MI.getOperand(1).getReg(); 3566 LLT Ty = MRI.getType(VData); 3567 LLT EltTy = Ty.getScalarType(); 3568 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3569 const LLT S32 = LLT::scalar(32); 3570 3571 VData = fixStoreSourceType(B, VData, IsFormat); 3572 Register RSrc = MI.getOperand(2).getReg(); 3573 3574 MachineMemOperand *MMO = *MI.memoperands_begin(); 3575 const int MemSize = MMO->getSize(); 3576 3577 unsigned ImmOffset; 3578 unsigned TotalOffset; 3579 3580 // The typed intrinsics add an immediate after the registers. 3581 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3582 3583 // The struct intrinsic variants add one additional operand over raw. 3584 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3585 Register VIndex; 3586 int OpOffset = 0; 3587 if (HasVIndex) { 3588 VIndex = MI.getOperand(3).getReg(); 3589 OpOffset = 1; 3590 } 3591 3592 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3593 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3594 3595 unsigned Format = 0; 3596 if (IsTyped) { 3597 Format = MI.getOperand(5 + OpOffset).getImm(); 3598 ++OpOffset; 3599 } 3600 3601 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3602 3603 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3604 if (TotalOffset != 0) 3605 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3606 3607 unsigned Opc; 3608 if (IsTyped) { 3609 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3610 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3611 } else if (IsFormat) { 3612 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3613 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3614 } else { 3615 switch (MemSize) { 3616 case 1: 3617 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3618 break; 3619 case 2: 3620 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3621 break; 3622 default: 3623 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3624 break; 3625 } 3626 } 3627 3628 if (!VIndex) 3629 VIndex = B.buildConstant(S32, 0).getReg(0); 3630 3631 auto MIB = B.buildInstr(Opc) 3632 .addUse(VData) // vdata 3633 .addUse(RSrc) // rsrc 3634 .addUse(VIndex) // vindex 3635 .addUse(VOffset) // voffset 3636 .addUse(SOffset) // soffset 3637 .addImm(ImmOffset); // offset(imm) 3638 3639 if (IsTyped) 3640 MIB.addImm(Format); 3641 3642 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3643 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3644 .addMemOperand(MMO); 3645 3646 MI.eraseFromParent(); 3647 return true; 3648 } 3649 3650 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3651 MachineRegisterInfo &MRI, 3652 MachineIRBuilder &B, 3653 bool IsFormat, 3654 bool IsTyped) const { 3655 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
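// This mirrors legalizeBufferStore: split the offset into voffset/immoffset,
// select a target buffer-load pseudo from the format/type/size, and perform
// the load into a wider temporary for extending loads or unpacked d16 results,
// repacking into the original result type afterwards.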
3656 MachineMemOperand *MMO = *MI.memoperands_begin(); 3657 const int MemSize = MMO->getSize(); 3658 const LLT S32 = LLT::scalar(32); 3659 3660 Register Dst = MI.getOperand(0).getReg(); 3661 Register RSrc = MI.getOperand(2).getReg(); 3662 3663 // The typed intrinsics add an immediate after the registers. 3664 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3665 3666 // The struct intrinsic variants add one additional operand over raw. 3667 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3668 Register VIndex; 3669 int OpOffset = 0; 3670 if (HasVIndex) { 3671 VIndex = MI.getOperand(3).getReg(); 3672 OpOffset = 1; 3673 } 3674 3675 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3676 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3677 3678 unsigned Format = 0; 3679 if (IsTyped) { 3680 Format = MI.getOperand(5 + OpOffset).getImm(); 3681 ++OpOffset; 3682 } 3683 3684 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3685 unsigned ImmOffset; 3686 unsigned TotalOffset; 3687 3688 LLT Ty = MRI.getType(Dst); 3689 LLT EltTy = Ty.getScalarType(); 3690 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3691 const bool Unpacked = ST.hasUnpackedD16VMem(); 3692 3693 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3694 if (TotalOffset != 0) 3695 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3696 3697 unsigned Opc; 3698 3699 if (IsTyped) { 3700 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3701 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3702 } else if (IsFormat) { 3703 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3704 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3705 } else { 3706 switch (MemSize) { 3707 case 1: 3708 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3709 break; 3710 case 2: 3711 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3712 break; 3713 default: 3714 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3715 break; 3716 } 3717 } 3718 3719 Register LoadDstReg; 3720 3721 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3722 LLT UnpackedTy = Ty.changeElementSize(32); 3723 3724 if (IsExtLoad) 3725 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3726 else if (Unpacked && IsD16 && Ty.isVector()) 3727 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3728 else 3729 LoadDstReg = Dst; 3730 3731 if (!VIndex) 3732 VIndex = B.buildConstant(S32, 0).getReg(0); 3733 3734 auto MIB = B.buildInstr(Opc) 3735 .addDef(LoadDstReg) // vdata 3736 .addUse(RSrc) // rsrc 3737 .addUse(VIndex) // vindex 3738 .addUse(VOffset) // voffset 3739 .addUse(SOffset) // soffset 3740 .addImm(ImmOffset); // offset(imm) 3741 3742 if (IsTyped) 3743 MIB.addImm(Format); 3744 3745 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3746 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3747 .addMemOperand(MMO); 3748 3749 if (LoadDstReg != Dst) { 3750 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3751 3752 // Widen result for extending loads was widened. 
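// (The load above was emitted into the widened LoadDstReg; truncate or repack
// it back into the original narrow result type here.)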
3753 if (IsExtLoad) 3754 B.buildTrunc(Dst, LoadDstReg); 3755 else { 3756 // Repack to original 16-bit vector result 3757 // FIXME: G_TRUNC should work, but legalization currently fails 3758 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3759 SmallVector<Register, 4> Repack; 3760 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3761 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3762 B.buildMerge(Dst, Repack); 3763 } 3764 } 3765 3766 MI.eraseFromParent(); 3767 return true; 3768 } 3769 3770 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3771 MachineIRBuilder &B, 3772 bool IsInc) const { 3773 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3774 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3775 B.buildInstr(Opc) 3776 .addDef(MI.getOperand(0).getReg()) 3777 .addUse(MI.getOperand(2).getReg()) 3778 .addUse(MI.getOperand(3).getReg()) 3779 .cloneMemRefs(MI); 3780 MI.eraseFromParent(); 3781 return true; 3782 } 3783 3784 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3785 switch (IntrID) { 3786 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3787 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3788 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3789 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3790 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3791 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3792 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3793 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3794 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3795 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3796 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3797 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3798 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3799 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3800 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3801 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3802 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3803 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3804 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3805 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3806 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3807 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3808 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3809 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3810 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3811 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3812 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3813 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3814 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3815 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3816 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3817 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3818 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3819 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3820 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3821 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3822 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3823 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3824 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3825 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3826 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3827 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3828 default: 3829 llvm_unreachable("unhandled atomic opcode"); 3830 } 3831 } 3832 3833 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3834 MachineIRBuilder &B, 3835 Intrinsic::ID IID) const { 3836 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3837 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3838 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3839 3840 Register Dst; 3841 3842 int OpOffset = 0; 3843 if (HasReturn) { 3844 // A few FP atomics do not support return values. 3845 Dst = MI.getOperand(0).getReg(); 3846 } else { 3847 OpOffset = -1; 3848 } 3849 3850 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3851 Register CmpVal; 3852 3853 if (IsCmpSwap) { 3854 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3855 ++OpOffset; 3856 } 3857 3858 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3859 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3860 3861 // The struct intrinsic variants add one additional operand over raw. 3862 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3863 Register VIndex; 3864 if (HasVIndex) { 3865 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3866 ++OpOffset; 3867 } 3868 3869 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3870 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3871 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3872 3873 MachineMemOperand *MMO = *MI.memoperands_begin(); 3874 3875 unsigned ImmOffset; 3876 unsigned TotalOffset; 3877 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3878 if (TotalOffset != 0) 3879 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3880 3881 if (!VIndex) 3882 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3883 3884 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3885 3886 if (HasReturn) 3887 MIB.addDef(Dst); 3888 3889 MIB.addUse(VData); // vdata 3890 3891 if (IsCmpSwap) 3892 MIB.addReg(CmpVal); 3893 3894 MIB.addUse(RSrc) // rsrc 3895 .addUse(VIndex) // vindex 3896 .addUse(VOffset) // voffset 3897 .addUse(SOffset) // soffset 3898 .addImm(ImmOffset) // offset(imm) 3899 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3900 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3901 .addMemOperand(MMO); 3902 3903 MI.eraseFromParent(); 3904 return true; 3905 } 3906 3907 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3908 /// vector with s16 typed elements. 3909 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3910 SmallVectorImpl<Register> &PackedAddrs, 3911 int AddrIdx, int DimIdx, int EndIdx, 3912 int NumGradients) { 3913 const LLT S16 = LLT::scalar(16); 3914 const LLT V2S16 = LLT::vector(2, 16); 3915 3916 for (int I = AddrIdx; I < EndIdx; ++I) { 3917 MachineOperand &SrcOp = MI.getOperand(I); 3918 if (!SrcOp.isReg()) 3919 continue; // _L to _LZ may have eliminated this. 3920 3921 Register AddrReg = SrcOp.getReg(); 3922 3923 if (I < DimIdx) { 3924 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3925 PackedAddrs.push_back(AddrReg); 3926 } else { 3927 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3928 // derivatives dx/dh and dx/dv are packed with undef. 
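// Concretely, the cases below pad with undef: the final address element, the
// last element of each gradient group when the per-direction gradient count is
// odd, and an element whose neighbor was removed by the _L to _LZ fold.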
3929 if (((I + 1) >= EndIdx) ||
3930 ((NumGradients / 2) % 2 == 1 &&
3931 (I == DimIdx + (NumGradients / 2) - 1 ||
3932 I == DimIdx + NumGradients - 1)) ||
3933 // Check for _L to _LZ optimization
3934 !MI.getOperand(I + 1).isReg()) {
3935 PackedAddrs.push_back(
3936 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3937 .getReg(0));
3938 } else {
3939 PackedAddrs.push_back(
3940 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3941 .getReg(0));
3942 ++I;
3943 }
3944 }
3945 }
3946 }
3947
3948 /// Convert from separate vaddr components to a single vector address register,
3949 /// and replace the remaining operands with $noreg.
3950 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3951 int DimIdx, int NumVAddrs) {
3952 const LLT S32 = LLT::scalar(32);
3953
3954 SmallVector<Register, 8> AddrRegs;
3955 for (int I = 0; I != NumVAddrs; ++I) {
3956 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3957 if (SrcOp.isReg()) {
3958 AddrRegs.push_back(SrcOp.getReg());
3959 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3960 }
3961 }
3962
3963 int NumAddrRegs = AddrRegs.size();
3964 if (NumAddrRegs != 1) {
3965 // Round up to 8 elements for v5-v7
3966 // FIXME: Missing intermediate sized register classes and instructions.
3967 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3968 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3969 auto Undef = B.buildUndef(S32);
3970 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3971 NumAddrRegs = RoundedNumRegs;
3972 }
3973
3974 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3975 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3976 }
3977
3978 for (int I = 1; I != NumVAddrs; ++I) {
3979 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3980 if (SrcOp.isReg())
3981 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3982 }
3983 }
3984
3985 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3986 ///
3987 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3988 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3989 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3990 /// registers.
3991 ///
3992 /// We don't want to directly select image instructions just yet, but also want
3993 /// to expose all register repacking to the legalizer/combiners. We also don't
3994 /// want a selected instruction entering RegBankSelect. In order to avoid
3995 /// defining a multitude of intermediate image instructions, directly hack on
3996 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3997 /// now unnecessary arguments with $noreg.
3998 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3999 MachineInstr &MI, MachineIRBuilder &B,
4000 GISelChangeObserver &Observer,
4001 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
4002
4003 const int NumDefs = MI.getNumExplicitDefs();
4004 bool IsTFE = NumDefs == 2;
4005 // We are only processing the operands of d16 image operations on subtargets
4006 // that use the unpacked register layout, or need to repack the TFE result.
4007
4008 // TODO: Do we need to guard against already legalized intrinsics?
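// Rough flow: look up the MIMG base opcode info, apply the _L to _LZ and
// zero-LOD _mip simplifications, repack 16-bit addresses for a16/g16, then
// rewrite the d16/TFE result layout expected by the subtarget.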
4009 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4010 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 4011 4012 MachineRegisterInfo *MRI = B.getMRI(); 4013 const LLT S32 = LLT::scalar(32); 4014 const LLT S16 = LLT::scalar(16); 4015 const LLT V2S16 = LLT::vector(2, 16); 4016 4017 // Index of first address argument 4018 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 4019 4020 int NumVAddrs, NumGradients; 4021 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 4022 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 4023 getDMaskIdx(BaseOpcode, NumDefs); 4024 unsigned DMask = 0; 4025 4026 // Check for 16 bit addresses and pack if true. 4027 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 4028 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 4029 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 4030 const bool IsG16 = GradTy == S16; 4031 const bool IsA16 = AddrTy == S16; 4032 4033 int DMaskLanes = 0; 4034 if (!BaseOpcode->Atomic) { 4035 DMask = MI.getOperand(DMaskIdx).getImm(); 4036 if (BaseOpcode->Gather4) { 4037 DMaskLanes = 4; 4038 } else if (DMask != 0) { 4039 DMaskLanes = countPopulation(DMask); 4040 } else if (!IsTFE && !BaseOpcode->Store) { 4041 // If dmask is 0, this is a no-op load. This can be eliminated. 4042 B.buildUndef(MI.getOperand(0)); 4043 MI.eraseFromParent(); 4044 return true; 4045 } 4046 } 4047 4048 Observer.changingInstr(MI); 4049 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4050 4051 unsigned NewOpcode = NumDefs == 0 ? 4052 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4053 4054 // Track that we legalized this 4055 MI.setDesc(B.getTII().get(NewOpcode)); 4056 4057 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4058 // dmask to be at least 1 otherwise the instruction will fail 4059 if (IsTFE && DMask == 0) { 4060 DMask = 0x1; 4061 DMaskLanes = 1; 4062 MI.getOperand(DMaskIdx).setImm(DMask); 4063 } 4064 4065 if (BaseOpcode->Atomic) { 4066 Register VData0 = MI.getOperand(2).getReg(); 4067 LLT Ty = MRI->getType(VData0); 4068 4069 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4070 if (Ty.isVector()) 4071 return false; 4072 4073 if (BaseOpcode->AtomicX2) { 4074 Register VData1 = MI.getOperand(3).getReg(); 4075 // The two values are packed in one register. 4076 LLT PackedTy = LLT::vector(2, Ty); 4077 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4078 MI.getOperand(2).setReg(Concat.getReg(0)); 4079 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4080 } 4081 } 4082 4083 int CorrectedNumVAddrs = NumVAddrs; 4084 4085 // Optimize _L to _LZ when _L is zero 4086 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4087 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 4088 const ConstantFP *ConstantLod; 4089 const int LodIdx = AddrIdx + NumVAddrs - 1; 4090 4091 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 4092 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4093 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4094 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 4095 LZMappingInfo->LZ, ImageDimIntr->Dim); 4096 4097 // The starting indexes should remain in the same place. 
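// Dropping the LOD operand shrinks the vaddr count by one; the switch to the
// _lz form itself happens by rewriting the intrinsic ID just below.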
4098 --NumVAddrs; 4099 --CorrectedNumVAddrs; 4100 4101 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 4102 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 4103 MI.RemoveOperand(LodIdx); 4104 } 4105 } 4106 } 4107 4108 // Optimize _mip away, when 'lod' is zero 4109 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 4110 int64_t ConstantLod; 4111 const int LodIdx = AddrIdx + NumVAddrs - 1; 4112 4113 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 4114 if (ConstantLod == 0) { 4115 // TODO: Change intrinsic opcode and remove operand instead or replacing 4116 // it with 0, as the _L to _LZ handling is done above. 4117 MI.getOperand(LodIdx).ChangeToImmediate(0); 4118 --CorrectedNumVAddrs; 4119 } 4120 } 4121 } 4122 4123 // Rewrite the addressing register layout before doing anything else. 4124 if (IsA16 || IsG16) { 4125 if (IsA16) { 4126 // Target must support the feature and gradients need to be 16 bit too 4127 if (!ST.hasA16() || !IsG16) 4128 return false; 4129 } else if (!ST.hasG16()) 4130 return false; 4131 4132 if (NumVAddrs > 1) { 4133 SmallVector<Register, 4> PackedRegs; 4134 // Don't compress addresses for G16 4135 const int PackEndIdx = 4136 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4137 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4138 PackEndIdx, NumGradients); 4139 4140 if (!IsA16) { 4141 // Add uncompressed address 4142 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4143 int AddrReg = MI.getOperand(I).getReg(); 4144 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4145 PackedRegs.push_back(AddrReg); 4146 } 4147 } 4148 4149 // See also below in the non-a16 branch 4150 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4151 4152 if (!UseNSA && PackedRegs.size() > 1) { 4153 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4154 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4155 PackedRegs[0] = Concat.getReg(0); 4156 PackedRegs.resize(1); 4157 } 4158 4159 const int NumPacked = PackedRegs.size(); 4160 for (int I = 0; I != NumVAddrs; ++I) { 4161 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4162 if (!SrcOp.isReg()) { 4163 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4164 continue; 4165 } 4166 4167 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4168 4169 if (I < NumPacked) 4170 SrcOp.setReg(PackedRegs[I]); 4171 else 4172 SrcOp.setReg(AMDGPU::NoRegister); 4173 } 4174 } 4175 } else { 4176 // If the register allocator cannot place the address registers contiguously 4177 // without introducing moves, then using the non-sequential address encoding 4178 // is always preferable, since it saves VALU instructions and is usually a 4179 // wash in terms of code size or even better. 4180 // 4181 // However, we currently have no way of hinting to the register allocator 4182 // that MIMG addresses should be placed contiguously when it is possible to 4183 // do so, so force non-NSA for the common 2-address case as a heuristic. 4184 // 4185 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4186 // allocation when possible. 4187 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4188 4189 if (!UseNSA && NumVAddrs > 1) 4190 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4191 } 4192 4193 int Flags = 0; 4194 if (IsA16) 4195 Flags |= 1; 4196 if (IsG16) 4197 Flags |= 2; 4198 MI.addOperand(MachineOperand::CreateImm(Flags)); 4199 4200 if (BaseOpcode->Store) { // No TFE for stores? 
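// Stores only need the source repacked for d16 data on unpacked subtargets;
// there is no result layout to rewrite.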
4201 // TODO: Handle dmask trim
4202 Register VData = MI.getOperand(1).getReg();
4203 LLT Ty = MRI->getType(VData);
4204 if (!Ty.isVector() || Ty.getElementType() != S16)
4205 return true;
4206
4207 Register RepackedReg = handleD16VData(B, *MRI, VData);
4208 if (RepackedReg != VData) {
4209 MI.getOperand(1).setReg(RepackedReg);
4210 }
4211
4212 return true;
4213 }
4214
4215 Register DstReg = MI.getOperand(0).getReg();
4216 LLT Ty = MRI->getType(DstReg);
4217 const LLT EltTy = Ty.getScalarType();
4218 const bool IsD16 = Ty.getScalarType() == S16;
4219 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4220
4221 // Confirm that the return type is large enough for the dmask specified
4222 if (NumElts < DMaskLanes)
4223 return false;
4224
4225 if (NumElts > 4 || DMaskLanes > 4)
4226 return false;
4227
4228 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4229 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4230
4231 // The raw dword aligned data component of the load. The only legal cases
4232 // where this matters should be when using the packed D16 format, e.g.
4233 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4234 LLT RoundedTy;
4235
4236 // S32 vector to cover all data, plus TFE result element.
4237 LLT TFETy;
4238
4239 // Register type to use for each loaded component. Will be S32 or V2S16.
4240 LLT RegTy;
4241
4242 if (IsD16 && ST.hasUnpackedD16VMem()) {
4243 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4244 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4245 RegTy = S32;
4246 } else {
4247 unsigned EltSize = EltTy.getSizeInBits();
4248 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4249 unsigned RoundedSize = 32 * RoundedElts;
4250 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4251 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4252 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4253 }
4254
4255 // The return type does not need adjustment.
4256 // TODO: Should we change s16 case to s32 or <2 x s16>?
4257 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4258 return true;
4259
4260 Register Dst1Reg;
4261
4262 // Insert after the instruction.
4263 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4264
4265 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4266 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4267 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4268 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4269
4270 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4271
4272 MI.getOperand(0).setReg(NewResultReg);
4273
4274 // In the IR, TFE is supposed to be used with a 2 element struct return
4275 // type. The instruction really returns these two values in one contiguous
4276 // register, with one additional dword beyond the loaded data. Rewrite the
4277 // return type to use a single register result.
4278
4279 if (IsTFE) {
4280 Dst1Reg = MI.getOperand(1).getReg();
4281 if (MRI->getType(Dst1Reg) != S32)
4282 return false;
4283
4284 // TODO: Make sure the TFE operand bit is set.
4285 MI.RemoveOperand(1);
4286
4287 // Handle the easy case that requires no repack instructions.
4288 if (Ty == S32) {
4289 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4290 return true;
4291 }
4292 }
4293
4294 // Now figure out how to copy the new result register back into the old
4295 // result.
4296 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4297
4298 const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4299 4300 if (ResultNumRegs == 1) { 4301 assert(!IsTFE); 4302 ResultRegs[0] = NewResultReg; 4303 } else { 4304 // We have to repack into a new vector of some kind. 4305 for (int I = 0; I != NumDataRegs; ++I) 4306 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4307 B.buildUnmerge(ResultRegs, NewResultReg); 4308 4309 // Drop the final TFE element to get the data part. The TFE result is 4310 // directly written to the right place already. 4311 if (IsTFE) 4312 ResultRegs.resize(NumDataRegs); 4313 } 4314 4315 // For an s16 scalar result, we form an s32 result with a truncate regardless 4316 // of packed vs. unpacked. 4317 if (IsD16 && !Ty.isVector()) { 4318 B.buildTrunc(DstReg, ResultRegs[0]); 4319 return true; 4320 } 4321 4322 // Avoid a build/concat_vector of 1 entry. 4323 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4324 B.buildBitcast(DstReg, ResultRegs[0]); 4325 return true; 4326 } 4327 4328 assert(Ty.isVector()); 4329 4330 if (IsD16) { 4331 // For packed D16 results with TFE enabled, all the data components are 4332 // S32. Cast back to the expected type. 4333 // 4334 // TODO: We don't really need to use load s32 elements. We would only need one 4335 // cast for the TFE result if a multiple of v2s16 was used. 4336 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4337 for (Register &Reg : ResultRegs) 4338 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4339 } else if (ST.hasUnpackedD16VMem()) { 4340 for (Register &Reg : ResultRegs) 4341 Reg = B.buildTrunc(S16, Reg).getReg(0); 4342 } 4343 } 4344 4345 auto padWithUndef = [&](LLT Ty, int NumElts) { 4346 if (NumElts == 0) 4347 return; 4348 Register Undef = B.buildUndef(Ty).getReg(0); 4349 for (int I = 0; I != NumElts; ++I) 4350 ResultRegs.push_back(Undef); 4351 }; 4352 4353 // Pad out any elements eliminated due to the dmask. 4354 LLT ResTy = MRI->getType(ResultRegs[0]); 4355 if (!ResTy.isVector()) { 4356 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4357 B.buildBuildVector(DstReg, ResultRegs); 4358 return true; 4359 } 4360 4361 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4362 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4363 4364 // Deal with the one annoying legal case. 4365 const LLT V3S16 = LLT::vector(3, 16); 4366 if (Ty == V3S16) { 4367 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4368 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4369 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4370 return true; 4371 } 4372 4373 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4374 B.buildConcatVectors(DstReg, ResultRegs); 4375 return true; 4376 } 4377 4378 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4379 LegalizerHelper &Helper, MachineInstr &MI) const { 4380 MachineIRBuilder &B = Helper.MIRBuilder; 4381 GISelChangeObserver &Observer = Helper.Observer; 4382 4383 Register Dst = MI.getOperand(0).getReg(); 4384 LLT Ty = B.getMRI()->getType(Dst); 4385 unsigned Size = Ty.getSizeInBits(); 4386 MachineFunction &MF = B.getMF(); 4387 4388 Observer.changingInstr(MI); 4389 4390 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4391 Ty = getBitcastRegisterType(Ty); 4392 Helper.bitcastDst(MI, Ty, 0); 4393 Dst = MI.getOperand(0).getReg(); 4394 B.setInsertPt(B.getMBB(), MI); 4395 } 4396 4397 // FIXME: We don't really need this intermediate instruction. The intrinsic 4398 // should be fixed to have a memory operand. Since it's readnone, we're not 4399 // allowed to add one. 
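// Rewrite to the target pseudo so a memory operand can be attached below and
// the result can be widened to a legal scalar load size.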
4400 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4401 MI.RemoveOperand(1); // Remove intrinsic ID 4402 4403 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4404 // TODO: Should this use datalayout alignment? 4405 const unsigned MemSize = (Size + 7) / 8; 4406 const Align MemAlign(4); 4407 MachineMemOperand *MMO = MF.getMachineMemOperand( 4408 MachinePointerInfo(), 4409 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4410 MachineMemOperand::MOInvariant, 4411 MemSize, MemAlign); 4412 MI.addMemOperand(MF, MMO); 4413 4414 // There are no 96-bit result scalar loads, but widening to 128-bit should 4415 // always be legal. We may need to restore this to a 96-bit result if it turns 4416 // out this needs to be converted to a vector load during RegBankSelect. 4417 if (!isPowerOf2_32(Size)) { 4418 if (Ty.isVector()) 4419 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4420 else 4421 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4422 } 4423 4424 Observer.changedInstr(MI); 4425 return true; 4426 } 4427 4428 // TODO: Move to selection 4429 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4430 MachineRegisterInfo &MRI, 4431 MachineIRBuilder &B) const { 4432 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4433 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4434 !ST.isTrapHandlerEnabled()) { 4435 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4436 } else { 4437 // Pass queue pointer to trap handler as input, and insert trap instruction 4438 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4439 MachineRegisterInfo &MRI = *B.getMRI(); 4440 4441 Register LiveIn = 4442 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4443 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4444 return false; 4445 4446 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4447 B.buildCopy(SGPR01, LiveIn); 4448 B.buildInstr(AMDGPU::S_TRAP) 4449 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4450 .addReg(SGPR01, RegState::Implicit); 4451 } 4452 4453 MI.eraseFromParent(); 4454 return true; 4455 } 4456 4457 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4458 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4459 // Is non-HSA path or trap-handler disabled? then, report a warning 4460 // accordingly 4461 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4462 !ST.isTrapHandlerEnabled()) { 4463 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4464 "debugtrap handler not supported", 4465 MI.getDebugLoc(), DS_Warning); 4466 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4467 Ctx.diagnose(NoTrap); 4468 } else { 4469 // Insert debug-trap instruction 4470 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4471 } 4472 4473 MI.eraseFromParent(); 4474 return true; 4475 } 4476 4477 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4478 MachineInstr &MI) const { 4479 MachineIRBuilder &B = Helper.MIRBuilder; 4480 MachineRegisterInfo &MRI = *B.getMRI(); 4481 4482 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
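// amdgcn_if/else/loop are paired with a G_BRCOND on their boolean result; the
// cases below rewrite that pair into SI_IF/SI_ELSE/SI_LOOP plus an explicit
// branch, swapping the branch targets when the condition was negated.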
4483 auto IntrID = MI.getIntrinsicID(); 4484 switch (IntrID) { 4485 case Intrinsic::amdgcn_if: 4486 case Intrinsic::amdgcn_else: { 4487 MachineInstr *Br = nullptr; 4488 MachineBasicBlock *UncondBrTarget = nullptr; 4489 bool Negated = false; 4490 if (MachineInstr *BrCond = 4491 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4492 const SIRegisterInfo *TRI 4493 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4494 4495 Register Def = MI.getOperand(1).getReg(); 4496 Register Use = MI.getOperand(3).getReg(); 4497 4498 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4499 4500 if (Negated) 4501 std::swap(CondBrTarget, UncondBrTarget); 4502 4503 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4504 if (IntrID == Intrinsic::amdgcn_if) { 4505 B.buildInstr(AMDGPU::SI_IF) 4506 .addDef(Def) 4507 .addUse(Use) 4508 .addMBB(UncondBrTarget); 4509 } else { 4510 B.buildInstr(AMDGPU::SI_ELSE) 4511 .addDef(Def) 4512 .addUse(Use) 4513 .addMBB(UncondBrTarget) 4514 .addImm(0); 4515 } 4516 4517 if (Br) { 4518 Br->getOperand(0).setMBB(CondBrTarget); 4519 } else { 4520 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4521 // since we're swapping branch targets it needs to be reinserted. 4522 // FIXME: IRTranslator should probably not do this 4523 B.buildBr(*CondBrTarget); 4524 } 4525 4526 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4527 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4528 MI.eraseFromParent(); 4529 BrCond->eraseFromParent(); 4530 return true; 4531 } 4532 4533 return false; 4534 } 4535 case Intrinsic::amdgcn_loop: { 4536 MachineInstr *Br = nullptr; 4537 MachineBasicBlock *UncondBrTarget = nullptr; 4538 bool Negated = false; 4539 if (MachineInstr *BrCond = 4540 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4541 const SIRegisterInfo *TRI 4542 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4543 4544 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4545 Register Reg = MI.getOperand(2).getReg(); 4546 4547 if (Negated) 4548 std::swap(CondBrTarget, UncondBrTarget); 4549 4550 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4551 B.buildInstr(AMDGPU::SI_LOOP) 4552 .addUse(Reg) 4553 .addMBB(UncondBrTarget); 4554 4555 if (Br) 4556 Br->getOperand(0).setMBB(CondBrTarget); 4557 else 4558 B.buildBr(*CondBrTarget); 4559 4560 MI.eraseFromParent(); 4561 BrCond->eraseFromParent(); 4562 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4563 return true; 4564 } 4565 4566 return false; 4567 } 4568 case Intrinsic::amdgcn_kernarg_segment_ptr: 4569 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4570 // This only makes sense to call in a kernel, so just lower to null. 
4571 B.buildConstant(MI.getOperand(0).getReg(), 0); 4572 MI.eraseFromParent(); 4573 return true; 4574 } 4575 4576 return legalizePreloadedArgIntrin( 4577 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4578 case Intrinsic::amdgcn_implicitarg_ptr: 4579 return legalizeImplicitArgPtr(MI, MRI, B); 4580 case Intrinsic::amdgcn_workitem_id_x: 4581 return legalizePreloadedArgIntrin(MI, MRI, B, 4582 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4583 case Intrinsic::amdgcn_workitem_id_y: 4584 return legalizePreloadedArgIntrin(MI, MRI, B, 4585 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4586 case Intrinsic::amdgcn_workitem_id_z: 4587 return legalizePreloadedArgIntrin(MI, MRI, B, 4588 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4589 case Intrinsic::amdgcn_workgroup_id_x: 4590 return legalizePreloadedArgIntrin(MI, MRI, B, 4591 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4592 case Intrinsic::amdgcn_workgroup_id_y: 4593 return legalizePreloadedArgIntrin(MI, MRI, B, 4594 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4595 case Intrinsic::amdgcn_workgroup_id_z: 4596 return legalizePreloadedArgIntrin(MI, MRI, B, 4597 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4598 case Intrinsic::amdgcn_dispatch_ptr: 4599 return legalizePreloadedArgIntrin(MI, MRI, B, 4600 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4601 case Intrinsic::amdgcn_queue_ptr: 4602 return legalizePreloadedArgIntrin(MI, MRI, B, 4603 AMDGPUFunctionArgInfo::QUEUE_PTR); 4604 case Intrinsic::amdgcn_implicit_buffer_ptr: 4605 return legalizePreloadedArgIntrin( 4606 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4607 case Intrinsic::amdgcn_dispatch_id: 4608 return legalizePreloadedArgIntrin(MI, MRI, B, 4609 AMDGPUFunctionArgInfo::DISPATCH_ID); 4610 case Intrinsic::amdgcn_fdiv_fast: 4611 return legalizeFDIVFastIntrin(MI, MRI, B); 4612 case Intrinsic::amdgcn_is_shared: 4613 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4614 case Intrinsic::amdgcn_is_private: 4615 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4616 case Intrinsic::amdgcn_wavefrontsize: { 4617 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4618 MI.eraseFromParent(); 4619 return true; 4620 } 4621 case Intrinsic::amdgcn_s_buffer_load: 4622 return legalizeSBufferLoad(Helper, MI); 4623 case Intrinsic::amdgcn_raw_buffer_store: 4624 case Intrinsic::amdgcn_struct_buffer_store: 4625 return legalizeBufferStore(MI, MRI, B, false, false); 4626 case Intrinsic::amdgcn_raw_buffer_store_format: 4627 case Intrinsic::amdgcn_struct_buffer_store_format: 4628 return legalizeBufferStore(MI, MRI, B, false, true); 4629 case Intrinsic::amdgcn_raw_tbuffer_store: 4630 case Intrinsic::amdgcn_struct_tbuffer_store: 4631 return legalizeBufferStore(MI, MRI, B, true, true); 4632 case Intrinsic::amdgcn_raw_buffer_load: 4633 case Intrinsic::amdgcn_struct_buffer_load: 4634 return legalizeBufferLoad(MI, MRI, B, false, false); 4635 case Intrinsic::amdgcn_raw_buffer_load_format: 4636 case Intrinsic::amdgcn_struct_buffer_load_format: 4637 return legalizeBufferLoad(MI, MRI, B, true, false); 4638 case Intrinsic::amdgcn_raw_tbuffer_load: 4639 case Intrinsic::amdgcn_struct_tbuffer_load: 4640 return legalizeBufferLoad(MI, MRI, B, true, true); 4641 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4642 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4643 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4644 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4645 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4646 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4647 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4648 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4649 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4650 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4651 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4652 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4653 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4654 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4655 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4656 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4657 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4658 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4659 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4660 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4661 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4662 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4663 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4664 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4665 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4666 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4667 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4668 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4669 return legalizeBufferAtomic(MI, B, IntrID); 4670 case Intrinsic::amdgcn_atomic_inc: 4671 return legalizeAtomicIncDec(MI, B, true); 4672 case Intrinsic::amdgcn_atomic_dec: 4673 return legalizeAtomicIncDec(MI, B, false); 4674 case Intrinsic::trap: 4675 return legalizeTrapIntrinsic(MI, MRI, B); 4676 case Intrinsic::debugtrap: 4677 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4678 case Intrinsic::amdgcn_rsq_clamp: 4679 return legalizeRsqClampIntrinsic(MI, MRI, B); 4680 case Intrinsic::amdgcn_ds_fadd: 4681 case Intrinsic::amdgcn_ds_fmin: 4682 case Intrinsic::amdgcn_ds_fmax: 4683 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4684 default: { 4685 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4686 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4687 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4688 return true; 4689 } 4690 } 4691 4692 return true; 4693 } 4694