1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46
47 static constexpr unsigned MaxRegisterSize = 1024;
48
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62
63 /// \returns true if this is an odd sized vector which should widen by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
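/// Widening <3 x s16> (48 bits) to <4 x s16> (64 bits) lets the value fill two
/// 32-bit registers evenly rather than straddling a partial one.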
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 111 }; 112 } 113 114 // Increase the number of vector elements to reach the next multiple of 32-bit 115 // type. 116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 117 return [=](const LegalityQuery &Query) { 118 const LLT Ty = Query.Types[TypeIdx]; 119 120 const LLT EltTy = Ty.getElementType(); 121 const int Size = Ty.getSizeInBits(); 122 const int EltSize = EltTy.getSizeInBits(); 123 const int NextMul32 = (Size + 31) / 32; 124 125 assert(EltSize < 32); 126 127 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 128 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 129 }; 130 } 131 132 static LLT getBitcastRegisterType(const LLT Ty) { 133 const unsigned Size = Ty.getSizeInBits(); 134 135 LLT CoercedTy; 136 if (Size <= 32) { 137 // <2 x s8> -> s16 138 // <4 x s8> -> s32 139 return LLT::scalar(Size); 140 } 141 142 return LLT::scalarOrVector(Size / 32, 32); 143 } 144 145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 146 return [=](const LegalityQuery &Query) { 147 const LLT Ty = Query.Types[TypeIdx]; 148 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 149 }; 150 } 151 152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 unsigned Size = Ty.getSizeInBits(); 156 assert(Size % 32 == 0); 157 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 158 }; 159 } 160 161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 162 return [=](const LegalityQuery &Query) { 163 const LLT QueryTy = Query.Types[TypeIdx]; 164 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 165 }; 166 } 167 168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 169 return [=](const LegalityQuery &Query) { 170 const LLT QueryTy = Query.Types[TypeIdx]; 171 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 172 }; 
173 } 174 175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 176 return [=](const LegalityQuery &Query) { 177 const LLT QueryTy = Query.Types[TypeIdx]; 178 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 179 }; 180 } 181 182 static bool isRegisterSize(unsigned Size) { 183 return Size % 32 == 0 && Size <= MaxRegisterSize; 184 } 185 186 static bool isRegisterVectorElementType(LLT EltTy) { 187 const int EltSize = EltTy.getSizeInBits(); 188 return EltSize == 16 || EltSize % 32 == 0; 189 } 190 191 static bool isRegisterVectorType(LLT Ty) { 192 const int EltSize = Ty.getElementType().getSizeInBits(); 193 return EltSize == 32 || EltSize == 64 || 194 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 195 EltSize == 128 || EltSize == 256; 196 } 197 198 static bool isRegisterType(LLT Ty) { 199 if (!isRegisterSize(Ty.getSizeInBits())) 200 return false; 201 202 if (Ty.isVector()) 203 return isRegisterVectorType(Ty); 204 205 return true; 206 } 207 208 // Any combination of 32 or 64-bit elements up the maximum register size, and 209 // multiples of v2s16. 210 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 211 return [=](const LegalityQuery &Query) { 212 return isRegisterType(Query.Types[TypeIdx]); 213 }; 214 } 215 216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 217 return [=](const LegalityQuery &Query) { 218 const LLT QueryTy = Query.Types[TypeIdx]; 219 if (!QueryTy.isVector()) 220 return false; 221 const LLT EltTy = QueryTy.getElementType(); 222 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 223 }; 224 } 225 226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 227 return [=](const LegalityQuery &Query) { 228 const LLT Ty = Query.Types[TypeIdx]; 229 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 230 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 231 }; 232 } 233 234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 235 // handle some operations by just promoting the register during 236 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 238 bool IsLoad) { 239 switch (AS) { 240 case AMDGPUAS::PRIVATE_ADDRESS: 241 // FIXME: Private element size. 242 return 32; 243 case AMDGPUAS::LOCAL_ADDRESS: 244 return ST.useDS128() ? 128 : 64; 245 case AMDGPUAS::GLOBAL_ADDRESS: 246 case AMDGPUAS::CONSTANT_ADDRESS: 247 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 248 // Treat constant and global as identical. SMRD loads are sometimes usable for 249 // global loads (ideally constant address space should be eliminated) 250 // depending on the context. Legality cannot be context dependent, but 251 // RegBankSelect can split the load as necessary depending on the pointer 252 // register bank/uniformity and if the memory is invariant or not written in a 253 // kernel. 254 return IsLoad ? 512 : 128; 255 default: 256 // Flat addresses may contextually need to be split to 32-bit parts if they 257 // may alias scratch depending on the subtarget. 
258     return 128;
259   }
260 }
261
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312
313   assert(RegSize >= MemSize);
314
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321
322   return true;
323 }
324
325 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
326 // work around this. Eventually it should ignore the type for loads and only care
327 // about the size. Return true in cases where we will work around this for now by
328 // bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
353   const unsigned Size = Ty.getSizeInBits();
354   if (Size != MemSizeInBits)
355     return Size <= 32 && Ty.isVector();
356
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362
363 /// Return true if we should legalize a load by widening an odd sized memory
364 /// access up to the alignment. Note this is the case when the memory access
365 /// itself changes, not the size of the result register.
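/// For example, on a subtarget without 96-bit (dwordx3) memory operations, a
/// 96-bit load with 128-bit alignment may be widened to a 128-bit load.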
366 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 367 unsigned AlignInBits, unsigned AddrSpace, 368 unsigned Opcode) { 369 // We don't want to widen cases that are naturally legal. 370 if (isPowerOf2_32(SizeInBits)) 371 return false; 372 373 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 374 // end up widening these for a scalar load during RegBankSelect, since there 375 // aren't 96-bit scalar loads. 376 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 377 return false; 378 379 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 380 return false; 381 382 // A load is known dereferenceable up to the alignment, so it's legal to widen 383 // to it. 384 // 385 // TODO: Could check dereferenceable for less aligned cases. 386 unsigned RoundedSize = NextPowerOf2(SizeInBits); 387 if (AlignInBits < RoundedSize) 388 return false; 389 390 // Do not widen if it would introduce a slow unaligned load. 391 const SITargetLowering *TLI = ST.getTargetLowering(); 392 bool Fast = false; 393 return TLI->allowsMisalignedMemoryAccessesImpl( 394 RoundedSize, AddrSpace, Align(AlignInBits / 8), 395 MachineMemOperand::MOLoad, &Fast) && 396 Fast; 397 } 398 399 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 400 unsigned Opcode) { 401 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 402 return false; 403 404 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 405 Query.MMODescrs[0].AlignInBits, 406 Query.Types[1].getAddressSpace(), Opcode); 407 } 408 409 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 410 const GCNTargetMachine &TM) 411 : ST(ST_) { 412 using namespace TargetOpcode; 413 414 auto GetAddrSpacePtr = [&TM](unsigned AS) { 415 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 416 }; 417 418 const LLT S1 = LLT::scalar(1); 419 const LLT S16 = LLT::scalar(16); 420 const LLT S32 = LLT::scalar(32); 421 const LLT S64 = LLT::scalar(64); 422 const LLT S128 = LLT::scalar(128); 423 const LLT S256 = LLT::scalar(256); 424 const LLT S512 = LLT::scalar(512); 425 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 426 427 const LLT V2S16 = LLT::vector(2, 16); 428 const LLT V4S16 = LLT::vector(4, 16); 429 430 const LLT V2S32 = LLT::vector(2, 32); 431 const LLT V3S32 = LLT::vector(3, 32); 432 const LLT V4S32 = LLT::vector(4, 32); 433 const LLT V5S32 = LLT::vector(5, 32); 434 const LLT V6S32 = LLT::vector(6, 32); 435 const LLT V7S32 = LLT::vector(7, 32); 436 const LLT V8S32 = LLT::vector(8, 32); 437 const LLT V9S32 = LLT::vector(9, 32); 438 const LLT V10S32 = LLT::vector(10, 32); 439 const LLT V11S32 = LLT::vector(11, 32); 440 const LLT V12S32 = LLT::vector(12, 32); 441 const LLT V13S32 = LLT::vector(13, 32); 442 const LLT V14S32 = LLT::vector(14, 32); 443 const LLT V15S32 = LLT::vector(15, 32); 444 const LLT V16S32 = LLT::vector(16, 32); 445 const LLT V32S32 = LLT::vector(32, 32); 446 447 const LLT V2S64 = LLT::vector(2, 64); 448 const LLT V3S64 = LLT::vector(3, 64); 449 const LLT V4S64 = LLT::vector(4, 64); 450 const LLT V5S64 = LLT::vector(5, 64); 451 const LLT V6S64 = LLT::vector(6, 64); 452 const LLT V7S64 = LLT::vector(7, 64); 453 const LLT V8S64 = LLT::vector(8, 64); 454 const LLT V16S64 = LLT::vector(16, 64); 455 456 std::initializer_list<LLT> AllS32Vectors = 457 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 458 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 459 std::initializer_list<LLT> AllS64Vectors = 460 {V2S64, V3S64, V4S64, V5S64, V6S64, 
V7S64, V8S64, V16S64}; 461 462 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 463 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 464 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 465 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 466 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 467 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 468 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 469 470 const LLT CodePtr = FlatPtr; 471 472 const std::initializer_list<LLT> AddrSpaces64 = { 473 GlobalPtr, ConstantPtr, FlatPtr 474 }; 475 476 const std::initializer_list<LLT> AddrSpaces32 = { 477 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 478 }; 479 480 const std::initializer_list<LLT> FPTypesBase = { 481 S32, S64 482 }; 483 484 const std::initializer_list<LLT> FPTypes16 = { 485 S32, S64, S16 486 }; 487 488 const std::initializer_list<LLT> FPTypesPK16 = { 489 S32, S64, S16, V2S16 490 }; 491 492 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 493 494 setAction({G_BRCOND, S1}, Legal); // VCC branches 495 setAction({G_BRCOND, S32}, Legal); // SCC branches 496 497 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 498 // elements for v3s16 499 getActionDefinitionsBuilder(G_PHI) 500 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 501 .legalFor(AllS32Vectors) 502 .legalFor(AllS64Vectors) 503 .legalFor(AddrSpaces64) 504 .legalFor(AddrSpaces32) 505 .legalIf(isPointer(0)) 506 .clampScalar(0, S16, S256) 507 .widenScalarToNextPow2(0, 32) 508 .clampMaxNumElements(0, S32, 16) 509 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 510 .scalarize(0); 511 512 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 513 // Full set of gfx9 features. 514 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 515 .legalFor({S32, S16, V2S16}) 516 .clampScalar(0, S16, S32) 517 .clampMaxNumElements(0, S16, 2) 518 .scalarize(0) 519 .widenScalarToNextPow2(0, 32); 520 521 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 522 .legalFor({S32, S16, V2S16}) // Clamp modifier 523 .minScalarOrElt(0, S16) 524 .clampMaxNumElements(0, S16, 2) 525 .scalarize(0) 526 .widenScalarToNextPow2(0, 32) 527 .lower(); 528 } else if (ST.has16BitInsts()) { 529 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 530 .legalFor({S32, S16}) 531 .clampScalar(0, S16, S32) 532 .scalarize(0) 533 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 534 535 // Technically the saturating operations require clamp bit support, but this 536 // was introduced at the same time as 16-bit operations. 537 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 538 .legalFor({S32, S16}) // Clamp modifier 539 .minScalar(0, S16) 540 .scalarize(0) 541 .widenScalarToNextPow2(0, 16) 542 .lower(); 543 544 // We're just lowering this, but it helps get a better result to try to 545 // coerce to the desired type first. 546 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 547 .minScalar(0, S16) 548 .scalarize(0) 549 .lower(); 550 } else { 551 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 552 .legalFor({S32}) 553 .clampScalar(0, S32, S32) 554 .scalarize(0); 555 556 if (ST.hasIntClamp()) { 557 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 558 .legalFor({S32}) // Clamp modifier. 559 .scalarize(0) 560 .minScalarOrElt(0, S32) 561 .lower(); 562 } else { 563 // Clamp bit support was added in VI, along with 16-bit operations. 
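      // Without the clamp bit there is no clamp modifier to use, so these are
      // expanded by the generic saturating add/sub lowering instead.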
564 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 565 .minScalar(0, S32) 566 .scalarize(0) 567 .lower(); 568 } 569 570 // FIXME: DAG expansion gets better results. The widening uses the smaller 571 // range values and goes for the min/max lowering directly. 572 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 573 .minScalar(0, S32) 574 .scalarize(0) 575 .lower(); 576 } 577 578 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 579 .customFor({S32, S64}) 580 .clampScalar(0, S32, S64) 581 .widenScalarToNextPow2(0, 32) 582 .scalarize(0); 583 584 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 585 .legalFor({S32}) 586 .clampScalar(0, S32, S32) 587 .scalarize(0); 588 589 // Report legal for any types we can handle anywhere. For the cases only legal 590 // on the SALU, RegBankSelect will be able to re-legalize. 591 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 592 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 593 .clampScalar(0, S32, S64) 594 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 595 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 596 .widenScalarToNextPow2(0) 597 .scalarize(0); 598 599 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 600 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 601 .legalFor({{S32, S1}, {S32, S32}}) 602 .minScalar(0, S32) 603 // TODO: .scalarize(0) 604 .lower(); 605 606 getActionDefinitionsBuilder(G_BITCAST) 607 // Don't worry about the size constraint. 608 .legalIf(all(isRegisterType(0), isRegisterType(1))) 609 .lower(); 610 611 612 getActionDefinitionsBuilder(G_CONSTANT) 613 .legalFor({S1, S32, S64, S16, GlobalPtr, 614 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 615 .legalIf(isPointer(0)) 616 .clampScalar(0, S32, S64) 617 .widenScalarToNextPow2(0); 618 619 getActionDefinitionsBuilder(G_FCONSTANT) 620 .legalFor({S32, S64, S16}) 621 .clampScalar(0, S16, S64); 622 623 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 624 .legalIf(isRegisterType(0)) 625 // s1 and s16 are special cases because they have legal operations on 626 // them, but don't really occupy registers in the normal way. 627 .legalFor({S1, S16}) 628 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 629 .clampScalarOrElt(0, S32, MaxScalar) 630 .widenScalarToNextPow2(0, 32) 631 .clampMaxNumElements(0, S32, 16); 632 633 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 634 635 // If the amount is divergent, we have to do a wave reduction to get the 636 // maximum value, so this is expanded during RegBankSelect. 
637 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 638 .legalFor({{PrivatePtr, S32}}); 639 640 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 641 .customIf(typeIsNot(0, PrivatePtr)); 642 643 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 644 645 auto &FPOpActions = getActionDefinitionsBuilder( 646 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 647 .legalFor({S32, S64}); 648 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 649 .customFor({S32, S64}); 650 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 651 .customFor({S32, S64}); 652 653 if (ST.has16BitInsts()) { 654 if (ST.hasVOP3PInsts()) 655 FPOpActions.legalFor({S16, V2S16}); 656 else 657 FPOpActions.legalFor({S16}); 658 659 TrigActions.customFor({S16}); 660 FDIVActions.customFor({S16}); 661 } 662 663 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 664 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 665 666 if (ST.hasVOP3PInsts()) { 667 MinNumMaxNum.customFor(FPTypesPK16) 668 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 669 .clampMaxNumElements(0, S16, 2) 670 .clampScalar(0, S16, S64) 671 .scalarize(0); 672 } else if (ST.has16BitInsts()) { 673 MinNumMaxNum.customFor(FPTypes16) 674 .clampScalar(0, S16, S64) 675 .scalarize(0); 676 } else { 677 MinNumMaxNum.customFor(FPTypesBase) 678 .clampScalar(0, S32, S64) 679 .scalarize(0); 680 } 681 682 if (ST.hasVOP3PInsts()) 683 FPOpActions.clampMaxNumElements(0, S16, 2); 684 685 FPOpActions 686 .scalarize(0) 687 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 688 689 TrigActions 690 .scalarize(0) 691 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 692 693 FDIVActions 694 .scalarize(0) 695 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 696 697 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 698 .legalFor(FPTypesPK16) 699 .clampMaxNumElements(0, S16, 2) 700 .scalarize(0) 701 .clampScalar(0, S16, S64); 702 703 if (ST.has16BitInsts()) { 704 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 705 .legalFor({S32, S64, S16}) 706 .scalarize(0) 707 .clampScalar(0, S16, S64); 708 } else { 709 getActionDefinitionsBuilder(G_FSQRT) 710 .legalFor({S32, S64}) 711 .scalarize(0) 712 .clampScalar(0, S32, S64); 713 714 if (ST.hasFractBug()) { 715 getActionDefinitionsBuilder(G_FFLOOR) 716 .customFor({S64}) 717 .legalFor({S32, S64}) 718 .scalarize(0) 719 .clampScalar(0, S32, S64); 720 } else { 721 getActionDefinitionsBuilder(G_FFLOOR) 722 .legalFor({S32, S64}) 723 .scalarize(0) 724 .clampScalar(0, S32, S64); 725 } 726 } 727 728 getActionDefinitionsBuilder(G_FPTRUNC) 729 .legalFor({{S32, S64}, {S16, S32}}) 730 .scalarize(0) 731 .lower(); 732 733 getActionDefinitionsBuilder(G_FPEXT) 734 .legalFor({{S64, S32}, {S32, S16}}) 735 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 736 .scalarize(0); 737 738 getActionDefinitionsBuilder(G_FSUB) 739 // Use actual fsub instruction 740 .legalFor({S32}) 741 // Must use fadd + fneg 742 .lowerFor({S64, S16, V2S16}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 // Whether this is legal depends on the floating point mode for the function. 
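  // The custom action (see legalizeFMad) keeps G_FMAD only when denormals are
  // flushed for the type, and expands it otherwise.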
747 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 748 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 749 FMad.customFor({S32, S16}); 750 else if (ST.hasMadMacF32Insts()) 751 FMad.customFor({S32}); 752 else if (ST.hasMadF16()) 753 FMad.customFor({S16}); 754 FMad.scalarize(0) 755 .lower(); 756 757 auto &FRem = getActionDefinitionsBuilder(G_FREM); 758 if (ST.has16BitInsts()) { 759 FRem.customFor({S16, S32, S64}); 760 } else { 761 FRem.minScalar(0, S32) 762 .customFor({S32, S64}); 763 } 764 FRem.scalarize(0); 765 766 // TODO: Do we need to clamp maximum bitwidth? 767 getActionDefinitionsBuilder(G_TRUNC) 768 .legalIf(isScalar(0)) 769 .legalFor({{V2S16, V2S32}}) 770 .clampMaxNumElements(0, S16, 2) 771 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 772 // situations (like an invalid implicit use), we don't want to infinite loop 773 // in the legalizer. 774 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 775 .alwaysLegal(); 776 777 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 778 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 779 {S32, S1}, {S64, S1}, {S16, S1}}) 780 .scalarize(0) 781 .clampScalar(0, S32, S64) 782 .widenScalarToNextPow2(1, 32); 783 784 // TODO: Split s1->s64 during regbankselect for VALU. 785 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 786 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 787 .lowerFor({{S32, S64}}) 788 .lowerIf(typeIs(1, S1)) 789 .customFor({{S64, S64}}); 790 if (ST.has16BitInsts()) 791 IToFP.legalFor({{S16, S16}}); 792 IToFP.clampScalar(1, S32, S64) 793 .minScalar(0, S32) 794 .scalarize(0) 795 .widenScalarToNextPow2(1); 796 797 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 798 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 799 .customFor({{S64, S64}}) 800 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 801 if (ST.has16BitInsts()) 802 FPToI.legalFor({{S16, S16}}); 803 else 804 FPToI.minScalar(1, S32); 805 806 FPToI.minScalar(0, S32) 807 .scalarize(0) 808 .lower(); 809 810 // Lower roundeven into G_FRINT 811 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 812 .scalarize(0) 813 .lower(); 814 815 if (ST.has16BitInsts()) { 816 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 817 .legalFor({S16, S32, S64}) 818 .clampScalar(0, S16, S64) 819 .scalarize(0); 820 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 821 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 822 .legalFor({S32, S64}) 823 .clampScalar(0, S32, S64) 824 .scalarize(0); 825 } else { 826 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 827 .legalFor({S32}) 828 .customFor({S64}) 829 .clampScalar(0, S32, S64) 830 .scalarize(0); 831 } 832 833 getActionDefinitionsBuilder(G_PTR_ADD) 834 .legalIf(all(isPointer(0), sameSize(0, 1))) 835 .scalarize(0) 836 .scalarSameSizeAs(1, 0); 837 838 getActionDefinitionsBuilder(G_PTRMASK) 839 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 840 .scalarSameSizeAs(1, 0) 841 .scalarize(0); 842 843 auto &CmpBuilder = 844 getActionDefinitionsBuilder(G_ICMP) 845 // The compare output type differs based on the register bank of the output, 846 // so make both s1 and s32 legal. 847 // 848 // Scalar compares producing output in scc will be promoted to s32, as that 849 // is the allocatable register type that will be needed for the copy from 850 // scc. This will be promoted during RegBankSelect, and we assume something 851 // before that won't try to use s32 result types. 
852 // 853 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 854 // bank. 855 .legalForCartesianProduct( 856 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 857 .legalForCartesianProduct( 858 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 859 if (ST.has16BitInsts()) { 860 CmpBuilder.legalFor({{S1, S16}}); 861 } 862 863 CmpBuilder 864 .widenScalarToNextPow2(1) 865 .clampScalar(1, S32, S64) 866 .scalarize(0) 867 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 868 869 getActionDefinitionsBuilder(G_FCMP) 870 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 871 .widenScalarToNextPow2(1) 872 .clampScalar(1, S32, S64) 873 .scalarize(0); 874 875 // FIXME: fpow has a selection pattern that should move to custom lowering. 876 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 877 if (ST.has16BitInsts()) 878 Exp2Ops.legalFor({S32, S16}); 879 else 880 Exp2Ops.legalFor({S32}); 881 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 882 Exp2Ops.scalarize(0); 883 884 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 885 if (ST.has16BitInsts()) 886 ExpOps.customFor({{S32}, {S16}}); 887 else 888 ExpOps.customFor({S32}); 889 ExpOps.clampScalar(0, MinScalarFPTy, S32) 890 .scalarize(0); 891 892 getActionDefinitionsBuilder(G_FPOWI) 893 .clampScalar(0, MinScalarFPTy, S32) 894 .lower(); 895 896 // The 64-bit versions produce 32-bit results, but only on the SALU. 897 getActionDefinitionsBuilder(G_CTPOP) 898 .legalFor({{S32, S32}, {S32, S64}}) 899 .clampScalar(0, S32, S32) 900 .clampScalar(1, S32, S64) 901 .scalarize(0) 902 .widenScalarToNextPow2(0, 32) 903 .widenScalarToNextPow2(1, 32); 904 905 // The hardware instructions return a different result on 0 than the generic 906 // instructions expect. The hardware produces -1, but these produce the 907 // bitwidth. 908 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 909 .scalarize(0) 910 .clampScalar(0, S32, S32) 911 .clampScalar(1, S32, S64) 912 .widenScalarToNextPow2(0, 32) 913 .widenScalarToNextPow2(1, 32) 914 .lower(); 915 916 // The 64-bit versions produce 32-bit results, but only on the SALU. 917 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 918 .legalFor({{S32, S32}, {S32, S64}}) 919 .clampScalar(0, S32, S32) 920 .clampScalar(1, S32, S64) 921 .scalarize(0) 922 .widenScalarToNextPow2(0, 32) 923 .widenScalarToNextPow2(1, 32); 924 925 getActionDefinitionsBuilder(G_BITREVERSE) 926 .legalFor({S32}) 927 .clampScalar(0, S32, S32) 928 .scalarize(0); 929 930 if (ST.has16BitInsts()) { 931 getActionDefinitionsBuilder(G_BSWAP) 932 .legalFor({S16, S32, V2S16}) 933 .clampMaxNumElements(0, S16, 2) 934 // FIXME: Fixing non-power-of-2 before clamp is workaround for 935 // narrowScalar limitation. 
936 .widenScalarToNextPow2(0) 937 .clampScalar(0, S16, S32) 938 .scalarize(0); 939 940 if (ST.hasVOP3PInsts()) { 941 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 942 .legalFor({S32, S16, V2S16}) 943 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 944 .clampMaxNumElements(0, S16, 2) 945 .minScalar(0, S16) 946 .widenScalarToNextPow2(0) 947 .scalarize(0) 948 .lower(); 949 } else { 950 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 951 .legalFor({S32, S16}) 952 .widenScalarToNextPow2(0) 953 .minScalar(0, S16) 954 .scalarize(0) 955 .lower(); 956 } 957 } else { 958 // TODO: Should have same legality without v_perm_b32 959 getActionDefinitionsBuilder(G_BSWAP) 960 .legalFor({S32}) 961 .lowerIf(scalarNarrowerThan(0, 32)) 962 // FIXME: Fixing non-power-of-2 before clamp is workaround for 963 // narrowScalar limitation. 964 .widenScalarToNextPow2(0) 965 .maxScalar(0, S32) 966 .scalarize(0) 967 .lower(); 968 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 970 .legalFor({S32}) 971 .minScalar(0, S32) 972 .widenScalarToNextPow2(0) 973 .scalarize(0) 974 .lower(); 975 } 976 977 getActionDefinitionsBuilder(G_INTTOPTR) 978 // List the common cases 979 .legalForCartesianProduct(AddrSpaces64, {S64}) 980 .legalForCartesianProduct(AddrSpaces32, {S32}) 981 .scalarize(0) 982 // Accept any address space as long as the size matches 983 .legalIf(sameSize(0, 1)) 984 .widenScalarIf(smallerThan(1, 0), 985 [](const LegalityQuery &Query) { 986 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 987 }) 988 .narrowScalarIf(largerThan(1, 0), 989 [](const LegalityQuery &Query) { 990 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 991 }); 992 993 getActionDefinitionsBuilder(G_PTRTOINT) 994 // List the common cases 995 .legalForCartesianProduct(AddrSpaces64, {S64}) 996 .legalForCartesianProduct(AddrSpaces32, {S32}) 997 .scalarize(0) 998 // Accept any address space as long as the size matches 999 .legalIf(sameSize(0, 1)) 1000 .widenScalarIf(smallerThan(0, 1), 1001 [](const LegalityQuery &Query) { 1002 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1003 }) 1004 .narrowScalarIf( 1005 largerThan(0, 1), 1006 [](const LegalityQuery &Query) { 1007 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1008 }); 1009 1010 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1011 .scalarize(0) 1012 .custom(); 1013 1014 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1015 bool IsLoad) -> bool { 1016 const LLT DstTy = Query.Types[0]; 1017 1018 // Split vector extloads. 1019 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1020 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1021 1022 if (MemSize < DstTy.getSizeInBits()) 1023 MemSize = std::max(MemSize, AlignBits); 1024 1025 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1026 return true; 1027 1028 const LLT PtrTy = Query.Types[1]; 1029 unsigned AS = PtrTy.getAddressSpace(); 1030 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1031 return true; 1032 1033 // Catch weird sized loads that don't evenly divide into the access sizes 1034 // TODO: May be able to widen depending on alignment etc. 1035 unsigned NumRegs = (MemSize + 31) / 32; 1036 if (NumRegs == 3) { 1037 if (!ST.hasDwordx3LoadStores()) 1038 return true; 1039 } else { 1040 // If the alignment allows, these should have been widened. 
1041 if (!isPowerOf2_32(NumRegs)) 1042 return true; 1043 } 1044 1045 if (AlignBits < MemSize) { 1046 const SITargetLowering *TLI = ST.getTargetLowering(); 1047 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1048 Align(AlignBits / 8)); 1049 } 1050 1051 return false; 1052 }; 1053 1054 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1055 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1056 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1057 1058 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1059 // LDS 1060 // TODO: Unsupported flat for SI. 1061 1062 for (unsigned Op : {G_LOAD, G_STORE}) { 1063 const bool IsStore = Op == G_STORE; 1064 1065 auto &Actions = getActionDefinitionsBuilder(Op); 1066 // Explicitly list some common cases. 1067 // TODO: Does this help compile time at all? 1068 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1069 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1070 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1071 {S64, GlobalPtr, 64, GlobalAlign32}, 1072 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1073 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1074 {S32, GlobalPtr, 8, GlobalAlign8}, 1075 {S32, GlobalPtr, 16, GlobalAlign16}, 1076 1077 {S32, LocalPtr, 32, 32}, 1078 {S64, LocalPtr, 64, 32}, 1079 {V2S32, LocalPtr, 64, 32}, 1080 {S32, LocalPtr, 8, 8}, 1081 {S32, LocalPtr, 16, 16}, 1082 {V2S16, LocalPtr, 32, 32}, 1083 1084 {S32, PrivatePtr, 32, 32}, 1085 {S32, PrivatePtr, 8, 8}, 1086 {S32, PrivatePtr, 16, 16}, 1087 {V2S16, PrivatePtr, 32, 32}, 1088 1089 {S32, ConstantPtr, 32, GlobalAlign32}, 1090 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1091 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1092 {S64, ConstantPtr, 64, GlobalAlign32}, 1093 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1094 Actions.legalIf( 1095 [=](const LegalityQuery &Query) -> bool { 1096 return isLoadStoreLegal(ST, Query, Op); 1097 }); 1098 1099 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1100 // 64-bits. 1101 // 1102 // TODO: Should generalize bitcast action into coerce, which will also cover 1103 // inserting addrspacecasts. 1104 Actions.customIf(typeIs(1, Constant32Ptr)); 1105 1106 // Turn any illegal element vectors into something easier to deal 1107 // with. These will ultimately produce 32-bit scalar shifts to extract the 1108 // parts anyway. 1109 // 1110 // For odd 16-bit element vectors, prefer to split those into pieces with 1111 // 16-bit vector parts. 1112 Actions.bitcastIf( 1113 [=](const LegalityQuery &Query) -> bool { 1114 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1115 Query.MMODescrs[0].SizeInBits); 1116 }, bitcastToRegisterType(0)); 1117 1118 if (!IsStore) { 1119 // Widen suitably aligned loads by loading extra bytes. The standard 1120 // legalization actions can't properly express widening memory operands. 
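      // shouldWidenLoad only permits this when the alignment covers the
      // rounded-up size, so the extra bytes are known dereferenceable.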
1121 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1122 return shouldWidenLoad(ST, Query, G_LOAD); 1123 }); 1124 } 1125 1126 // FIXME: load/store narrowing should be moved to lower action 1127 Actions 1128 .narrowScalarIf( 1129 [=](const LegalityQuery &Query) -> bool { 1130 return !Query.Types[0].isVector() && 1131 needToSplitMemOp(Query, Op == G_LOAD); 1132 }, 1133 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1134 const LLT DstTy = Query.Types[0]; 1135 const LLT PtrTy = Query.Types[1]; 1136 1137 const unsigned DstSize = DstTy.getSizeInBits(); 1138 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1139 1140 // Split extloads. 1141 if (DstSize > MemSize) 1142 return std::make_pair(0, LLT::scalar(MemSize)); 1143 1144 if (!isPowerOf2_32(DstSize)) { 1145 // We're probably decomposing an odd sized store. Try to split 1146 // to the widest type. TODO: Account for alignment. As-is it 1147 // should be OK, since the new parts will be further legalized. 1148 unsigned FloorSize = PowerOf2Floor(DstSize); 1149 return std::make_pair(0, LLT::scalar(FloorSize)); 1150 } 1151 1152 if (DstSize > 32 && (DstSize % 32 != 0)) { 1153 // FIXME: Need a way to specify non-extload of larger size if 1154 // suitably aligned. 1155 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1156 } 1157 1158 unsigned MaxSize = maxSizeForAddrSpace(ST, 1159 PtrTy.getAddressSpace(), 1160 Op == G_LOAD); 1161 if (MemSize > MaxSize) 1162 return std::make_pair(0, LLT::scalar(MaxSize)); 1163 1164 unsigned Align = Query.MMODescrs[0].AlignInBits; 1165 return std::make_pair(0, LLT::scalar(Align)); 1166 }) 1167 .fewerElementsIf( 1168 [=](const LegalityQuery &Query) -> bool { 1169 return Query.Types[0].isVector() && 1170 needToSplitMemOp(Query, Op == G_LOAD); 1171 }, 1172 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1173 const LLT DstTy = Query.Types[0]; 1174 const LLT PtrTy = Query.Types[1]; 1175 1176 LLT EltTy = DstTy.getElementType(); 1177 unsigned MaxSize = maxSizeForAddrSpace(ST, 1178 PtrTy.getAddressSpace(), 1179 Op == G_LOAD); 1180 1181 // FIXME: Handle widened to power of 2 results better. This ends 1182 // up scalarizing. 1183 // FIXME: 3 element stores scalarized on SI 1184 1185 // Split if it's too large for the address space. 1186 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1187 unsigned NumElts = DstTy.getNumElements(); 1188 unsigned EltSize = EltTy.getSizeInBits(); 1189 1190 if (MaxSize % EltSize == 0) { 1191 return std::make_pair( 1192 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1193 } 1194 1195 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1196 1197 // FIXME: Refine when odd breakdowns handled 1198 // The scalars will need to be re-legalized. 1199 if (NumPieces == 1 || NumPieces >= NumElts || 1200 NumElts % NumPieces != 0) 1201 return std::make_pair(0, EltTy); 1202 1203 return std::make_pair(0, 1204 LLT::vector(NumElts / NumPieces, EltTy)); 1205 } 1206 1207 // FIXME: We could probably handle weird extending loads better. 1208 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1209 if (DstTy.getSizeInBits() > MemSize) 1210 return std::make_pair(0, EltTy); 1211 1212 unsigned EltSize = EltTy.getSizeInBits(); 1213 unsigned DstSize = DstTy.getSizeInBits(); 1214 if (!isPowerOf2_32(DstSize)) { 1215 // We're probably decomposing an odd sized store. Try to split 1216 // to the widest type. TODO: Account for alignment. As-is it 1217 // should be OK, since the new parts will be further legalized. 
1218 unsigned FloorSize = PowerOf2Floor(DstSize); 1219 return std::make_pair( 1220 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1221 } 1222 1223 // Need to split because of alignment. 1224 unsigned Align = Query.MMODescrs[0].AlignInBits; 1225 if (EltSize > Align && 1226 (EltSize / Align < DstTy.getNumElements())) { 1227 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1228 } 1229 1230 // May need relegalization for the scalars. 1231 return std::make_pair(0, EltTy); 1232 }) 1233 .lowerIfMemSizeNotPow2() 1234 .minScalar(0, S32); 1235 1236 if (IsStore) 1237 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1238 1239 Actions 1240 .widenScalarToNextPow2(0) 1241 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1242 .lower(); 1243 } 1244 1245 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1246 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1247 {S32, GlobalPtr, 16, 2 * 8}, 1248 {S32, LocalPtr, 8, 8}, 1249 {S32, LocalPtr, 16, 16}, 1250 {S32, PrivatePtr, 8, 8}, 1251 {S32, PrivatePtr, 16, 16}, 1252 {S32, ConstantPtr, 8, 8}, 1253 {S32, ConstantPtr, 16, 2 * 8}}); 1254 if (ST.hasFlatAddressSpace()) { 1255 ExtLoads.legalForTypesWithMemDesc( 1256 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1257 } 1258 1259 ExtLoads.clampScalar(0, S32, S32) 1260 .widenScalarToNextPow2(0) 1261 .unsupportedIfMemSizeNotPow2() 1262 .lower(); 1263 1264 auto &Atomics = getActionDefinitionsBuilder( 1265 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1266 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1267 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1268 G_ATOMICRMW_UMIN}) 1269 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1270 {S64, GlobalPtr}, {S64, LocalPtr}, 1271 {S32, RegionPtr}, {S64, RegionPtr}}); 1272 if (ST.hasFlatAddressSpace()) { 1273 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1274 } 1275 1276 if (ST.hasLDSFPAtomics()) { 1277 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1278 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1279 } 1280 1281 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1282 // demarshalling 1283 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1284 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1285 {S32, FlatPtr}, {S64, FlatPtr}}) 1286 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1287 {S32, RegionPtr}, {S64, RegionPtr}}); 1288 // TODO: Pointer types, any 32-bit or 64-bit vector 1289 1290 // Condition should be s32 for scalar, s1 for vector. 1291 getActionDefinitionsBuilder(G_SELECT) 1292 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1293 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1294 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1295 .clampScalar(0, S16, S64) 1296 .scalarize(1) 1297 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1298 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1299 .clampMaxNumElements(0, S32, 2) 1300 .clampMaxNumElements(0, LocalPtr, 2) 1301 .clampMaxNumElements(0, PrivatePtr, 2) 1302 .scalarize(0) 1303 .widenScalarToNextPow2(0) 1304 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1305 1306 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1307 // be more flexible with the shift amount type. 
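  // (The low 4 bits of the amount are used for 16-bit shifts, 5 bits for
  // 32-bit shifts, and 6 bits for 64-bit shifts.)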
1308 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1309 .legalFor({{S32, S32}, {S64, S32}}); 1310 if (ST.has16BitInsts()) { 1311 if (ST.hasVOP3PInsts()) { 1312 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1313 .clampMaxNumElements(0, S16, 2); 1314 } else 1315 Shifts.legalFor({{S16, S16}}); 1316 1317 // TODO: Support 16-bit shift amounts for all types 1318 Shifts.widenScalarIf( 1319 [=](const LegalityQuery &Query) { 1320 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1321 // 32-bit amount. 1322 const LLT ValTy = Query.Types[0]; 1323 const LLT AmountTy = Query.Types[1]; 1324 return ValTy.getSizeInBits() <= 16 && 1325 AmountTy.getSizeInBits() < 16; 1326 }, changeTo(1, S16)); 1327 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1328 Shifts.clampScalar(1, S32, S32); 1329 Shifts.clampScalar(0, S16, S64); 1330 Shifts.widenScalarToNextPow2(0, 16); 1331 1332 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1333 .minScalar(0, S16) 1334 .scalarize(0) 1335 .lower(); 1336 } else { 1337 // Make sure we legalize the shift amount type first, as the general 1338 // expansion for the shifted type will produce much worse code if it hasn't 1339 // been truncated already. 1340 Shifts.clampScalar(1, S32, S32); 1341 Shifts.clampScalar(0, S32, S64); 1342 Shifts.widenScalarToNextPow2(0, 32); 1343 1344 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1345 .minScalar(0, S32) 1346 .scalarize(0) 1347 .lower(); 1348 } 1349 Shifts.scalarize(0); 1350 1351 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1352 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1353 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1354 unsigned IdxTypeIdx = 2; 1355 1356 getActionDefinitionsBuilder(Op) 1357 .customIf([=](const LegalityQuery &Query) { 1358 const LLT EltTy = Query.Types[EltTypeIdx]; 1359 const LLT VecTy = Query.Types[VecTypeIdx]; 1360 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1361 const unsigned EltSize = EltTy.getSizeInBits(); 1362 return (EltSize == 32 || EltSize == 64) && 1363 VecTy.getSizeInBits() % 32 == 0 && 1364 VecTy.getSizeInBits() <= MaxRegisterSize && 1365 IdxTy.getSizeInBits() == 32; 1366 }) 1367 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1368 bitcastToVectorElement32(VecTypeIdx)) 1369 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1370 .bitcastIf( 1371 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1372 [=](const LegalityQuery &Query) { 1373 // For > 64-bit element types, try to turn this into a 64-bit 1374 // element vector since we may be able to do better indexing 1375 // if this is scalar. If not, fall back to 32. 1376 const LLT EltTy = Query.Types[EltTypeIdx]; 1377 const LLT VecTy = Query.Types[VecTypeIdx]; 1378 const unsigned DstEltSize = EltTy.getSizeInBits(); 1379 const unsigned VecSize = VecTy.getSizeInBits(); 1380 1381 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1382 return std::make_pair( 1383 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1384 }) 1385 .clampScalar(EltTypeIdx, S32, S64) 1386 .clampScalar(VecTypeIdx, S32, S64) 1387 .clampScalar(IdxTypeIdx, S32, S32) 1388 .clampMaxNumElements(VecTypeIdx, S32, 32) 1389 // TODO: Clamp elements for 64-bit vectors? 1390 // It should only be necessary with variable indexes. 
1391       // As a last resort, lower to the stack
1392       .lower();
1393   }
1394
1395   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1396     .unsupportedIf([=](const LegalityQuery &Query) {
1397         const LLT &EltTy = Query.Types[1].getElementType();
1398         return Query.Types[0] != EltTy;
1399       });
1400
1401   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1402     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1403     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1404
1405     // FIXME: Doesn't handle extract of illegal sizes.
1406     getActionDefinitionsBuilder(Op)
1407       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1408       // FIXME: Multiples of 16 should not be legal.
1409       .legalIf([=](const LegalityQuery &Query) {
1410           const LLT BigTy = Query.Types[BigTyIdx];
1411           const LLT LitTy = Query.Types[LitTyIdx];
1412           return (BigTy.getSizeInBits() % 32 == 0) &&
1413                  (LitTy.getSizeInBits() % 16 == 0);
1414         })
1415       .widenScalarIf(
1416         [=](const LegalityQuery &Query) {
1417           const LLT BigTy = Query.Types[BigTyIdx];
1418           return (BigTy.getScalarSizeInBits() < 16);
1419         },
1420         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1421       .widenScalarIf(
1422         [=](const LegalityQuery &Query) {
1423           const LLT LitTy = Query.Types[LitTyIdx];
1424           return (LitTy.getScalarSizeInBits() < 16);
1425         },
1426         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1427       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1428       .widenScalarToNextPow2(BigTyIdx, 32);
1429
1430   }
1431
1432   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1433     .legalForCartesianProduct(AllS32Vectors, {S32})
1434     .legalForCartesianProduct(AllS64Vectors, {S64})
1435     .clampNumElements(0, V16S32, V32S32)
1436     .clampNumElements(0, V2S64, V16S64)
1437     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1438
1439   if (ST.hasScalarPackInsts()) {
1440     BuildVector
1441       // FIXME: Should probably widen s1 vectors straight to s32
1442       .minScalarOrElt(0, S16)
1443       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1444       .minScalar(1, S32);
1445
1446     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1447       .legalFor({V2S16, S32})
1448       .lower();
1449     BuildVector.minScalarOrElt(0, S32);
1450   } else {
1451     BuildVector.customFor({V2S16, S16});
1452     BuildVector.minScalarOrElt(0, S32);
1453
1454     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1455       .customFor({V2S16, S32})
1456       .lower();
1457   }
1458
1459   BuildVector.legalIf(isRegisterType(0));
1460
1461   // FIXME: Clamp maximum size
1462   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1463     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1464     .clampMaxNumElements(0, S32, 32)
1465     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1466     .clampMaxNumElements(0, S16, 64);
1467
1468   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1469   // pre-legalize.
1470   if (ST.hasVOP3PInsts()) {
1471     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1472       .customFor({V2S16, V2S16})
1473       .lower();
1474   } else
1475     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1476
1477   // Merge/Unmerge
1478   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1479     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1480     unsigned LitTyIdx = Op == G_MERGE_VALUES ?
1 : 0; 1481 1482 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1483 const LLT Ty = Query.Types[TypeIdx]; 1484 if (Ty.isVector()) { 1485 const LLT &EltTy = Ty.getElementType(); 1486 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1487 return true; 1488 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1489 return true; 1490 } 1491 return false; 1492 }; 1493 1494 auto &Builder = getActionDefinitionsBuilder(Op) 1495 .lowerFor({{S16, V2S16}}) 1496 .lowerIf([=](const LegalityQuery &Query) { 1497 const LLT BigTy = Query.Types[BigTyIdx]; 1498 return BigTy.getSizeInBits() == 32; 1499 }) 1500 // Try to widen to s16 first for small types. 1501 // TODO: Only do this on targets with legal s16 shifts 1502 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1503 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1504 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1505 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1506 elementTypeIs(1, S16)), 1507 changeTo(1, V2S16)) 1508 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1509 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1510 // valid. 1511 .clampScalar(LitTyIdx, S32, S512) 1512 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1513 // Break up vectors with weird elements into scalars 1514 .fewerElementsIf( 1515 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1516 scalarize(0)) 1517 .fewerElementsIf( 1518 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1519 scalarize(1)) 1520 .clampScalar(BigTyIdx, S32, MaxScalar); 1521 1522 if (Op == G_MERGE_VALUES) { 1523 Builder.widenScalarIf( 1524 // TODO: Use 16-bit shifts if legal for 8-bit values? 1525 [=](const LegalityQuery &Query) { 1526 const LLT Ty = Query.Types[LitTyIdx]; 1527 return Ty.getSizeInBits() < 32; 1528 }, 1529 changeTo(LitTyIdx, S32)); 1530 } 1531 1532 Builder.widenScalarIf( 1533 [=](const LegalityQuery &Query) { 1534 const LLT Ty = Query.Types[BigTyIdx]; 1535 return !isPowerOf2_32(Ty.getSizeInBits()) && 1536 Ty.getSizeInBits() % 16 != 0; 1537 }, 1538 [=](const LegalityQuery &Query) { 1539 // Pick the next power of 2, or a multiple of 64 over 128. 1540 // Whichever is smaller. 1541 const LLT &Ty = Query.Types[BigTyIdx]; 1542 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1543 if (NewSizeInBits >= 256) { 1544 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1545 if (RoundedTo < NewSizeInBits) 1546 NewSizeInBits = RoundedTo; 1547 } 1548 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1549 }) 1550 .legalIf([=](const LegalityQuery &Query) { 1551 const LLT &BigTy = Query.Types[BigTyIdx]; 1552 const LLT &LitTy = Query.Types[LitTyIdx]; 1553 1554 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1555 return false; 1556 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1557 return false; 1558 1559 return BigTy.getSizeInBits() % 16 == 0 && 1560 LitTy.getSizeInBits() % 16 == 0 && 1561 BigTy.getSizeInBits() <= MaxRegisterSize; 1562 }) 1563 // Any vectors left are the wrong size. Scalarize them. 1564 .scalarize(0) 1565 .scalarize(1); 1566 } 1567 1568 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1569 // RegBankSelect. 
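  // (The SALU has a 64-bit signed bitfield extract; the VALU only has the
  // 32-bit form.)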
1570 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1571 .legalFor({{S32}, {S64}}); 1572 1573 if (ST.hasVOP3PInsts()) { 1574 SextInReg.lowerFor({{V2S16}}) 1575 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1576 // get more vector shift opportunities, since we'll get those when 1577 // expanded. 1578 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1579 } else if (ST.has16BitInsts()) { 1580 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1581 } else { 1582 // Prefer to promote to s32 before lowering if we don't have 16-bit 1583 // shifts. This avoid a lot of intermediate truncate and extend operations. 1584 SextInReg.lowerFor({{S32}, {S64}}); 1585 } 1586 1587 SextInReg 1588 .scalarize(0) 1589 .clampScalar(0, S32, S64) 1590 .lower(); 1591 1592 getActionDefinitionsBuilder(G_FSHR) 1593 .legalFor({{S32, S32}}) 1594 .scalarize(0) 1595 .lower(); 1596 1597 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1598 .legalFor({S64}); 1599 1600 getActionDefinitionsBuilder(G_FENCE) 1601 .alwaysLegal(); 1602 1603 getActionDefinitionsBuilder({ 1604 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1605 G_FCOPYSIGN, 1606 1607 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1608 G_ATOMICRMW_NAND, 1609 G_ATOMICRMW_FSUB, 1610 G_READ_REGISTER, 1611 G_WRITE_REGISTER, 1612 1613 G_SADDO, G_SSUBO, 1614 1615 // TODO: Implement 1616 G_FMINIMUM, G_FMAXIMUM, 1617 G_FSHL 1618 }).lower(); 1619 1620 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1621 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1622 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1623 .unsupported(); 1624 1625 computeTables(); 1626 verify(*ST.getInstrInfo()); 1627 } 1628 1629 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1630 MachineInstr &MI) const { 1631 MachineIRBuilder &B = Helper.MIRBuilder; 1632 MachineRegisterInfo &MRI = *B.getMRI(); 1633 1634 switch (MI.getOpcode()) { 1635 case TargetOpcode::G_ADDRSPACE_CAST: 1636 return legalizeAddrSpaceCast(MI, MRI, B); 1637 case TargetOpcode::G_FRINT: 1638 return legalizeFrint(MI, MRI, B); 1639 case TargetOpcode::G_FCEIL: 1640 return legalizeFceil(MI, MRI, B); 1641 case TargetOpcode::G_FREM: 1642 return legalizeFrem(MI, MRI, B); 1643 case TargetOpcode::G_INTRINSIC_TRUNC: 1644 return legalizeIntrinsicTrunc(MI, MRI, B); 1645 case TargetOpcode::G_SITOFP: 1646 return legalizeITOFP(MI, MRI, B, true); 1647 case TargetOpcode::G_UITOFP: 1648 return legalizeITOFP(MI, MRI, B, false); 1649 case TargetOpcode::G_FPTOSI: 1650 return legalizeFPTOI(MI, MRI, B, true); 1651 case TargetOpcode::G_FPTOUI: 1652 return legalizeFPTOI(MI, MRI, B, false); 1653 case TargetOpcode::G_FMINNUM: 1654 case TargetOpcode::G_FMAXNUM: 1655 case TargetOpcode::G_FMINNUM_IEEE: 1656 case TargetOpcode::G_FMAXNUM_IEEE: 1657 return legalizeMinNumMaxNum(Helper, MI); 1658 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1659 return legalizeExtractVectorElt(MI, MRI, B); 1660 case TargetOpcode::G_INSERT_VECTOR_ELT: 1661 return legalizeInsertVectorElt(MI, MRI, B); 1662 case TargetOpcode::G_SHUFFLE_VECTOR: 1663 return legalizeShuffleVector(MI, MRI, B); 1664 case TargetOpcode::G_FSIN: 1665 case TargetOpcode::G_FCOS: 1666 return legalizeSinCos(MI, MRI, B); 1667 case TargetOpcode::G_GLOBAL_VALUE: 1668 return legalizeGlobalValue(MI, MRI, B); 1669 case TargetOpcode::G_LOAD: 1670 return legalizeLoad(Helper, MI); 1671 case TargetOpcode::G_FMAD: 1672 return legalizeFMad(MI, MRI, B); 1673 case TargetOpcode::G_FDIV: 1674 return legalizeFDIV(MI, MRI, B); 1675 case TargetOpcode::G_UDIV: 1676 case TargetOpcode::G_UREM: 
1677 return legalizeUDIV_UREM(MI, MRI, B); 1678 case TargetOpcode::G_SDIV: 1679 case TargetOpcode::G_SREM: 1680 return legalizeSDIV_SREM(MI, MRI, B); 1681 case TargetOpcode::G_ATOMIC_CMPXCHG: 1682 return legalizeAtomicCmpXChg(MI, MRI, B); 1683 case TargetOpcode::G_FLOG: 1684 return legalizeFlog(MI, B, numbers::ln2f); 1685 case TargetOpcode::G_FLOG10: 1686 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1687 case TargetOpcode::G_FEXP: 1688 return legalizeFExp(MI, B); 1689 case TargetOpcode::G_FPOW: 1690 return legalizeFPow(MI, B); 1691 case TargetOpcode::G_FFLOOR: 1692 return legalizeFFloor(MI, MRI, B); 1693 case TargetOpcode::G_BUILD_VECTOR: 1694 return legalizeBuildVector(MI, MRI, B); 1695 default: 1696 return false; 1697 } 1698 1699 llvm_unreachable("expected switch to return"); 1700 } 1701 1702 Register AMDGPULegalizerInfo::getSegmentAperture( 1703 unsigned AS, 1704 MachineRegisterInfo &MRI, 1705 MachineIRBuilder &B) const { 1706 MachineFunction &MF = B.getMF(); 1707 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1708 const LLT S32 = LLT::scalar(32); 1709 1710 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1711 1712 if (ST.hasApertureRegs()) { 1713 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1714 // getreg. 1715 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1716 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1717 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1718 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1719 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1720 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1721 unsigned Encoding = 1722 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1723 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1724 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1725 1726 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1727 1728 B.buildInstr(AMDGPU::S_GETREG_B32) 1729 .addDef(GetReg) 1730 .addImm(Encoding); 1731 MRI.setType(GetReg, S32); 1732 1733 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1734 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1735 } 1736 1737 Register QueuePtr = MRI.createGenericVirtualRegister( 1738 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1739 1740 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1741 return Register(); 1742 1743 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1744 // private_segment_aperture_base_hi. 1745 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1746 1747 // TODO: can we be smarter about machine pointer info? 
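// For now, emit a dereferenceable, invariant 4-byte load from the queue
// descriptor at the offset computed above; the loaded s32 is the segment's
// aperture base (the *_aperture_base_hi field).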
1748 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1749 MachineMemOperand *MMO = MF.getMachineMemOperand( 1750 PtrInfo, 1751 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1752 MachineMemOperand::MOInvariant, 1753 4, commonAlignment(Align(64), StructOffset)); 1754 1755 Register LoadAddr; 1756 1757 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1758 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1759 } 1760 1761 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1762 MachineInstr &MI, MachineRegisterInfo &MRI, 1763 MachineIRBuilder &B) const { 1764 MachineFunction &MF = B.getMF(); 1765 1766 const LLT S32 = LLT::scalar(32); 1767 Register Dst = MI.getOperand(0).getReg(); 1768 Register Src = MI.getOperand(1).getReg(); 1769 1770 LLT DstTy = MRI.getType(Dst); 1771 LLT SrcTy = MRI.getType(Src); 1772 unsigned DestAS = DstTy.getAddressSpace(); 1773 unsigned SrcAS = SrcTy.getAddressSpace(); 1774 1775 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1776 // vector element. 1777 assert(!DstTy.isVector()); 1778 1779 const AMDGPUTargetMachine &TM 1780 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1781 1782 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1783 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1784 return true; 1785 } 1786 1787 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1788 // Truncate. 1789 B.buildExtract(Dst, Src, 0); 1790 MI.eraseFromParent(); 1791 return true; 1792 } 1793 1794 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1795 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1796 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1797 1798 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1799 // another. Merge operands are required to be the same type, but creating an 1800 // extra ptrtoint would be kind of pointless. 1801 auto HighAddr = B.buildConstant( 1802 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1803 B.buildMerge(Dst, {Src, HighAddr}); 1804 MI.eraseFromParent(); 1805 return true; 1806 } 1807 1808 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1809 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1810 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1811 unsigned NullVal = TM.getNullPointerValue(DestAS); 1812 1813 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1814 auto FlatNull = B.buildConstant(SrcTy, 0); 1815 1816 // Extract low 32-bits of the pointer. 1817 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1818 1819 auto CmpRes = 1820 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1821 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1822 1823 MI.eraseFromParent(); 1824 return true; 1825 } 1826 1827 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1828 return false; 1829 1830 if (!ST.hasFlatAddressSpace()) 1831 return false; 1832 1833 auto SegmentNull = 1834 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1835 auto FlatNull = 1836 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1837 1838 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1839 if (!ApertureReg.isValid()) 1840 return false; 1841 1842 auto CmpRes = 1843 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1844 1845 // Coerce the type of the low half of the result so we can use merge_values. 
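// The flat pointer is assembled with the 32-bit segment offset in the low
// half and the aperture base in the high half; a null segment pointer is
// mapped to the flat null value by the select below.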
1846 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1847 1848 // TODO: Should we allow mismatched types but matching sizes in merges to 1849 // avoid the ptrtoint? 1850 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1851 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1852 1853 MI.eraseFromParent(); 1854 return true; 1855 } 1856 1857 bool AMDGPULegalizerInfo::legalizeFrint( 1858 MachineInstr &MI, MachineRegisterInfo &MRI, 1859 MachineIRBuilder &B) const { 1860 Register Src = MI.getOperand(1).getReg(); 1861 LLT Ty = MRI.getType(Src); 1862 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1863 1864 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1865 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1866 1867 auto C1 = B.buildFConstant(Ty, C1Val); 1868 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1869 1870 // TODO: Should this propagate fast-math-flags? 1871 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1872 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1873 1874 auto C2 = B.buildFConstant(Ty, C2Val); 1875 auto Fabs = B.buildFAbs(Ty, Src); 1876 1877 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1878 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 bool AMDGPULegalizerInfo::legalizeFceil( 1884 MachineInstr &MI, MachineRegisterInfo &MRI, 1885 MachineIRBuilder &B) const { 1886 1887 const LLT S1 = LLT::scalar(1); 1888 const LLT S64 = LLT::scalar(64); 1889 1890 Register Src = MI.getOperand(1).getReg(); 1891 assert(MRI.getType(Src) == S64); 1892 1893 // result = trunc(src) 1894 // if (src > 0.0 && src != result) 1895 // result += 1.0 1896 1897 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1898 1899 const auto Zero = B.buildFConstant(S64, 0.0); 1900 const auto One = B.buildFConstant(S64, 1.0); 1901 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1902 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1903 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1904 auto Add = B.buildSelect(S64, And, One, Zero); 1905 1906 // TODO: Should this propagate fast-math-flags? 
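// Add is 1.0 exactly when the source is positive and has a fractional part,
// so trunc(src) + Add produces ceil(src).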
1907 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeFrem( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 Register DstReg = MI.getOperand(0).getReg(); 1915 Register Src0Reg = MI.getOperand(1).getReg(); 1916 Register Src1Reg = MI.getOperand(2).getReg(); 1917 auto Flags = MI.getFlags(); 1918 LLT Ty = MRI.getType(DstReg); 1919 1920 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1921 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1922 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1923 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1924 MI.eraseFromParent(); 1925 return true; 1926 } 1927 1928 static MachineInstrBuilder extractF64Exponent(Register Hi, 1929 MachineIRBuilder &B) { 1930 const unsigned FractBits = 52; 1931 const unsigned ExpBits = 11; 1932 LLT S32 = LLT::scalar(32); 1933 1934 auto Const0 = B.buildConstant(S32, FractBits - 32); 1935 auto Const1 = B.buildConstant(S32, ExpBits); 1936 1937 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1938 .addUse(Hi) 1939 .addUse(Const0.getReg(0)) 1940 .addUse(Const1.getReg(0)); 1941 1942 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1943 } 1944 1945 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1946 MachineInstr &MI, MachineRegisterInfo &MRI, 1947 MachineIRBuilder &B) const { 1948 const LLT S1 = LLT::scalar(1); 1949 const LLT S32 = LLT::scalar(32); 1950 const LLT S64 = LLT::scalar(64); 1951 1952 Register Src = MI.getOperand(1).getReg(); 1953 assert(MRI.getType(Src) == S64); 1954 1955 // TODO: Should this use extract since the low half is unused? 1956 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1957 Register Hi = Unmerge.getReg(1); 1958 1959 // Extract the upper half, since this is where we will find the sign and 1960 // exponent. 1961 auto Exp = extractF64Exponent(Hi, B); 1962 1963 const unsigned FractBits = 52; 1964 1965 // Extract the sign bit. 1966 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1967 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1968 1969 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1970 1971 const auto Zero32 = B.buildConstant(S32, 0); 1972 1973 // Extend back to 64-bits. 1974 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1975 1976 auto Shr = B.buildAShr(S64, FractMask, Exp); 1977 auto Not = B.buildNot(S64, Shr); 1978 auto Tmp0 = B.buildAnd(S64, Src, Not); 1979 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1980 1981 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1982 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1983 1984 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1985 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1986 MI.eraseFromParent(); 1987 return true; 1988 } 1989 1990 bool AMDGPULegalizerInfo::legalizeITOFP( 1991 MachineInstr &MI, MachineRegisterInfo &MRI, 1992 MachineIRBuilder &B, bool Signed) const { 1993 1994 Register Dst = MI.getOperand(0).getReg(); 1995 Register Src = MI.getOperand(1).getReg(); 1996 1997 const LLT S64 = LLT::scalar(64); 1998 const LLT S32 = LLT::scalar(32); 1999 2000 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2001 2002 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2003 2004 auto CvtHi = Signed ?
2005 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2006 B.buildUITOFP(S64, Unmerge.getReg(1)); 2007 2008 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2009 2010 auto ThirtyTwo = B.buildConstant(S32, 32); 2011 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2012 .addUse(CvtHi.getReg(0)) 2013 .addUse(ThirtyTwo.getReg(0)); 2014 2015 // TODO: Should this propagate fast-math-flags? 2016 B.buildFAdd(Dst, LdExp, CvtLo); 2017 MI.eraseFromParent(); 2018 return true; 2019 } 2020 2021 // TODO: Copied from DAG implementation. Verify logic and document how this 2022 // actually works. 2023 bool AMDGPULegalizerInfo::legalizeFPTOI( 2024 MachineInstr &MI, MachineRegisterInfo &MRI, 2025 MachineIRBuilder &B, bool Signed) const { 2026 2027 Register Dst = MI.getOperand(0).getReg(); 2028 Register Src = MI.getOperand(1).getReg(); 2029 2030 const LLT S64 = LLT::scalar(64); 2031 const LLT S32 = LLT::scalar(32); 2032 2033 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2034 2035 unsigned Flags = MI.getFlags(); 2036 2037 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2038 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2039 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2040 2041 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2042 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2043 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2044 2045 auto Hi = Signed ? 2046 B.buildFPTOSI(S32, FloorMul) : 2047 B.buildFPTOUI(S32, FloorMul); 2048 auto Lo = B.buildFPTOUI(S32, Fma); 2049 2050 B.buildMerge(Dst, { Lo, Hi }); 2051 MI.eraseFromParent(); 2052 2053 return true; 2054 } 2055 2056 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2057 MachineInstr &MI) const { 2058 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2059 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2060 2061 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2062 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2063 2064 // With ieee_mode disabled, the instructions have the correct behavior 2065 // already for G_FMINNUM/G_FMAXNUM 2066 if (!MFI->getMode().IEEE) 2067 return !IsIEEEOp; 2068 2069 if (IsIEEEOp) 2070 return true; 2071 2072 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2073 } 2074 2075 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2076 MachineInstr &MI, MachineRegisterInfo &MRI, 2077 MachineIRBuilder &B) const { 2078 // TODO: Should move some of this into LegalizerHelper. 2079 2080 // TODO: Promote dynamic indexing of s16 to s32 2081 2082 // FIXME: Artifact combiner probably should have replaced the truncated 2083 // constant before this, so we shouldn't need 2084 // getConstantVRegValWithLookThrough. 2085 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2086 MI.getOperand(2).getReg(), MRI); 2087 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2088 return true; 2089 2090 Register Dst = MI.getOperand(0).getReg(); 2091 Register Vec = MI.getOperand(1).getReg(); 2092 2093 LLT VecTy = MRI.getType(Vec); 2094 LLT EltTy = VecTy.getElementType(); 2095 assert(EltTy == MRI.getType(Dst)); 2096 2097 if (IdxVal->Value < VecTy.getNumElements()) 2098 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2099 else 2100 B.buildUndef(Dst); 2101 2102 MI.eraseFromParent(); 2103 return true; 2104 } 2105 2106 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2107 MachineInstr &MI, MachineRegisterInfo &MRI, 2108 MachineIRBuilder &B) const { 2109 // TODO: Should move some of this into LegalizerHelper. 2110 2111 // TODO: Promote dynamic indexing of s16 to s32 2112 2113 // FIXME: Artifact combiner probably should have replaced the truncated 2114 // constant before this, so we shouldn't need 2115 // getConstantVRegValWithLookThrough. 2116 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2117 MI.getOperand(3).getReg(), MRI); 2118 if (!IdxVal) // Dynamic case will be selected to register indexing. 2119 return true; 2120 2121 Register Dst = MI.getOperand(0).getReg(); 2122 Register Vec = MI.getOperand(1).getReg(); 2123 Register Ins = MI.getOperand(2).getReg(); 2124 2125 LLT VecTy = MRI.getType(Vec); 2126 LLT EltTy = VecTy.getElementType(); 2127 assert(EltTy == MRI.getType(Ins)); 2128 2129 if (IdxVal->Value < VecTy.getNumElements()) 2130 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2131 else 2132 B.buildUndef(Dst); 2133 2134 MI.eraseFromParent(); 2135 return true; 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2139 MachineInstr &MI, MachineRegisterInfo &MRI, 2140 MachineIRBuilder &B) const { 2141 const LLT V2S16 = LLT::vector(2, 16); 2142 2143 Register Dst = MI.getOperand(0).getReg(); 2144 Register Src0 = MI.getOperand(1).getReg(); 2145 LLT DstTy = MRI.getType(Dst); 2146 LLT SrcTy = MRI.getType(Src0); 2147 2148 if (SrcTy == V2S16 && DstTy == V2S16 && 2149 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2150 return true; 2151 2152 MachineIRBuilder HelperBuilder(MI); 2153 GISelObserverWrapper DummyObserver; 2154 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2155 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeSinCos( 2159 MachineInstr &MI, MachineRegisterInfo &MRI, 2160 MachineIRBuilder &B) const { 2161 2162 Register DstReg = MI.getOperand(0).getReg(); 2163 Register SrcReg = MI.getOperand(1).getReg(); 2164 LLT Ty = MRI.getType(DstReg); 2165 unsigned Flags = MI.getFlags(); 2166 2167 Register TrigVal; 2168 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2169 if (ST.hasTrigReducedRange()) { 2170 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2171 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2172 .addUse(MulVal.getReg(0)) 2173 .setMIFlags(Flags).getReg(0); 2174 } else 2175 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2176 2177 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2178 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2179 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2180 .addUse(TrigVal) 2181 .setMIFlags(Flags); 2182 MI.eraseFromParent(); 2183 return true; 2184 } 2185 2186 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2187 MachineIRBuilder &B, 2188 const GlobalValue *GV, 2189 int64_t Offset, 2190 unsigned GAFlags) const { 2191 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2192 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2193 // to the following code sequence: 2194 // 2195 // For constant address space: 2196 // s_getpc_b64 s[0:1] 2197 // s_add_u32 s0, s0, $symbol 2198 // s_addc_u32 s1, s1, 0 2199 // 2200 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2201 // a fixup or relocation is emitted to replace $symbol with a literal 2202 // constant, which is a pc-relative offset from the encoding of the $symbol 2203 // operand to the global variable. 2204 // 2205 // For global address space: 2206 // s_getpc_b64 s[0:1] 2207 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2208 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2209 // 2210 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2211 // fixups or relocations are emitted to replace $symbol@*@lo and 2212 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2213 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2214 // operand to the global variable. 2215 // 2216 // What we want here is an offset from the value returned by s_getpc 2217 // (which is the address of the s_add_u32 instruction) to the global 2218 // variable, but since the encoding of $symbol starts 4 bytes after the start 2219 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2220 // small. This requires us to add 4 to the global variable offset in order to 2221 // compute the correct address. 2222 2223 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2224 2225 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2226 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2227 2228 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2229 .addDef(PCReg); 2230 2231 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2232 if (GAFlags == SIInstrInfo::MO_NONE) 2233 MIB.addImm(0); 2234 else 2235 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2236 2237 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2238 2239 if (PtrTy.getSizeInBits() == 32) 2240 B.buildExtract(DstReg, PCReg, 0); 2241 return true; 2242 } 2243 2244 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2245 MachineInstr &MI, MachineRegisterInfo &MRI, 2246 MachineIRBuilder &B) const { 2247 Register DstReg = MI.getOperand(0).getReg(); 2248 LLT Ty = MRI.getType(DstReg); 2249 unsigned AS = Ty.getAddressSpace(); 2250 2251 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2252 MachineFunction &MF = B.getMF(); 2253 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2254 2255 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2256 if (!MFI->isEntryFunction()) { 2257 const Function &Fn = MF.getFunction(); 2258 DiagnosticInfoUnsupported BadLDSDecl( 2259 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2260 DS_Warning); 2261 Fn.getContext().diagnose(BadLDSDecl); 2262 2263 // We currently don't have a way to correctly allocate LDS objects that 2264 // aren't directly associated with a kernel. We do force inlining of 2265 // functions that use local objects. However, if these dead functions are 2266 // not eliminated, we don't want a compile time error. Just emit a warning 2267 // and a trap, since there should be no callable path here. 2268 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2269 B.buildUndef(DstReg); 2270 MI.eraseFromParent(); 2271 return true; 2272 } 2273 2274 // TODO: We could emit code to handle the initialization somewhere. 2275 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2276 const SITargetLowering *TLI = ST.getTargetLowering(); 2277 if (!TLI->shouldUseLDSConstAddress(GV)) { 2278 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2279 return true; // Leave in place; 2280 } 2281 2282 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2283 Type *Ty = GV->getValueType(); 2284 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2285 // zero-sized type in other languages to declare the dynamic shared 2286 // memory which size is not known at the compile time. They will be 2287 // allocated by the runtime and placed directly after the static 2288 // allocated ones. They all share the same offset. 2289 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2290 // Adjust alignment for that dynamic shared memory array. 
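// Its address is the end of the statically allocated LDS, which is the value
// produced by the groupstaticsize intrinsic below.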
2291 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2292 LLT S32 = LLT::scalar(32); 2293 auto Sz = 2294 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2295 B.buildIntToPtr(DstReg, Sz); 2296 MI.eraseFromParent(); 2297 return true; 2298 } 2299 } 2300 2301 B.buildConstant( 2302 DstReg, 2303 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2304 MI.eraseFromParent(); 2305 return true; 2306 } 2307 2308 const Function &Fn = MF.getFunction(); 2309 DiagnosticInfoUnsupported BadInit( 2310 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2311 Fn.getContext().diagnose(BadInit); 2312 return true; 2313 } 2314 2315 const SITargetLowering *TLI = ST.getTargetLowering(); 2316 2317 if (TLI->shouldEmitFixup(GV)) { 2318 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2319 MI.eraseFromParent(); 2320 return true; 2321 } 2322 2323 if (TLI->shouldEmitPCReloc(GV)) { 2324 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2325 MI.eraseFromParent(); 2326 return true; 2327 } 2328 2329 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2330 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2331 2332 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2333 MachinePointerInfo::getGOT(MF), 2334 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2335 MachineMemOperand::MOInvariant, 2336 8 /*Size*/, Align(8)); 2337 2338 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2339 2340 if (Ty.getSizeInBits() == 32) { 2341 // Truncate if this is a 32-bit constant address. 2342 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2343 B.buildExtract(DstReg, Load, 0); 2344 } else 2345 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2346 2347 MI.eraseFromParent(); 2348 return true; 2349 } 2350 2351 static LLT widenToNextPowerOf2(LLT Ty) { 2352 if (Ty.isVector()) 2353 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2354 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2355 } 2356 2357 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2358 MachineInstr &MI) const { 2359 MachineIRBuilder &B = Helper.MIRBuilder; 2360 MachineRegisterInfo &MRI = *B.getMRI(); 2361 GISelChangeObserver &Observer = Helper.Observer; 2362 2363 Register PtrReg = MI.getOperand(1).getReg(); 2364 LLT PtrTy = MRI.getType(PtrReg); 2365 unsigned AddrSpace = PtrTy.getAddressSpace(); 2366 2367 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2368 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2369 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2370 Observer.changingInstr(MI); 2371 MI.getOperand(1).setReg(Cast.getReg(0)); 2372 Observer.changedInstr(MI); 2373 return true; 2374 } 2375 2376 Register ValReg = MI.getOperand(0).getReg(); 2377 LLT ValTy = MRI.getType(ValReg); 2378 2379 MachineMemOperand *MMO = *MI.memoperands_begin(); 2380 const unsigned ValSize = ValTy.getSizeInBits(); 2381 const unsigned MemSize = 8 * MMO->getSize(); 2382 const Align MemAlign = MMO->getAlign(); 2383 const unsigned AlignInBits = 8 * MemAlign.value(); 2384 2385 // Widen non-power-of-2 loads to the alignment if needed 2386 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2387 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2388 2389 // This was already the correct extending load result type, so just adjust 2390 // the memory type.
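// (For example, an extending load of 3 bytes into an s32 result only needs
// its memory size rounded up to 4 bytes.)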
2391 if (WideMemSize == ValSize) { 2392 MachineFunction &MF = B.getMF(); 2393 2394 MachineMemOperand *WideMMO = 2395 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2396 Observer.changingInstr(MI); 2397 MI.setMemRefs(MF, {WideMMO}); 2398 Observer.changedInstr(MI); 2399 return true; 2400 } 2401 2402 // Don't bother handling edge case that should probably never be produced. 2403 if (ValSize > WideMemSize) 2404 return false; 2405 2406 LLT WideTy = widenToNextPowerOf2(ValTy); 2407 2408 Register WideLoad; 2409 if (!WideTy.isVector()) { 2410 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2411 B.buildTrunc(ValReg, WideLoad).getReg(0); 2412 } else { 2413 // Extract the subvector. 2414 2415 if (isRegisterType(ValTy)) { 2416 // If this a case where G_EXTRACT is legal, use it. 2417 // (e.g. <3 x s32> -> <4 x s32>) 2418 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2419 B.buildExtract(ValReg, WideLoad, 0); 2420 } else { 2421 // For cases where the widened type isn't a nice register value, unmerge 2422 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2423 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2424 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2425 B.setInsertPt(B.getMBB(), MI.getIterator()); 2426 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2427 } 2428 } 2429 2430 MI.eraseFromParent(); 2431 return true; 2432 } 2433 2434 return false; 2435 } 2436 2437 bool AMDGPULegalizerInfo::legalizeFMad( 2438 MachineInstr &MI, MachineRegisterInfo &MRI, 2439 MachineIRBuilder &B) const { 2440 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2441 assert(Ty.isScalar()); 2442 2443 MachineFunction &MF = B.getMF(); 2444 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2445 2446 // TODO: Always legal with future ftz flag. 2447 // FIXME: Do we need just output? 
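// Keep G_FMAD only when denormals are flushed for the result type; otherwise
// expand it to fmul + fadd via the generic lowering.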
2448 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2449 return true; 2450 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2451 return true; 2452 2453 MachineIRBuilder HelperBuilder(MI); 2454 GISelObserverWrapper DummyObserver; 2455 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2456 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2457 } 2458 2459 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2460 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2461 Register DstReg = MI.getOperand(0).getReg(); 2462 Register PtrReg = MI.getOperand(1).getReg(); 2463 Register CmpVal = MI.getOperand(2).getReg(); 2464 Register NewVal = MI.getOperand(3).getReg(); 2465 2466 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2467 "this should not have been custom lowered"); 2468 2469 LLT ValTy = MRI.getType(CmpVal); 2470 LLT VecTy = LLT::vector(2, ValTy); 2471 2472 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2473 2474 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2475 .addDef(DstReg) 2476 .addUse(PtrReg) 2477 .addUse(PackedVal) 2478 .setMemRefs(MI.memoperands()); 2479 2480 MI.eraseFromParent(); 2481 return true; 2482 } 2483 2484 bool AMDGPULegalizerInfo::legalizeFlog( 2485 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2486 Register Dst = MI.getOperand(0).getReg(); 2487 Register Src = MI.getOperand(1).getReg(); 2488 LLT Ty = B.getMRI()->getType(Dst); 2489 unsigned Flags = MI.getFlags(); 2490 2491 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2492 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2493 2494 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2495 MI.eraseFromParent(); 2496 return true; 2497 } 2498 2499 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2500 MachineIRBuilder &B) const { 2501 Register Dst = MI.getOperand(0).getReg(); 2502 Register Src = MI.getOperand(1).getReg(); 2503 unsigned Flags = MI.getFlags(); 2504 LLT Ty = B.getMRI()->getType(Dst); 2505 2506 auto K = B.buildFConstant(Ty, numbers::log2e); 2507 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2508 B.buildFExp2(Dst, Mul, Flags); 2509 MI.eraseFromParent(); 2510 return true; 2511 } 2512 2513 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2514 MachineIRBuilder &B) const { 2515 Register Dst = MI.getOperand(0).getReg(); 2516 Register Src0 = MI.getOperand(1).getReg(); 2517 Register Src1 = MI.getOperand(2).getReg(); 2518 unsigned Flags = MI.getFlags(); 2519 LLT Ty = B.getMRI()->getType(Dst); 2520 const LLT S16 = LLT::scalar(16); 2521 const LLT S32 = LLT::scalar(32); 2522 2523 if (Ty == S32) { 2524 auto Log = B.buildFLog2(S32, Src0, Flags); 2525 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2526 .addUse(Log.getReg(0)) 2527 .addUse(Src1) 2528 .setMIFlags(Flags); 2529 B.buildFExp2(Dst, Mul, Flags); 2530 } else if (Ty == S16) { 2531 // There's no f16 fmul_legacy, so we need to convert for it. 
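// Take log2 in f16, extend both multiply operands to f32 for fmul_legacy,
// then truncate back to f16 before the exp2.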
2532 auto Log = B.buildFLog2(S16, Src0, Flags); 2533 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2534 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2535 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2536 .addUse(Ext0.getReg(0)) 2537 .addUse(Ext1.getReg(0)) 2538 .setMIFlags(Flags); 2539 2540 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2541 } else 2542 return false; 2543 2544 MI.eraseFromParent(); 2545 return true; 2546 } 2547 2548 // Find a source register, ignoring any possible source modifiers. 2549 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2550 Register ModSrc = OrigSrc; 2551 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2552 ModSrc = SrcFNeg->getOperand(1).getReg(); 2553 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2554 ModSrc = SrcFAbs->getOperand(1).getReg(); 2555 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2556 ModSrc = SrcFAbs->getOperand(1).getReg(); 2557 return ModSrc; 2558 } 2559 2560 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2561 MachineRegisterInfo &MRI, 2562 MachineIRBuilder &B) const { 2563 2564 const LLT S1 = LLT::scalar(1); 2565 const LLT S64 = LLT::scalar(64); 2566 Register Dst = MI.getOperand(0).getReg(); 2567 Register OrigSrc = MI.getOperand(1).getReg(); 2568 unsigned Flags = MI.getFlags(); 2569 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2570 "this should not have been custom lowered"); 2571 2572 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2573 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2574 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2575 // V_FRACT bug is: 2576 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2577 // 2578 // Convert floor(x) to (x - fract(x)) 2579 2580 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2581 .addUse(OrigSrc) 2582 .setMIFlags(Flags); 2583 2584 // Give source modifier matching some assistance before obscuring a foldable 2585 // pattern. 2586 2587 // TODO: We can avoid the neg on the fract? The input sign to fract 2588 // shouldn't matter? 2589 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2590 2591 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2592 2593 Register Min = MRI.createGenericVirtualRegister(S64); 2594 2595 // We don't need to concern ourselves with the snan handling difference, so 2596 // use the one which will directly select. 2597 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2598 if (MFI->getMode().IEEE) 2599 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2600 else 2601 B.buildFMinNum(Min, Fract, Const, Flags); 2602 2603 Register CorrectedFract = Min; 2604 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2605 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2606 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2607 } 2608 2609 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2610 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2611 2612 MI.eraseFromParent(); 2613 return true; 2614 } 2615 2616 // Turn an illegal packed v2s16 build vector into bit operations. 2617 // TODO: This should probably be a bitcast action in LegalizerHelper. 
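// The two s16 sources are merged into an s32, and the result is bitcast to
// the <2 x s16> destination.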
2618 bool AMDGPULegalizerInfo::legalizeBuildVector( 2619 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2620 Register Dst = MI.getOperand(0).getReg(); 2621 const LLT S32 = LLT::scalar(32); 2622 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2623 2624 Register Src0 = MI.getOperand(1).getReg(); 2625 Register Src1 = MI.getOperand(2).getReg(); 2626 assert(MRI.getType(Src0) == LLT::scalar(16)); 2627 2628 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2629 B.buildBitcast(Dst, Merge); 2630 2631 MI.eraseFromParent(); 2632 return true; 2633 } 2634 2635 // Return the use branch instruction, otherwise null if the usage is invalid. 2636 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2637 MachineRegisterInfo &MRI, 2638 MachineInstr *&Br, 2639 MachineBasicBlock *&UncondBrTarget) { 2640 Register CondDef = MI.getOperand(0).getReg(); 2641 if (!MRI.hasOneNonDBGUse(CondDef)) 2642 return nullptr; 2643 2644 MachineBasicBlock *Parent = MI.getParent(); 2645 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2646 if (UseMI.getParent() != Parent || 2647 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2648 return nullptr; 2649 2650 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2651 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2652 if (Next == Parent->end()) { 2653 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2654 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2655 return nullptr; 2656 UncondBrTarget = &*NextMBB; 2657 } else { 2658 if (Next->getOpcode() != AMDGPU::G_BR) 2659 return nullptr; 2660 Br = &*Next; 2661 UncondBrTarget = Br->getOperand(0).getMBB(); 2662 } 2663 2664 return &UseMI; 2665 } 2666 2667 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2668 const ArgDescriptor *Arg, 2669 const TargetRegisterClass *ArgRC, 2670 LLT ArgTy) const { 2671 MCRegister SrcReg = Arg->getRegister(); 2672 assert(SrcReg.isPhysical() && "Physical register expected"); 2673 assert(DstReg.isVirtual() && "Virtual register expected"); 2674 2675 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2676 ArgTy); 2677 if (Arg->isMasked()) { 2678 // TODO: Should we try to emit this once in the entry block? 
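// Extract the argument's bitfield: shift the live-in right by the mask's
// trailing zero count, then AND with the shifted-down mask.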
2679 const LLT S32 = LLT::scalar(32); 2680 const unsigned Mask = Arg->getMask(); 2681 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2682 2683 Register AndMaskSrc = LiveIn; 2684 2685 if (Shift != 0) { 2686 auto ShiftAmt = B.buildConstant(S32, Shift); 2687 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2688 } 2689 2690 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2691 } else { 2692 B.buildCopy(DstReg, LiveIn); 2693 } 2694 2695 return true; 2696 } 2697 2698 bool AMDGPULegalizerInfo::loadInputValue( 2699 Register DstReg, MachineIRBuilder &B, 2700 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2701 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2702 const ArgDescriptor *Arg; 2703 const TargetRegisterClass *ArgRC; 2704 LLT ArgTy; 2705 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2706 2707 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2708 return false; // TODO: Handle these 2709 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2710 } 2711 2712 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2713 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2714 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2715 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2716 return false; 2717 2718 MI.eraseFromParent(); 2719 return true; 2720 } 2721 2722 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2723 MachineRegisterInfo &MRI, 2724 MachineIRBuilder &B) const { 2725 Register Dst = MI.getOperand(0).getReg(); 2726 LLT DstTy = MRI.getType(Dst); 2727 LLT S16 = LLT::scalar(16); 2728 LLT S32 = LLT::scalar(32); 2729 LLT S64 = LLT::scalar(64); 2730 2731 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2732 return true; 2733 2734 if (DstTy == S16) 2735 return legalizeFDIV16(MI, MRI, B); 2736 if (DstTy == S32) 2737 return legalizeFDIV32(MI, MRI, B); 2738 if (DstTy == S64) 2739 return legalizeFDIV64(MI, MRI, B); 2740 2741 return false; 2742 } 2743 2744 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2745 Register DstReg, 2746 Register X, 2747 Register Y, 2748 bool IsDiv) const { 2749 const LLT S1 = LLT::scalar(1); 2750 const LLT S32 = LLT::scalar(32); 2751 2752 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2753 // algorithm used here. 2754 2755 // Initial estimate of inv(y). 2756 auto FloatY = B.buildUITOFP(S32, Y); 2757 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2758 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2759 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2760 auto Z = B.buildFPTOUI(S32, ScaledY); 2761 2762 // One round of UNR. 2763 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2764 auto NegYZ = B.buildMul(S32, NegY, Z); 2765 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2766 2767 // Quotient/remainder estimate. 2768 auto Q = B.buildUMulH(S32, X, Z); 2769 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2770 2771 // First quotient/remainder refinement. 2772 auto One = B.buildConstant(S32, 1); 2773 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2774 if (IsDiv) 2775 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2776 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2777 2778 // Second quotient/remainder refinement. 
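// Same correction as above: if the remainder is still >= the denominator,
// bump the quotient and subtract the denominator once more.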
2779 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2780 if (IsDiv) 2781 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2782 else 2783 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2784 } 2785 2786 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2787 MachineRegisterInfo &MRI, 2788 MachineIRBuilder &B) const { 2789 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2790 Register DstReg = MI.getOperand(0).getReg(); 2791 Register Num = MI.getOperand(1).getReg(); 2792 Register Den = MI.getOperand(2).getReg(); 2793 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2794 MI.eraseFromParent(); 2795 return true; 2796 } 2797 2798 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2799 // 2800 // Return lo, hi of result 2801 // 2802 // %cvt.lo = G_UITOFP Val.lo 2803 // %cvt.hi = G_UITOFP Val.hi 2804 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2805 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2806 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2807 // %mul2 = G_FMUL %mul1, 2**(-32) 2808 // %trunc = G_INTRINSIC_TRUNC %mul2 2809 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2810 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2811 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2812 Register Val) { 2813 const LLT S32 = LLT::scalar(32); 2814 auto Unmerge = B.buildUnmerge(S32, Val); 2815 2816 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2817 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2818 2819 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2820 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2821 2822 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2823 auto Mul1 = 2824 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2825 2826 // 2**(-32) 2827 auto Mul2 = 2828 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2829 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2830 2831 // -(2**32) 2832 auto Mad2 = B.buildFMAD(S32, Trunc, 2833 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2834 2835 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2836 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2837 2838 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2839 } 2840 2841 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2842 Register DstReg, 2843 Register Numer, 2844 Register Denom, 2845 bool IsDiv) const { 2846 const LLT S32 = LLT::scalar(32); 2847 const LLT S64 = LLT::scalar(64); 2848 const LLT S1 = LLT::scalar(1); 2849 Register RcpLo, RcpHi; 2850 2851 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2852 2853 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2854 2855 auto Zero64 = B.buildConstant(S64, 0); 2856 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2857 2858 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2859 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2860 2861 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2862 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2863 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2864 2865 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2866 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2867 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2868 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2869 2870 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2871 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2872 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2873 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2874 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2875 2876 auto Zero32 =
B.buildConstant(S32, 0); 2877 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2878 auto Add2_HiC = 2879 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2880 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2881 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2882 2883 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2884 Register NumerLo = UnmergeNumer.getReg(0); 2885 Register NumerHi = UnmergeNumer.getReg(1); 2886 2887 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2888 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2889 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2890 Register Mul3_Lo = UnmergeMul3.getReg(0); 2891 Register Mul3_Hi = UnmergeMul3.getReg(1); 2892 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2893 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2894 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2895 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2896 2897 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2898 Register DenomLo = UnmergeDenom.getReg(0); 2899 Register DenomHi = UnmergeDenom.getReg(1); 2900 2901 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2902 auto C1 = B.buildSExt(S32, CmpHi); 2903 2904 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2905 auto C2 = B.buildSExt(S32, CmpLo); 2906 2907 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2908 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2909 2910 // TODO: Here and below portions of the code can be enclosed into if/endif. 2911 // Currently control flow is unconditional and we have 4 selects after 2912 // potential endif to substitute PHIs. 2913 2914 // if C3 != 0 ... 2915 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2916 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2917 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2918 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2919 2920 auto One64 = B.buildConstant(S64, 1); 2921 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2922 2923 auto C4 = 2924 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2925 auto C5 = 2926 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2927 auto C6 = B.buildSelect( 2928 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2929 2930 // if (C6 != 0) 2931 auto Add4 = B.buildAdd(S64, Add3, One64); 2932 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2933 2934 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2935 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2936 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2937 2938 // endif C6 2939 // endif C3 2940 2941 if (IsDiv) { 2942 auto Sel1 = B.buildSelect( 2943 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2944 B.buildSelect(DstReg, 2945 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2946 } else { 2947 auto Sel2 = B.buildSelect( 2948 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2949 B.buildSelect(DstReg, 2950 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2951 } 2952 } 2953 2954 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2955 MachineRegisterInfo &MRI, 2956 MachineIRBuilder &B) const { 2957 const LLT S64 = LLT::scalar(64); 2958 const LLT S32 = LLT::scalar(32); 2959 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2960 Register DstReg = MI.getOperand(0).getReg(); 2961 Register Num 
= MI.getOperand(1).getReg(); 2962 Register Den = MI.getOperand(2).getReg(); 2963 LLT Ty = MRI.getType(DstReg); 2964 2965 if (Ty == S32) 2966 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2967 else if (Ty == S64) 2968 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2969 else 2970 return false; 2971 2972 MI.eraseFromParent(); 2973 return true; 2974 2975 } 2976 2977 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2978 MachineRegisterInfo &MRI, 2979 MachineIRBuilder &B) const { 2980 const LLT S64 = LLT::scalar(64); 2981 const LLT S32 = LLT::scalar(32); 2982 2983 Register DstReg = MI.getOperand(0).getReg(); 2984 const LLT Ty = MRI.getType(DstReg); 2985 if (Ty != S32 && Ty != S64) 2986 return false; 2987 2988 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2989 2990 Register LHS = MI.getOperand(1).getReg(); 2991 Register RHS = MI.getOperand(2).getReg(); 2992 2993 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2994 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2995 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2996 2997 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2998 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2999 3000 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3001 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3002 3003 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3004 if (Ty == S32) 3005 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3006 else 3007 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3008 3009 Register Sign; 3010 if (IsDiv) 3011 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3012 else 3013 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3014 3015 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3016 B.buildSub(DstReg, UDivRem, Sign); 3017 3018 MI.eraseFromParent(); 3019 return true; 3020 } 3021 3022 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3023 MachineRegisterInfo &MRI, 3024 MachineIRBuilder &B) const { 3025 Register Res = MI.getOperand(0).getReg(); 3026 Register LHS = MI.getOperand(1).getReg(); 3027 Register RHS = MI.getOperand(2).getReg(); 3028 3029 uint16_t Flags = MI.getFlags(); 3030 3031 LLT ResTy = MRI.getType(Res); 3032 LLT S32 = LLT::scalar(32); 3033 LLT S64 = LLT::scalar(64); 3034 3035 const MachineFunction &MF = B.getMF(); 3036 bool Unsafe = 3037 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 3038 3039 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 3040 return false; 3041 3042 if (!Unsafe && ResTy == S32 && 3043 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 3044 return false; 3045 3046 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3047 // 1 / x -> RCP(x) 3048 if (CLHS->isExactlyValue(1.0)) { 3049 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3050 .addUse(RHS) 3051 .setMIFlags(Flags); 3052 3053 MI.eraseFromParent(); 3054 return true; 3055 } 3056 3057 // -1 / x -> RCP( FNEG(x) ) 3058 if (CLHS->isExactlyValue(-1.0)) { 3059 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3060 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3061 .addUse(FNeg.getReg(0)) 3062 .setMIFlags(Flags); 3063 3064 MI.eraseFromParent(); 3065 return true; 3066 } 3067 } 3068 3069 // x / y -> x * (1.0 / y) 3070 if (Unsafe) { 3071 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3072 .addUse(RHS) 3073 .setMIFlags(Flags); 3074 B.buildFMul(Res, LHS, RCP, Flags); 3075 3076 MI.eraseFromParent(); 3077 return true; 3078 } 3079 3080 return false; 3081 } 3082 3083 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3084 MachineRegisterInfo &MRI, 3085 MachineIRBuilder &B) const { 3086 Register Res = MI.getOperand(0).getReg(); 3087 Register LHS = MI.getOperand(1).getReg(); 3088 Register RHS = MI.getOperand(2).getReg(); 3089 3090 uint16_t Flags = MI.getFlags(); 3091 3092 LLT S16 = LLT::scalar(16); 3093 LLT S32 = LLT::scalar(32); 3094 3095 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3096 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3097 3098 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3099 .addUse(RHSExt.getReg(0)) 3100 .setMIFlags(Flags); 3101 3102 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3103 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3104 3105 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3106 .addUse(RDst.getReg(0)) 3107 .addUse(RHS) 3108 .addUse(LHS) 3109 .setMIFlags(Flags); 3110 3111 MI.eraseFromParent(); 3112 return true; 3113 } 3114 3115 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3116 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3117 static void toggleSPDenormMode(bool Enable, 3118 MachineIRBuilder &B, 3119 const GCNSubtarget &ST, 3120 AMDGPU::SIModeRegisterDefaults Mode) { 3121 // Set SP denorm mode to this value. 3122 unsigned SPDenormMode = 3123 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3124 3125 if (ST.hasDenormModeInst()) { 3126 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3127 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3128 3129 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3130 B.buildInstr(AMDGPU::S_DENORM_MODE) 3131 .addImm(NewDenormModeValue); 3132 3133 } else { 3134 // Select FP32 bit field in mode register. 3135 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3136 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3137 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3138 3139 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3140 .addImm(SPDenormMode) 3141 .addImm(SPDenormModeBitField); 3142 } 3143 } 3144 3145 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3146 MachineRegisterInfo &MRI, 3147 MachineIRBuilder &B) const { 3148 Register Res = MI.getOperand(0).getReg(); 3149 Register LHS = MI.getOperand(1).getReg(); 3150 Register RHS = MI.getOperand(2).getReg(); 3151 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3152 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3153 3154 uint16_t Flags = MI.getFlags(); 3155 3156 LLT S32 = LLT::scalar(32); 3157 LLT S1 = LLT::scalar(1); 3158 3159 auto One = B.buildFConstant(S32, 1.0f); 3160 3161 auto DenominatorScaled = 3162 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3163 .addUse(LHS) 3164 .addUse(RHS) 3165 .addImm(0) 3166 .setMIFlags(Flags); 3167 auto NumeratorScaled = 3168 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3169 .addUse(LHS) 3170 .addUse(RHS) 3171 .addImm(1) 3172 .setMIFlags(Flags); 3173 3174 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3175 .addUse(DenominatorScaled.getReg(0)) 3176 .setMIFlags(Flags); 3177 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3178 3179 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3180 // aren't modeled as reading it. 
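// If FP32 denormals are flushed by default, enable them around the FMA-based
// refinement below and restore the previous mode afterwards.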
3181 if (!Mode.allFP32Denormals()) 3182 toggleSPDenormMode(true, B, ST, Mode); 3183 3184 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3185 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3186 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3187 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3188 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3189 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3190 3191 if (!Mode.allFP32Denormals()) 3192 toggleSPDenormMode(false, B, ST, Mode); 3193 3194 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3195 .addUse(Fma4.getReg(0)) 3196 .addUse(Fma1.getReg(0)) 3197 .addUse(Fma3.getReg(0)) 3198 .addUse(NumeratorScaled.getReg(1)) 3199 .setMIFlags(Flags); 3200 3201 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3202 .addUse(Fmas.getReg(0)) 3203 .addUse(RHS) 3204 .addUse(LHS) 3205 .setMIFlags(Flags); 3206 3207 MI.eraseFromParent(); 3208 return true; 3209 } 3210 3211 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3212 MachineRegisterInfo &MRI, 3213 MachineIRBuilder &B) const { 3214 Register Res = MI.getOperand(0).getReg(); 3215 Register LHS = MI.getOperand(1).getReg(); 3216 Register RHS = MI.getOperand(2).getReg(); 3217 3218 uint16_t Flags = MI.getFlags(); 3219 3220 LLT S64 = LLT::scalar(64); 3221 LLT S1 = LLT::scalar(1); 3222 3223 auto One = B.buildFConstant(S64, 1.0); 3224 3225 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3226 .addUse(LHS) 3227 .addUse(RHS) 3228 .addImm(0) 3229 .setMIFlags(Flags); 3230 3231 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3232 3233 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3234 .addUse(DivScale0.getReg(0)) 3235 .setMIFlags(Flags); 3236 3237 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3238 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3239 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3240 3241 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3242 .addUse(LHS) 3243 .addUse(RHS) 3244 .addImm(1) 3245 .setMIFlags(Flags); 3246 3247 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3248 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3249 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3250 3251 Register Scale; 3252 if (!ST.hasUsableDivScaleConditionOutput()) { 3253 // Workaround a hardware bug on SI where the condition output from div_scale 3254 // is not usable. 
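// Reconstruct the condition by comparing the high halves of the operands
// against the corresponding div_scale results and XORing the two compares.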
3255 3256 LLT S32 = LLT::scalar(32); 3257 3258 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3259 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3260 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3261 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3262 3263 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3264 Scale1Unmerge.getReg(1)); 3265 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3266 Scale0Unmerge.getReg(1)); 3267 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3268 } else { 3269 Scale = DivScale1.getReg(1); 3270 } 3271 3272 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3273 .addUse(Fma4.getReg(0)) 3274 .addUse(Fma3.getReg(0)) 3275 .addUse(Mul.getReg(0)) 3276 .addUse(Scale) 3277 .setMIFlags(Flags); 3278 3279 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3280 .addUse(Fmas.getReg(0)) 3281 .addUse(RHS) 3282 .addUse(LHS) 3283 .setMIFlags(Flags); 3284 3285 MI.eraseFromParent(); 3286 return true; 3287 } 3288 3289 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3290 MachineRegisterInfo &MRI, 3291 MachineIRBuilder &B) const { 3292 Register Res = MI.getOperand(0).getReg(); 3293 Register LHS = MI.getOperand(2).getReg(); 3294 Register RHS = MI.getOperand(3).getReg(); 3295 uint16_t Flags = MI.getFlags(); 3296 3297 LLT S32 = LLT::scalar(32); 3298 LLT S1 = LLT::scalar(1); 3299 3300 auto Abs = B.buildFAbs(S32, RHS, Flags); 3301 const APFloat C0Val(1.0f); 3302 3303 auto C0 = B.buildConstant(S32, 0x6f800000); 3304 auto C1 = B.buildConstant(S32, 0x2f800000); 3305 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3306 3307 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3308 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3309 3310 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3311 3312 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3313 .addUse(Mul0.getReg(0)) 3314 .setMIFlags(Flags); 3315 3316 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3317 3318 B.buildFMul(Res, Sel, Mul1, Flags); 3319 3320 MI.eraseFromParent(); 3321 return true; 3322 } 3323 3324 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3325 // FIXME: Why do we handle this one but not other removed instructions? 3326 // 3327 // Reciprocal square root. The clamp prevents infinite results, clamping 3328 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3329 // +-max_float. 3330 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3331 MachineRegisterInfo &MRI, 3332 MachineIRBuilder &B) const { 3333 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3334 return true; 3335 3336 Register Dst = MI.getOperand(0).getReg(); 3337 Register Src = MI.getOperand(2).getReg(); 3338 auto Flags = MI.getFlags(); 3339 3340 LLT Ty = MRI.getType(Dst); 3341 3342 const fltSemantics *FltSemantics; 3343 if (Ty == LLT::scalar(32)) 3344 FltSemantics = &APFloat::IEEEsingle(); 3345 else if (Ty == LLT::scalar(64)) 3346 FltSemantics = &APFloat::IEEEdouble(); 3347 else 3348 return false; 3349 3350 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3351 .addUse(Src) 3352 .setMIFlags(Flags); 3353 3354 // We don't need to concern ourselves with the snan handling difference, since 3355 // the rsq quieted (or not) so use the one which will directly select. 
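// Clamp the rsq result to +/-max_float with a min/max pair, using the IEEE
// or non-IEEE variants to match the function's FP mode.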
3356 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3357 const bool UseIEEE = MFI->getMode().IEEE; 3358 3359 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3360 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3361 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3362 3363 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3364 3365 if (UseIEEE) 3366 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3367 else 3368 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3369 MI.eraseFromParent(); 3370 return true; 3371 } 3372 3373 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3374 switch (IID) { 3375 case Intrinsic::amdgcn_ds_fadd: 3376 return AMDGPU::G_ATOMICRMW_FADD; 3377 case Intrinsic::amdgcn_ds_fmin: 3378 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3379 case Intrinsic::amdgcn_ds_fmax: 3380 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3381 default: 3382 llvm_unreachable("not a DS FP intrinsic"); 3383 } 3384 } 3385 3386 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3387 MachineInstr &MI, 3388 Intrinsic::ID IID) const { 3389 GISelChangeObserver &Observer = Helper.Observer; 3390 Observer.changingInstr(MI); 3391 3392 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3393 3394 // The remaining operands were used to set fields in the MemOperand on 3395 // construction. 3396 for (int I = 6; I > 3; --I) 3397 MI.RemoveOperand(I); 3398 3399 MI.RemoveOperand(1); // Remove the intrinsic ID. 3400 Observer.changedInstr(MI); 3401 return true; 3402 } 3403 3404 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3405 MachineRegisterInfo &MRI, 3406 MachineIRBuilder &B) const { 3407 uint64_t Offset = 3408 ST.getTargetLowering()->getImplicitParameterOffset( 3409 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3410 LLT DstTy = MRI.getType(DstReg); 3411 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3412 3413 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3414 if (!loadInputValue(KernargPtrReg, B, 3415 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3416 return false; 3417 3418 // FIXME: This should be nuw 3419 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3420 return true; 3421 } 3422 3423 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3424 MachineRegisterInfo &MRI, 3425 MachineIRBuilder &B) const { 3426 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3427 if (!MFI->isEntryFunction()) { 3428 return legalizePreloadedArgIntrin(MI, MRI, B, 3429 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3430 } 3431 3432 Register DstReg = MI.getOperand(0).getReg(); 3433 if (!getImplicitArgPtr(DstReg, MRI, B)) 3434 return false; 3435 3436 MI.eraseFromParent(); 3437 return true; 3438 } 3439 3440 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3441 MachineRegisterInfo &MRI, 3442 MachineIRBuilder &B, 3443 unsigned AddrSpace) const { 3444 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3445 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3446 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3447 MI.eraseFromParent(); 3448 return true; 3449 } 3450 3451 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3452 // offset (the offset that is included in bounds checking and swizzling, to be 3453 // split between the instruction's voffset and immoffset fields) and soffset 3454 // (the offset that is 
excluded from bounds checking and swizzling, to go in 3455 // the instruction's soffset field). This function takes the first kind of 3456 // offset and figures out how to split it between voffset and immoffset. 3457 std::tuple<Register, unsigned, unsigned> 3458 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3459 Register OrigOffset) const { 3460 const unsigned MaxImm = 4095; 3461 Register BaseReg; 3462 unsigned TotalConstOffset; 3463 MachineInstr *OffsetDef; 3464 const LLT S32 = LLT::scalar(32); 3465 3466 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3467 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3468 3469 unsigned ImmOffset = TotalConstOffset; 3470 3471 // If the immediate value is too big for the immoffset field, put the value 3472 // and -4096 into the immoffset field so that the value that is copied/added 3473 // for the voffset field is a multiple of 4096, and it stands more chance 3474 // of being CSEd with the copy/add for another similar load/store. 3475 // However, do not do that rounding down to a multiple of 4096 if that is a 3476 // negative number, as it appears to be illegal to have a negative offset 3477 // in the vgpr, even if adding the immediate offset makes it positive. 3478 unsigned Overflow = ImmOffset & ~MaxImm; 3479 ImmOffset -= Overflow; 3480 if ((int32_t)Overflow < 0) { 3481 Overflow += ImmOffset; 3482 ImmOffset = 0; 3483 } 3484 3485 if (Overflow != 0) { 3486 if (!BaseReg) { 3487 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3488 } else { 3489 auto OverflowVal = B.buildConstant(S32, Overflow); 3490 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3491 } 3492 } 3493 3494 if (!BaseReg) 3495 BaseReg = B.buildConstant(S32, 0).getReg(0); 3496 3497 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3498 } 3499 3500 /// Handle register layout difference for f16 images for some subtargets. 3501 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3502 MachineRegisterInfo &MRI, 3503 Register Reg) const { 3504 if (!ST.hasUnpackedD16VMem()) 3505 return Reg; 3506 3507 const LLT S16 = LLT::scalar(16); 3508 const LLT S32 = LLT::scalar(32); 3509 LLT StoreVT = MRI.getType(Reg); 3510 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3511 3512 auto Unmerge = B.buildUnmerge(S16, Reg); 3513 3514 SmallVector<Register, 4> WideRegs; 3515 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3516 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3517 3518 int NumElts = StoreVT.getNumElements(); 3519 3520 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3521 } 3522 3523 Register AMDGPULegalizerInfo::fixStoreSourceType( 3524 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3525 MachineRegisterInfo *MRI = B.getMRI(); 3526 LLT Ty = MRI->getType(VData); 3527 3528 const LLT S16 = LLT::scalar(16); 3529 3530 // Fixup illegal register types for i8 stores. 
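  // Note, from reading the store lowering below: sub-dword sources are simply
  // any-extended to s32 here; the byte/short store variant is still chosen
  // later from the memory operand's size, not from the register type.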
3531 if (Ty == LLT::scalar(8) || Ty == S16) { 3532 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3533 return AnyExt; 3534 } 3535 3536 if (Ty.isVector()) { 3537 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3538 if (IsFormat) 3539 return handleD16VData(B, *MRI, VData); 3540 } 3541 } 3542 3543 return VData; 3544 } 3545 3546 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3547 MachineRegisterInfo &MRI, 3548 MachineIRBuilder &B, 3549 bool IsTyped, 3550 bool IsFormat) const { 3551 Register VData = MI.getOperand(1).getReg(); 3552 LLT Ty = MRI.getType(VData); 3553 LLT EltTy = Ty.getScalarType(); 3554 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3555 const LLT S32 = LLT::scalar(32); 3556 3557 VData = fixStoreSourceType(B, VData, IsFormat); 3558 Register RSrc = MI.getOperand(2).getReg(); 3559 3560 MachineMemOperand *MMO = *MI.memoperands_begin(); 3561 const int MemSize = MMO->getSize(); 3562 3563 unsigned ImmOffset; 3564 unsigned TotalOffset; 3565 3566 // The typed intrinsics add an immediate after the registers. 3567 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3568 3569 // The struct intrinsic variants add one additional operand over raw. 3570 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3571 Register VIndex; 3572 int OpOffset = 0; 3573 if (HasVIndex) { 3574 VIndex = MI.getOperand(3).getReg(); 3575 OpOffset = 1; 3576 } 3577 3578 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3579 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3580 3581 unsigned Format = 0; 3582 if (IsTyped) { 3583 Format = MI.getOperand(5 + OpOffset).getImm(); 3584 ++OpOffset; 3585 } 3586 3587 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3588 3589 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3590 if (TotalOffset != 0) 3591 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3592 3593 unsigned Opc; 3594 if (IsTyped) { 3595 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3596 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3597 } else if (IsFormat) { 3598 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3599 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3600 } else { 3601 switch (MemSize) { 3602 case 1: 3603 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3604 break; 3605 case 2: 3606 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3607 break; 3608 default: 3609 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3610 break; 3611 } 3612 } 3613 3614 if (!VIndex) 3615 VIndex = B.buildConstant(S32, 0).getReg(0); 3616 3617 auto MIB = B.buildInstr(Opc) 3618 .addUse(VData) // vdata 3619 .addUse(RSrc) // rsrc 3620 .addUse(VIndex) // vindex 3621 .addUse(VOffset) // voffset 3622 .addUse(SOffset) // soffset 3623 .addImm(ImmOffset); // offset(imm) 3624 3625 if (IsTyped) 3626 MIB.addImm(Format); 3627 3628 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3629 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3630 .addMemOperand(MMO); 3631 3632 MI.eraseFromParent(); 3633 return true; 3634 } 3635 3636 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3637 MachineRegisterInfo &MRI, 3638 MachineIRBuilder &B, 3639 bool IsFormat, 3640 bool IsTyped) const { 3641 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
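  // Operand layout handled here, as far as I can tell (raw variants shown;
  // struct variants insert a vindex operand after rsrc, and typed variants
  // insert a format immediate before the aux operand):
  //   dst = G_INTRINSIC <id>, rsrc, [vindex,] voffset, soffset, [format,] aux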
3642 MachineMemOperand *MMO = *MI.memoperands_begin(); 3643 const int MemSize = MMO->getSize(); 3644 const LLT S32 = LLT::scalar(32); 3645 3646 Register Dst = MI.getOperand(0).getReg(); 3647 Register RSrc = MI.getOperand(2).getReg(); 3648 3649 // The typed intrinsics add an immediate after the registers. 3650 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3651 3652 // The struct intrinsic variants add one additional operand over raw. 3653 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3654 Register VIndex; 3655 int OpOffset = 0; 3656 if (HasVIndex) { 3657 VIndex = MI.getOperand(3).getReg(); 3658 OpOffset = 1; 3659 } 3660 3661 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3662 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3663 3664 unsigned Format = 0; 3665 if (IsTyped) { 3666 Format = MI.getOperand(5 + OpOffset).getImm(); 3667 ++OpOffset; 3668 } 3669 3670 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3671 unsigned ImmOffset; 3672 unsigned TotalOffset; 3673 3674 LLT Ty = MRI.getType(Dst); 3675 LLT EltTy = Ty.getScalarType(); 3676 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3677 const bool Unpacked = ST.hasUnpackedD16VMem(); 3678 3679 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3680 if (TotalOffset != 0) 3681 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3682 3683 unsigned Opc; 3684 3685 if (IsTyped) { 3686 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3687 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3688 } else if (IsFormat) { 3689 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3690 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3691 } else { 3692 switch (MemSize) { 3693 case 1: 3694 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3695 break; 3696 case 2: 3697 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3698 break; 3699 default: 3700 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3701 break; 3702 } 3703 } 3704 3705 Register LoadDstReg; 3706 3707 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3708 LLT UnpackedTy = Ty.changeElementSize(32); 3709 3710 if (IsExtLoad) 3711 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3712 else if (Unpacked && IsD16 && Ty.isVector()) 3713 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3714 else 3715 LoadDstReg = Dst; 3716 3717 if (!VIndex) 3718 VIndex = B.buildConstant(S32, 0).getReg(0); 3719 3720 auto MIB = B.buildInstr(Opc) 3721 .addDef(LoadDstReg) // vdata 3722 .addUse(RSrc) // rsrc 3723 .addUse(VIndex) // vindex 3724 .addUse(VOffset) // voffset 3725 .addUse(SOffset) // soffset 3726 .addImm(ImmOffset); // offset(imm) 3727 3728 if (IsTyped) 3729 MIB.addImm(Format); 3730 3731 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3732 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3733 .addMemOperand(MMO); 3734 3735 if (LoadDstReg != Dst) { 3736 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3737 3738 // Widen result for extending loads was widened. 
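    // That is, the load was emitted with a widened (or unpacked) 32-bit based
    // result; narrow it back here: truncate for extending loads, otherwise
    // unmerge into s32 pieces, truncate each piece to s16 and re-merge (see
    // the FIXME below about G_TRUNC of the whole vector).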
3739 if (IsExtLoad) 3740 B.buildTrunc(Dst, LoadDstReg); 3741 else { 3742 // Repack to original 16-bit vector result 3743 // FIXME: G_TRUNC should work, but legalization currently fails 3744 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3745 SmallVector<Register, 4> Repack; 3746 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3747 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3748 B.buildMerge(Dst, Repack); 3749 } 3750 } 3751 3752 MI.eraseFromParent(); 3753 return true; 3754 } 3755 3756 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3757 MachineIRBuilder &B, 3758 bool IsInc) const { 3759 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3760 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3761 B.buildInstr(Opc) 3762 .addDef(MI.getOperand(0).getReg()) 3763 .addUse(MI.getOperand(2).getReg()) 3764 .addUse(MI.getOperand(3).getReg()) 3765 .cloneMemRefs(MI); 3766 MI.eraseFromParent(); 3767 return true; 3768 } 3769 3770 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3771 switch (IntrID) { 3772 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3773 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3774 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3775 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3776 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3777 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3778 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3779 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3780 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3781 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3782 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3783 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3784 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3785 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3786 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3787 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3788 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3789 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3790 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3791 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3792 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3793 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3794 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3795 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3796 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3797 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3798 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3799 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3800 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3801 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3802 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3803 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3804 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3805 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3806 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3807 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3808 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3809 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3810 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3811 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3812 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3813 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3814 default: 3815 llvm_unreachable("unhandled atomic opcode"); 3816 } 3817 } 3818 3819 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3820 MachineIRBuilder &B, 3821 Intrinsic::ID IID) const { 3822 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3823 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3824 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3825 3826 Register Dst; 3827 3828 int OpOffset = 0; 3829 if (HasReturn) { 3830 // A few FP atomics do not support return values. 3831 Dst = MI.getOperand(0).getReg(); 3832 } else { 3833 OpOffset = -1; 3834 } 3835 3836 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3837 Register CmpVal; 3838 3839 if (IsCmpSwap) { 3840 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3841 ++OpOffset; 3842 } 3843 3844 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3845 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3846 3847 // The struct intrinsic variants add one additional operand over raw. 3848 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3849 Register VIndex; 3850 if (HasVIndex) { 3851 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3852 ++OpOffset; 3853 } 3854 3855 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3856 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3857 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3858 3859 MachineMemOperand *MMO = *MI.memoperands_begin(); 3860 3861 unsigned ImmOffset; 3862 unsigned TotalOffset; 3863 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3864 if (TotalOffset != 0) 3865 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3866 3867 if (!VIndex) 3868 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3869 3870 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3871 3872 if (HasReturn) 3873 MIB.addDef(Dst); 3874 3875 MIB.addUse(VData); // vdata 3876 3877 if (IsCmpSwap) 3878 MIB.addReg(CmpVal); 3879 3880 MIB.addUse(RSrc) // rsrc 3881 .addUse(VIndex) // vindex 3882 .addUse(VOffset) // voffset 3883 .addUse(SOffset) // soffset 3884 .addImm(ImmOffset) // offset(imm) 3885 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3886 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3887 .addMemOperand(MMO); 3888 3889 MI.eraseFromParent(); 3890 return true; 3891 } 3892 3893 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3894 /// vector with s16 typed elements. 3895 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3896 SmallVectorImpl<Register> &PackedAddrs, 3897 int AddrIdx, int DimIdx, int EndIdx, 3898 int NumGradients) { 3899 const LLT S16 = LLT::scalar(16); 3900 const LLT V2S16 = LLT::vector(2, 16); 3901 3902 for (int I = AddrIdx; I < EndIdx; ++I) { 3903 MachineOperand &SrcOp = MI.getOperand(I); 3904 if (!SrcOp.isReg()) 3905 continue; // _L to _LZ may have eliminated this. 3906 3907 Register AddrReg = SrcOp.getReg(); 3908 3909 if (I < DimIdx) { 3910 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3911 PackedAddrs.push_back(AddrReg); 3912 } else { 3913 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3914 // derivatives dx/dh and dx/dv are packed with undef. 
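      // Worked example of the pairing below: for a 2D gradient sample,
      // NumGradients == 4, so (dx/dh, dy/dh), (dx/dv, dy/dv) and (x, y) all
      // pair naturally. For 1D, NumGradients == 2 and NumGradients / 2 is
      // odd, so dx/dh and dx/dv are each paired with undef; for 3D the same
      // condition pairs dz/dh and dz/dv with undef.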
3915       if (((I + 1) >= EndIdx) ||
3916           ((NumGradients / 2) % 2 == 1 &&
3917            (I == DimIdx + (NumGradients / 2) - 1 ||
3918             I == DimIdx + NumGradients - 1)) ||
3919           // Check for _L to _LZ optimization
3920           !MI.getOperand(I + 1).isReg()) {
3921         PackedAddrs.push_back(
3922             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3923                 .getReg(0));
3924       } else {
3925         PackedAddrs.push_back(
3926             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3927                 .getReg(0));
3928         ++I;
3929       }
3930     }
3931   }
3932 }
3933
3934 /// Convert from separate vaddr components to a single vector address register,
3935 /// and replace the remaining operands with $noreg.
3936 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3937                                      int DimIdx, int NumVAddrs) {
3938   const LLT S32 = LLT::scalar(32);
3939
3940   SmallVector<Register, 8> AddrRegs;
3941   for (int I = 0; I != NumVAddrs; ++I) {
3942     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3943     if (SrcOp.isReg()) {
3944       AddrRegs.push_back(SrcOp.getReg());
3945       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3946     }
3947   }
3948
3949   int NumAddrRegs = AddrRegs.size();
3950   if (NumAddrRegs != 1) {
3951     // Round up to 8 elements for v5-v7
3952     // FIXME: Missing intermediate sized register classes and instructions.
3953     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3954       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3955       auto Undef = B.buildUndef(S32);
3956       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3957       NumAddrRegs = RoundedNumRegs;
3958     }
3959
3960     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3961     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3962   }
3963
3964   for (int I = 1; I != NumVAddrs; ++I) {
3965     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3966     if (SrcOp.isReg())
3967       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3968   }
3969 }
3970
3971 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3972 ///
3973 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3974 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3975 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3976 /// registers.
3977 ///
3978 /// We don't want to directly select image instructions just yet, but also want
3979 /// to expose all register repacking to the legalizer/combiners. We also don't
3980 /// want a selected instruction entering RegBankSelect. In order to avoid
3981 /// defining a multitude of intermediate image instructions, directly hack on
3982 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3983 /// now unnecessary arguments with $noreg.
3984 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3985     MachineInstr &MI, MachineIRBuilder &B,
3986     GISelChangeObserver &Observer,
3987     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3988
3989   const int NumDefs = MI.getNumExplicitDefs();
3990   bool IsTFE = NumDefs == 2;
3991   // We are only processing the operands of d16 image operations on subtargets
3992   // that use the unpacked register layout, or need to repack the TFE result.
3993
3994   // TODO: Do we need to guard against already legalized intrinsics?
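  // Rough outline of this function (a summary of the code below, not a
  // spec): pack 16-bit gradients/addresses into <2 x s16> pieces or a single
  // vaddr vector, fold _L -> _LZ and _mip with a zero lod, repack d16 data
  // for unpacked-d16 subtargets, and split the extra TFE dword off the
  // loaded data when a texture-fail result was requested.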
3995 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3996 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3997 3998 MachineRegisterInfo *MRI = B.getMRI(); 3999 const LLT S32 = LLT::scalar(32); 4000 const LLT S16 = LLT::scalar(16); 4001 const LLT V2S16 = LLT::vector(2, 16); 4002 4003 // Index of first address argument 4004 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 4005 4006 int NumVAddrs, NumGradients; 4007 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 4008 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 4009 getDMaskIdx(BaseOpcode, NumDefs); 4010 unsigned DMask = 0; 4011 4012 // Check for 16 bit addresses and pack if true. 4013 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 4014 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 4015 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 4016 const bool IsG16 = GradTy == S16; 4017 const bool IsA16 = AddrTy == S16; 4018 4019 int DMaskLanes = 0; 4020 if (!BaseOpcode->Atomic) { 4021 DMask = MI.getOperand(DMaskIdx).getImm(); 4022 if (BaseOpcode->Gather4) { 4023 DMaskLanes = 4; 4024 } else if (DMask != 0) { 4025 DMaskLanes = countPopulation(DMask); 4026 } else if (!IsTFE && !BaseOpcode->Store) { 4027 // If dmask is 0, this is a no-op load. This can be eliminated. 4028 B.buildUndef(MI.getOperand(0)); 4029 MI.eraseFromParent(); 4030 return true; 4031 } 4032 } 4033 4034 Observer.changingInstr(MI); 4035 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4036 4037 unsigned NewOpcode = NumDefs == 0 ? 4038 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4039 4040 // Track that we legalized this 4041 MI.setDesc(B.getTII().get(NewOpcode)); 4042 4043 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4044 // dmask to be at least 1 otherwise the instruction will fail 4045 if (IsTFE && DMask == 0) { 4046 DMask = 0x1; 4047 DMaskLanes = 1; 4048 MI.getOperand(DMaskIdx).setImm(DMask); 4049 } 4050 4051 if (BaseOpcode->Atomic) { 4052 Register VData0 = MI.getOperand(2).getReg(); 4053 LLT Ty = MRI->getType(VData0); 4054 4055 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4056 if (Ty.isVector()) 4057 return false; 4058 4059 if (BaseOpcode->AtomicX2) { 4060 Register VData1 = MI.getOperand(3).getReg(); 4061 // The two values are packed in one register. 4062 LLT PackedTy = LLT::vector(2, Ty); 4063 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4064 MI.getOperand(2).setReg(Concat.getReg(0)); 4065 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4066 } 4067 } 4068 4069 int CorrectedNumVAddrs = NumVAddrs; 4070 4071 // Optimize _L to _LZ when _L is zero 4072 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4073 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 4074 const ConstantFP *ConstantLod; 4075 const int LodIdx = AddrIdx + NumVAddrs - 1; 4076 4077 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 4078 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4079 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4080 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 4081 LZMappingInfo->LZ, ImageDimIntr->Dim); 4082 4083 // The starting indexes should remain in the same place. 
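        // e.g. an image.sample.l with a constant lod of 0.0 (or a negative
        // lod) becomes image.sample.lz and the lod operand is dropped, so
        // the address operand count shrinks by one below.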
4084 --NumVAddrs; 4085 --CorrectedNumVAddrs; 4086 4087 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 4088 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 4089 MI.RemoveOperand(LodIdx); 4090 } 4091 } 4092 } 4093 4094 // Optimize _mip away, when 'lod' is zero 4095 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 4096 int64_t ConstantLod; 4097 const int LodIdx = AddrIdx + NumVAddrs - 1; 4098 4099 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 4100 if (ConstantLod == 0) { 4101 // TODO: Change intrinsic opcode and remove operand instead or replacing 4102 // it with 0, as the _L to _LZ handling is done above. 4103 MI.getOperand(LodIdx).ChangeToImmediate(0); 4104 --CorrectedNumVAddrs; 4105 } 4106 } 4107 } 4108 4109 // Rewrite the addressing register layout before doing anything else. 4110 if (IsA16 || IsG16) { 4111 if (IsA16) { 4112 // Target must support the feature and gradients need to be 16 bit too 4113 if (!ST.hasA16() || !IsG16) 4114 return false; 4115 } else if (!ST.hasG16()) 4116 return false; 4117 4118 if (NumVAddrs > 1) { 4119 SmallVector<Register, 4> PackedRegs; 4120 // Don't compress addresses for G16 4121 const int PackEndIdx = 4122 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4123 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4124 PackEndIdx, NumGradients); 4125 4126 if (!IsA16) { 4127 // Add uncompressed address 4128 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4129 int AddrReg = MI.getOperand(I).getReg(); 4130 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4131 PackedRegs.push_back(AddrReg); 4132 } 4133 } 4134 4135 // See also below in the non-a16 branch 4136 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4137 4138 if (!UseNSA && PackedRegs.size() > 1) { 4139 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4140 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4141 PackedRegs[0] = Concat.getReg(0); 4142 PackedRegs.resize(1); 4143 } 4144 4145 const int NumPacked = PackedRegs.size(); 4146 for (int I = 0; I != NumVAddrs; ++I) { 4147 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4148 if (!SrcOp.isReg()) { 4149 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4150 continue; 4151 } 4152 4153 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4154 4155 if (I < NumPacked) 4156 SrcOp.setReg(PackedRegs[I]); 4157 else 4158 SrcOp.setReg(AMDGPU::NoRegister); 4159 } 4160 } 4161 } else { 4162 // If the register allocator cannot place the address registers contiguously 4163 // without introducing moves, then using the non-sequential address encoding 4164 // is always preferable, since it saves VALU instructions and is usually a 4165 // wash in terms of code size or even better. 4166 // 4167 // However, we currently have no way of hinting to the register allocator 4168 // that MIMG addresses should be placed contiguously when it is possible to 4169 // do so, so force non-NSA for the common 2-address case as a heuristic. 4170 // 4171 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4172 // allocation when possible. 4173 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4174 4175 if (!UseNSA && NumVAddrs > 1) 4176 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4177 } 4178 4179 int Flags = 0; 4180 if (IsA16) 4181 Flags |= 1; 4182 if (IsG16) 4183 Flags |= 2; 4184 MI.addOperand(MachineOperand::CreateImm(Flags)); 4185 4186 if (BaseOpcode->Store) { // No TFE for stores? 
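    // For stores, only a d16 payload may need rewriting, and only on
    // subtargets using the unpacked d16 layout; handleD16VData returns the
    // original register unchanged otherwise.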
4187     // TODO: Handle dmask trim
4188     Register VData = MI.getOperand(1).getReg();
4189     LLT Ty = MRI->getType(VData);
4190     if (!Ty.isVector() || Ty.getElementType() != S16)
4191       return true;
4192
4193     Register RepackedReg = handleD16VData(B, *MRI, VData);
4194     if (RepackedReg != VData) {
4195       MI.getOperand(1).setReg(RepackedReg);
4196     }
4197
4198     return true;
4199   }
4200
4201   Register DstReg = MI.getOperand(0).getReg();
4202   LLT Ty = MRI->getType(DstReg);
4203   const LLT EltTy = Ty.getScalarType();
4204   const bool IsD16 = Ty.getScalarType() == S16;
4205   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4206
4207   // Confirm that the return type is large enough for the dmask specified
4208   if (NumElts < DMaskLanes)
4209     return false;
4210
4211   if (NumElts > 4 || DMaskLanes > 4)
4212     return false;
4213
4214   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4215   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4216
4217   // The raw dword aligned data component of the load. The only legal cases
4218   // where this matters should be when using the packed D16 format, for
4219   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4220   LLT RoundedTy;
4221
4222   // S32 vector to cover all data, plus TFE result element.
4223   LLT TFETy;
4224
4225   // Register type to use for each loaded component. Will be S32 or V2S16.
4226   LLT RegTy;
4227
4228   if (IsD16 && ST.hasUnpackedD16VMem()) {
4229     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4230     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4231     RegTy = S32;
4232   } else {
4233     unsigned EltSize = EltTy.getSizeInBits();
4234     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4235     unsigned RoundedSize = 32 * RoundedElts;
4236     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4237     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4238     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4239   }
4240
4241   // The return type does not need adjustment.
4242   // TODO: Should we change s16 case to s32 or <2 x s16>?
4243   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4244     return true;
4245
4246   Register Dst1Reg;
4247
4248   // Insert after the instruction.
4249   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4250
4251   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4252   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4253   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4254   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4255
4256   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4257
4258   MI.getOperand(0).setReg(NewResultReg);
4259
4260   // In the IR, TFE is supposed to be used with a 2 element struct return
4261   // type. The instruction really returns these two values in one contiguous
4262   // register, with one additional dword beyond the loaded data. Rewrite the
4263   // return type to use a single register result.
4264
4265   if (IsTFE) {
4266     Dst1Reg = MI.getOperand(1).getReg();
4267     if (MRI->getType(Dst1Reg) != S32)
4268       return false;
4269
4270     // TODO: Make sure the TFE operand bit is set.
4271     MI.RemoveOperand(1);
4272
4273     // Handle the easy case that requires no repack instructions.
4274     if (Ty == S32) {
4275       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4276       return true;
4277     }
4278   }
4279
4280   // Now figure out how to copy the new result register back into the old
4281   // result.
4282   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4283
4284   const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4285 4286 if (ResultNumRegs == 1) { 4287 assert(!IsTFE); 4288 ResultRegs[0] = NewResultReg; 4289 } else { 4290 // We have to repack into a new vector of some kind. 4291 for (int I = 0; I != NumDataRegs; ++I) 4292 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4293 B.buildUnmerge(ResultRegs, NewResultReg); 4294 4295 // Drop the final TFE element to get the data part. The TFE result is 4296 // directly written to the right place already. 4297 if (IsTFE) 4298 ResultRegs.resize(NumDataRegs); 4299 } 4300 4301 // For an s16 scalar result, we form an s32 result with a truncate regardless 4302 // of packed vs. unpacked. 4303 if (IsD16 && !Ty.isVector()) { 4304 B.buildTrunc(DstReg, ResultRegs[0]); 4305 return true; 4306 } 4307 4308 // Avoid a build/concat_vector of 1 entry. 4309 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4310 B.buildBitcast(DstReg, ResultRegs[0]); 4311 return true; 4312 } 4313 4314 assert(Ty.isVector()); 4315 4316 if (IsD16) { 4317 // For packed D16 results with TFE enabled, all the data components are 4318 // S32. Cast back to the expected type. 4319 // 4320 // TODO: We don't really need to use load s32 elements. We would only need one 4321 // cast for the TFE result if a multiple of v2s16 was used. 4322 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4323 for (Register &Reg : ResultRegs) 4324 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4325 } else if (ST.hasUnpackedD16VMem()) { 4326 for (Register &Reg : ResultRegs) 4327 Reg = B.buildTrunc(S16, Reg).getReg(0); 4328 } 4329 } 4330 4331 auto padWithUndef = [&](LLT Ty, int NumElts) { 4332 if (NumElts == 0) 4333 return; 4334 Register Undef = B.buildUndef(Ty).getReg(0); 4335 for (int I = 0; I != NumElts; ++I) 4336 ResultRegs.push_back(Undef); 4337 }; 4338 4339 // Pad out any elements eliminated due to the dmask. 4340 LLT ResTy = MRI->getType(ResultRegs[0]); 4341 if (!ResTy.isVector()) { 4342 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4343 B.buildBuildVector(DstReg, ResultRegs); 4344 return true; 4345 } 4346 4347 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4348 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4349 4350 // Deal with the one annoying legal case. 4351 const LLT V3S16 = LLT::vector(3, 16); 4352 if (Ty == V3S16) { 4353 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4354 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4355 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4356 return true; 4357 } 4358 4359 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4360 B.buildConcatVectors(DstReg, ResultRegs); 4361 return true; 4362 } 4363 4364 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4365 LegalizerHelper &Helper, MachineInstr &MI) const { 4366 MachineIRBuilder &B = Helper.MIRBuilder; 4367 GISelChangeObserver &Observer = Helper.Observer; 4368 4369 Register Dst = MI.getOperand(0).getReg(); 4370 LLT Ty = B.getMRI()->getType(Dst); 4371 unsigned Size = Ty.getSizeInBits(); 4372 MachineFunction &MF = B.getMF(); 4373 4374 Observer.changingInstr(MI); 4375 4376 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4377 Ty = getBitcastRegisterType(Ty); 4378 Helper.bitcastDst(MI, Ty, 0); 4379 Dst = MI.getOperand(0).getReg(); 4380 B.setInsertPt(B.getMBB(), MI); 4381 } 4382 4383 // FIXME: We don't really need this intermediate instruction. The intrinsic 4384 // should be fixed to have a memory operand. Since it's readnone, we're not 4385 // allowed to add one. 
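  // Example of the adjustments made below (derived from the code, not from
  // documented ABI): an s96 or <3 x s32> result gets a 12-byte, 4-aligned
  // memory operand attached and is then widened to s128 / <4 x s32>, since
  // there is no 96-bit scalar load.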
4386 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4387 MI.RemoveOperand(1); // Remove intrinsic ID 4388 4389 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4390 // TODO: Should this use datalayout alignment? 4391 const unsigned MemSize = (Size + 7) / 8; 4392 const Align MemAlign(4); 4393 MachineMemOperand *MMO = MF.getMachineMemOperand( 4394 MachinePointerInfo(), 4395 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4396 MachineMemOperand::MOInvariant, 4397 MemSize, MemAlign); 4398 MI.addMemOperand(MF, MMO); 4399 4400 // There are no 96-bit result scalar loads, but widening to 128-bit should 4401 // always be legal. We may need to restore this to a 96-bit result if it turns 4402 // out this needs to be converted to a vector load during RegBankSelect. 4403 if (!isPowerOf2_32(Size)) { 4404 if (Ty.isVector()) 4405 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4406 else 4407 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4408 } 4409 4410 Observer.changedInstr(MI); 4411 return true; 4412 } 4413 4414 // TODO: Move to selection 4415 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4416 MachineRegisterInfo &MRI, 4417 MachineIRBuilder &B) const { 4418 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4419 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4420 !ST.isTrapHandlerEnabled()) { 4421 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4422 } else { 4423 // Pass queue pointer to trap handler as input, and insert trap instruction 4424 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4425 MachineRegisterInfo &MRI = *B.getMRI(); 4426 4427 Register LiveIn = 4428 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4429 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4430 return false; 4431 4432 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4433 B.buildCopy(SGPR01, LiveIn); 4434 B.buildInstr(AMDGPU::S_TRAP) 4435 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4436 .addReg(SGPR01, RegState::Implicit); 4437 } 4438 4439 MI.eraseFromParent(); 4440 return true; 4441 } 4442 4443 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4444 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4445 // Is non-HSA path or trap-handler disabled? then, report a warning 4446 // accordingly 4447 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4448 !ST.isTrapHandlerEnabled()) { 4449 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4450 "debugtrap handler not supported", 4451 MI.getDebugLoc(), DS_Warning); 4452 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4453 Ctx.diagnose(NoTrap); 4454 } else { 4455 // Insert debug-trap instruction 4456 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4457 } 4458 4459 MI.eraseFromParent(); 4460 return true; 4461 } 4462 4463 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4464 MachineInstr &MI) const { 4465 MachineIRBuilder &B = Helper.MIRBuilder; 4466 MachineRegisterInfo &MRI = *B.getMRI(); 4467 4468 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
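  // Sketch of the control-flow intrinsic rewrite below (my reading of the
  // amdgcn.if case; amdgcn.else and amdgcn.loop are analogous):
  //   %flag, %mask = llvm.amdgcn.if(%cond)
  //   G_BRCOND %flag, %bb.A
  //   G_BR %bb.B                   ; possibly just a fallthrough
  // becomes
  //   %mask = SI_IF %cond, %bb.B   ; EXEC-manipulating branch pseudo
  //   G_BR %bb.A
  // which is why a missing fallthrough G_BR has to be materialized when the
  // targets are swapped.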
4469 auto IntrID = MI.getIntrinsicID(); 4470 switch (IntrID) { 4471 case Intrinsic::amdgcn_if: 4472 case Intrinsic::amdgcn_else: { 4473 MachineInstr *Br = nullptr; 4474 MachineBasicBlock *UncondBrTarget = nullptr; 4475 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4476 const SIRegisterInfo *TRI 4477 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4478 4479 Register Def = MI.getOperand(1).getReg(); 4480 Register Use = MI.getOperand(3).getReg(); 4481 4482 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4483 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4484 if (IntrID == Intrinsic::amdgcn_if) { 4485 B.buildInstr(AMDGPU::SI_IF) 4486 .addDef(Def) 4487 .addUse(Use) 4488 .addMBB(UncondBrTarget); 4489 } else { 4490 B.buildInstr(AMDGPU::SI_ELSE) 4491 .addDef(Def) 4492 .addUse(Use) 4493 .addMBB(UncondBrTarget) 4494 .addImm(0); 4495 } 4496 4497 if (Br) { 4498 Br->getOperand(0).setMBB(CondBrTarget); 4499 } else { 4500 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4501 // since we're swapping branch targets it needs to be reinserted. 4502 // FIXME: IRTranslator should probably not do this 4503 B.buildBr(*CondBrTarget); 4504 } 4505 4506 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4507 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4508 MI.eraseFromParent(); 4509 BrCond->eraseFromParent(); 4510 return true; 4511 } 4512 4513 return false; 4514 } 4515 case Intrinsic::amdgcn_loop: { 4516 MachineInstr *Br = nullptr; 4517 MachineBasicBlock *UncondBrTarget = nullptr; 4518 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4519 const SIRegisterInfo *TRI 4520 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4521 4522 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4523 Register Reg = MI.getOperand(2).getReg(); 4524 4525 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4526 B.buildInstr(AMDGPU::SI_LOOP) 4527 .addUse(Reg) 4528 .addMBB(UncondBrTarget); 4529 4530 if (Br) 4531 Br->getOperand(0).setMBB(CondBrTarget); 4532 else 4533 B.buildBr(*CondBrTarget); 4534 4535 MI.eraseFromParent(); 4536 BrCond->eraseFromParent(); 4537 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4538 return true; 4539 } 4540 4541 return false; 4542 } 4543 case Intrinsic::amdgcn_kernarg_segment_ptr: 4544 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4545 // This only makes sense to call in a kernel, so just lower to null. 
4546 B.buildConstant(MI.getOperand(0).getReg(), 0); 4547 MI.eraseFromParent(); 4548 return true; 4549 } 4550 4551 return legalizePreloadedArgIntrin( 4552 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4553 case Intrinsic::amdgcn_implicitarg_ptr: 4554 return legalizeImplicitArgPtr(MI, MRI, B); 4555 case Intrinsic::amdgcn_workitem_id_x: 4556 return legalizePreloadedArgIntrin(MI, MRI, B, 4557 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4558 case Intrinsic::amdgcn_workitem_id_y: 4559 return legalizePreloadedArgIntrin(MI, MRI, B, 4560 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4561 case Intrinsic::amdgcn_workitem_id_z: 4562 return legalizePreloadedArgIntrin(MI, MRI, B, 4563 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4564 case Intrinsic::amdgcn_workgroup_id_x: 4565 return legalizePreloadedArgIntrin(MI, MRI, B, 4566 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4567 case Intrinsic::amdgcn_workgroup_id_y: 4568 return legalizePreloadedArgIntrin(MI, MRI, B, 4569 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4570 case Intrinsic::amdgcn_workgroup_id_z: 4571 return legalizePreloadedArgIntrin(MI, MRI, B, 4572 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4573 case Intrinsic::amdgcn_dispatch_ptr: 4574 return legalizePreloadedArgIntrin(MI, MRI, B, 4575 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4576 case Intrinsic::amdgcn_queue_ptr: 4577 return legalizePreloadedArgIntrin(MI, MRI, B, 4578 AMDGPUFunctionArgInfo::QUEUE_PTR); 4579 case Intrinsic::amdgcn_implicit_buffer_ptr: 4580 return legalizePreloadedArgIntrin( 4581 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4582 case Intrinsic::amdgcn_dispatch_id: 4583 return legalizePreloadedArgIntrin(MI, MRI, B, 4584 AMDGPUFunctionArgInfo::DISPATCH_ID); 4585 case Intrinsic::amdgcn_fdiv_fast: 4586 return legalizeFDIVFastIntrin(MI, MRI, B); 4587 case Intrinsic::amdgcn_is_shared: 4588 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4589 case Intrinsic::amdgcn_is_private: 4590 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4591 case Intrinsic::amdgcn_wavefrontsize: { 4592 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4593 MI.eraseFromParent(); 4594 return true; 4595 } 4596 case Intrinsic::amdgcn_s_buffer_load: 4597 return legalizeSBufferLoad(Helper, MI); 4598 case Intrinsic::amdgcn_raw_buffer_store: 4599 case Intrinsic::amdgcn_struct_buffer_store: 4600 return legalizeBufferStore(MI, MRI, B, false, false); 4601 case Intrinsic::amdgcn_raw_buffer_store_format: 4602 case Intrinsic::amdgcn_struct_buffer_store_format: 4603 return legalizeBufferStore(MI, MRI, B, false, true); 4604 case Intrinsic::amdgcn_raw_tbuffer_store: 4605 case Intrinsic::amdgcn_struct_tbuffer_store: 4606 return legalizeBufferStore(MI, MRI, B, true, true); 4607 case Intrinsic::amdgcn_raw_buffer_load: 4608 case Intrinsic::amdgcn_struct_buffer_load: 4609 return legalizeBufferLoad(MI, MRI, B, false, false); 4610 case Intrinsic::amdgcn_raw_buffer_load_format: 4611 case Intrinsic::amdgcn_struct_buffer_load_format: 4612 return legalizeBufferLoad(MI, MRI, B, true, false); 4613 case Intrinsic::amdgcn_raw_tbuffer_load: 4614 case Intrinsic::amdgcn_struct_tbuffer_load: 4615 return legalizeBufferLoad(MI, MRI, B, true, true); 4616 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4617 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4618 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4619 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4620 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4621 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4622 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4623 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4624 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4625 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4626 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4627 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4628 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4629 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4630 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4631 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4632 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4633 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4634 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4635 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4636 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4637 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4638 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4639 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4640 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4641 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4642 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4643 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4644 return legalizeBufferAtomic(MI, B, IntrID); 4645 case Intrinsic::amdgcn_atomic_inc: 4646 return legalizeAtomicIncDec(MI, B, true); 4647 case Intrinsic::amdgcn_atomic_dec: 4648 return legalizeAtomicIncDec(MI, B, false); 4649 case Intrinsic::trap: 4650 return legalizeTrapIntrinsic(MI, MRI, B); 4651 case Intrinsic::debugtrap: 4652 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4653 case Intrinsic::amdgcn_rsq_clamp: 4654 return legalizeRsqClampIntrinsic(MI, MRI, B); 4655 case Intrinsic::amdgcn_ds_fadd: 4656 case Intrinsic::amdgcn_ds_fmin: 4657 case Intrinsic::amdgcn_ds_fmax: 4658 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4659 default: { 4660 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4661 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4662 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4663 return true; 4664 } 4665 } 4666 4667 return true; 4668 } 4669