//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x
/// s16>. This excludes s1 vectors, which should always be scalarized.
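///
/// For example, <3 x s16> (48 bits) qualifies: it has an odd number of
/// elements, an element type wider than s1 but narrower than s32, and a total
/// size that is not a multiple of 32 bits. <3 x s32> and <3 x s1> do not.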
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 111 }; 112 } 113 114 // Increase the number of vector elements to reach the next multiple of 32-bit 115 // type. 116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 117 return [=](const LegalityQuery &Query) { 118 const LLT Ty = Query.Types[TypeIdx]; 119 120 const LLT EltTy = Ty.getElementType(); 121 const int Size = Ty.getSizeInBits(); 122 const int EltSize = EltTy.getSizeInBits(); 123 const int NextMul32 = (Size + 31) / 32; 124 125 assert(EltSize < 32); 126 127 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 128 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 129 }; 130 } 131 132 static LLT getBitcastRegisterType(const LLT Ty) { 133 const unsigned Size = Ty.getSizeInBits(); 134 135 LLT CoercedTy; 136 if (Size <= 32) { 137 // <2 x s8> -> s16 138 // <4 x s8> -> s32 139 return LLT::scalar(Size); 140 } 141 142 return LLT::scalarOrVector(Size / 32, 32); 143 } 144 145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 146 return [=](const LegalityQuery &Query) { 147 const LLT Ty = Query.Types[TypeIdx]; 148 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 149 }; 150 } 151 152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 unsigned Size = Ty.getSizeInBits(); 156 assert(Size % 32 == 0); 157 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 158 }; 159 } 160 161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 162 return [=](const LegalityQuery &Query) { 163 const LLT QueryTy = Query.Types[TypeIdx]; 164 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 165 }; 166 } 167 168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 169 return [=](const LegalityQuery &Query) { 170 const LLT QueryTy = Query.Types[TypeIdx]; 171 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 172 }; 
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and whether the memory is invariant or not
    // written in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
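///
/// For example, an odd-sized 48-bit load with 64-bit alignment may be widened
/// to a 64-bit load: the extra bytes are known dereferenceable up to the
/// alignment, provided the target reports the widened unaligned access as
/// fast.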
366 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 367 unsigned AlignInBits, unsigned AddrSpace, 368 unsigned Opcode) { 369 // We don't want to widen cases that are naturally legal. 370 if (isPowerOf2_32(SizeInBits)) 371 return false; 372 373 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 374 // end up widening these for a scalar load during RegBankSelect, since there 375 // aren't 96-bit scalar loads. 376 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 377 return false; 378 379 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 380 return false; 381 382 // A load is known dereferenceable up to the alignment, so it's legal to widen 383 // to it. 384 // 385 // TODO: Could check dereferenceable for less aligned cases. 386 unsigned RoundedSize = NextPowerOf2(SizeInBits); 387 if (AlignInBits < RoundedSize) 388 return false; 389 390 // Do not widen if it would introduce a slow unaligned load. 391 const SITargetLowering *TLI = ST.getTargetLowering(); 392 bool Fast = false; 393 return TLI->allowsMisalignedMemoryAccessesImpl( 394 RoundedSize, AddrSpace, Align(AlignInBits / 8), 395 MachineMemOperand::MOLoad, &Fast) && 396 Fast; 397 } 398 399 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 400 unsigned Opcode) { 401 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 402 return false; 403 404 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 405 Query.MMODescrs[0].AlignInBits, 406 Query.Types[1].getAddressSpace(), Opcode); 407 } 408 409 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 410 const GCNTargetMachine &TM) 411 : ST(ST_) { 412 using namespace TargetOpcode; 413 414 auto GetAddrSpacePtr = [&TM](unsigned AS) { 415 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 416 }; 417 418 const LLT S1 = LLT::scalar(1); 419 const LLT S16 = LLT::scalar(16); 420 const LLT S32 = LLT::scalar(32); 421 const LLT S64 = LLT::scalar(64); 422 const LLT S128 = LLT::scalar(128); 423 const LLT S256 = LLT::scalar(256); 424 const LLT S512 = LLT::scalar(512); 425 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 426 427 const LLT V2S16 = LLT::vector(2, 16); 428 const LLT V4S16 = LLT::vector(4, 16); 429 430 const LLT V2S32 = LLT::vector(2, 32); 431 const LLT V3S32 = LLT::vector(3, 32); 432 const LLT V4S32 = LLT::vector(4, 32); 433 const LLT V5S32 = LLT::vector(5, 32); 434 const LLT V6S32 = LLT::vector(6, 32); 435 const LLT V7S32 = LLT::vector(7, 32); 436 const LLT V8S32 = LLT::vector(8, 32); 437 const LLT V9S32 = LLT::vector(9, 32); 438 const LLT V10S32 = LLT::vector(10, 32); 439 const LLT V11S32 = LLT::vector(11, 32); 440 const LLT V12S32 = LLT::vector(12, 32); 441 const LLT V13S32 = LLT::vector(13, 32); 442 const LLT V14S32 = LLT::vector(14, 32); 443 const LLT V15S32 = LLT::vector(15, 32); 444 const LLT V16S32 = LLT::vector(16, 32); 445 const LLT V32S32 = LLT::vector(32, 32); 446 447 const LLT V2S64 = LLT::vector(2, 64); 448 const LLT V3S64 = LLT::vector(3, 64); 449 const LLT V4S64 = LLT::vector(4, 64); 450 const LLT V5S64 = LLT::vector(5, 64); 451 const LLT V6S64 = LLT::vector(6, 64); 452 const LLT V7S64 = LLT::vector(7, 64); 453 const LLT V8S64 = LLT::vector(8, 64); 454 const LLT V16S64 = LLT::vector(16, 64); 455 456 std::initializer_list<LLT> AllS32Vectors = 457 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 458 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 459 std::initializer_list<LLT> AllS64Vectors = 460 {V2S64, V3S64, V4S64, V5S64, V6S64, 
V7S64, V8S64, V16S64}; 461 462 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 463 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 464 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 465 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 466 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 467 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 468 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 469 470 const LLT CodePtr = FlatPtr; 471 472 const std::initializer_list<LLT> AddrSpaces64 = { 473 GlobalPtr, ConstantPtr, FlatPtr 474 }; 475 476 const std::initializer_list<LLT> AddrSpaces32 = { 477 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 478 }; 479 480 const std::initializer_list<LLT> FPTypesBase = { 481 S32, S64 482 }; 483 484 const std::initializer_list<LLT> FPTypes16 = { 485 S32, S64, S16 486 }; 487 488 const std::initializer_list<LLT> FPTypesPK16 = { 489 S32, S64, S16, V2S16 490 }; 491 492 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 493 494 setAction({G_BRCOND, S1}, Legal); // VCC branches 495 setAction({G_BRCOND, S32}, Legal); // SCC branches 496 497 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 498 // elements for v3s16 499 getActionDefinitionsBuilder(G_PHI) 500 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 501 .legalFor(AllS32Vectors) 502 .legalFor(AllS64Vectors) 503 .legalFor(AddrSpaces64) 504 .legalFor(AddrSpaces32) 505 .legalIf(isPointer(0)) 506 .clampScalar(0, S16, S256) 507 .widenScalarToNextPow2(0, 32) 508 .clampMaxNumElements(0, S32, 16) 509 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 510 .scalarize(0); 511 512 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 513 // Full set of gfx9 features. 514 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 515 .legalFor({S32, S16, V2S16}) 516 .clampScalar(0, S16, S32) 517 .clampMaxNumElements(0, S16, 2) 518 .scalarize(0) 519 .widenScalarToNextPow2(0, 32); 520 521 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 522 .legalFor({S32, S16, V2S16}) // Clamp modifier 523 .minScalarOrElt(0, S16) 524 .clampMaxNumElements(0, S16, 2) 525 .scalarize(0) 526 .widenScalarToNextPow2(0, 32) 527 .lower(); 528 } else if (ST.has16BitInsts()) { 529 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 530 .legalFor({S32, S16}) 531 .clampScalar(0, S16, S32) 532 .scalarize(0) 533 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 534 535 // Technically the saturating operations require clamp bit support, but this 536 // was introduced at the same time as 16-bit operations. 537 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 538 .legalFor({S32, S16}) // Clamp modifier 539 .minScalar(0, S16) 540 .scalarize(0) 541 .widenScalarToNextPow2(0, 16) 542 .lower(); 543 544 // We're just lowering this, but it helps get a better result to try to 545 // coerce to the desired type first. 546 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 547 .minScalar(0, S16) 548 .scalarize(0) 549 .lower(); 550 } else { 551 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 552 .legalFor({S32}) 553 .clampScalar(0, S32, S32) 554 .scalarize(0); 555 556 if (ST.hasIntClamp()) { 557 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 558 .legalFor({S32}) // Clamp modifier. 559 .scalarize(0) 560 .minScalarOrElt(0, S32) 561 .lower(); 562 } else { 563 // Clamp bit support was added in VI, along with 16-bit operations. 
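      // Without the clamp bit there is no native saturating add/sub, so fall
      // back to the generic lowering.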
564 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 565 .minScalar(0, S32) 566 .scalarize(0) 567 .lower(); 568 } 569 570 // FIXME: DAG expansion gets better results. The widening uses the smaller 571 // range values and goes for the min/max lowering directly. 572 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 573 .minScalar(0, S32) 574 .scalarize(0) 575 .lower(); 576 } 577 578 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 579 .customFor({S32, S64}) 580 .clampScalar(0, S32, S64) 581 .widenScalarToNextPow2(0, 32) 582 .scalarize(0); 583 584 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 585 .legalFor({S32}) 586 .clampScalar(0, S32, S32) 587 .scalarize(0); 588 589 // Report legal for any types we can handle anywhere. For the cases only legal 590 // on the SALU, RegBankSelect will be able to re-legalize. 591 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 592 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 593 .clampScalar(0, S32, S64) 594 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 595 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 596 .widenScalarToNextPow2(0) 597 .scalarize(0); 598 599 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 600 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 601 .legalFor({{S32, S1}, {S32, S32}}) 602 .minScalar(0, S32) 603 // TODO: .scalarize(0) 604 .lower(); 605 606 getActionDefinitionsBuilder(G_BITCAST) 607 // Don't worry about the size constraint. 608 .legalIf(all(isRegisterType(0), isRegisterType(1))) 609 .lower(); 610 611 612 getActionDefinitionsBuilder(G_CONSTANT) 613 .legalFor({S1, S32, S64, S16, GlobalPtr, 614 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 615 .legalIf(isPointer(0)) 616 .clampScalar(0, S32, S64) 617 .widenScalarToNextPow2(0); 618 619 getActionDefinitionsBuilder(G_FCONSTANT) 620 .legalFor({S32, S64, S16}) 621 .clampScalar(0, S16, S64); 622 623 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 624 .legalIf(isRegisterType(0)) 625 // s1 and s16 are special cases because they have legal operations on 626 // them, but don't really occupy registers in the normal way. 627 .legalFor({S1, S16}) 628 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 629 .clampScalarOrElt(0, S32, MaxScalar) 630 .widenScalarToNextPow2(0, 32) 631 .clampMaxNumElements(0, S32, 16); 632 633 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 634 635 // If the amount is divergent, we have to do a wave reduction to get the 636 // maximum value, so this is expanded during RegBankSelect. 
637 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 638 .legalFor({{PrivatePtr, S32}}); 639 640 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 641 .customIf(typeIsNot(0, PrivatePtr)); 642 643 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 644 645 auto &FPOpActions = getActionDefinitionsBuilder( 646 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 647 .legalFor({S32, S64}); 648 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 649 .customFor({S32, S64}); 650 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 651 .customFor({S32, S64}); 652 653 if (ST.has16BitInsts()) { 654 if (ST.hasVOP3PInsts()) 655 FPOpActions.legalFor({S16, V2S16}); 656 else 657 FPOpActions.legalFor({S16}); 658 659 TrigActions.customFor({S16}); 660 FDIVActions.customFor({S16}); 661 } 662 663 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 664 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 665 666 if (ST.hasVOP3PInsts()) { 667 MinNumMaxNum.customFor(FPTypesPK16) 668 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 669 .clampMaxNumElements(0, S16, 2) 670 .clampScalar(0, S16, S64) 671 .scalarize(0); 672 } else if (ST.has16BitInsts()) { 673 MinNumMaxNum.customFor(FPTypes16) 674 .clampScalar(0, S16, S64) 675 .scalarize(0); 676 } else { 677 MinNumMaxNum.customFor(FPTypesBase) 678 .clampScalar(0, S32, S64) 679 .scalarize(0); 680 } 681 682 if (ST.hasVOP3PInsts()) 683 FPOpActions.clampMaxNumElements(0, S16, 2); 684 685 FPOpActions 686 .scalarize(0) 687 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 688 689 TrigActions 690 .scalarize(0) 691 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 692 693 FDIVActions 694 .scalarize(0) 695 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 696 697 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 698 .legalFor(FPTypesPK16) 699 .clampMaxNumElements(0, S16, 2) 700 .scalarize(0) 701 .clampScalar(0, S16, S64); 702 703 if (ST.has16BitInsts()) { 704 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 705 .legalFor({S32, S64, S16}) 706 .scalarize(0) 707 .clampScalar(0, S16, S64); 708 } else { 709 getActionDefinitionsBuilder(G_FSQRT) 710 .legalFor({S32, S64}) 711 .scalarize(0) 712 .clampScalar(0, S32, S64); 713 714 if (ST.hasFractBug()) { 715 getActionDefinitionsBuilder(G_FFLOOR) 716 .customFor({S64}) 717 .legalFor({S32, S64}) 718 .scalarize(0) 719 .clampScalar(0, S32, S64); 720 } else { 721 getActionDefinitionsBuilder(G_FFLOOR) 722 .legalFor({S32, S64}) 723 .scalarize(0) 724 .clampScalar(0, S32, S64); 725 } 726 } 727 728 getActionDefinitionsBuilder(G_FPTRUNC) 729 .legalFor({{S32, S64}, {S16, S32}}) 730 .scalarize(0) 731 .lower(); 732 733 getActionDefinitionsBuilder(G_FPEXT) 734 .legalFor({{S64, S32}, {S32, S16}}) 735 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 736 .scalarize(0); 737 738 getActionDefinitionsBuilder(G_FSUB) 739 // Use actual fsub instruction 740 .legalFor({S32}) 741 // Must use fadd + fneg 742 .lowerFor({S64, S16, V2S16}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 746 // Whether this is legal depends on the floating point mode for the function. 
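  // (G_FMAD is therefore marked custom below; the custom lowering,
  // legalizeFMad, makes that per-function decision.)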
747 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 748 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 749 FMad.customFor({S32, S16}); 750 else if (ST.hasMadMacF32Insts()) 751 FMad.customFor({S32}); 752 else if (ST.hasMadF16()) 753 FMad.customFor({S16}); 754 FMad.scalarize(0) 755 .lower(); 756 757 auto &FRem = getActionDefinitionsBuilder(G_FREM); 758 if (ST.has16BitInsts()) { 759 FRem.customFor({S16, S32, S64}); 760 } else { 761 FRem.minScalar(0, S32) 762 .customFor({S32, S64}); 763 } 764 FRem.scalarize(0); 765 766 // TODO: Do we need to clamp maximum bitwidth? 767 getActionDefinitionsBuilder(G_TRUNC) 768 .legalIf(isScalar(0)) 769 .legalFor({{V2S16, V2S32}}) 770 .clampMaxNumElements(0, S16, 2) 771 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 772 // situations (like an invalid implicit use), we don't want to infinite loop 773 // in the legalizer. 774 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 775 .alwaysLegal(); 776 777 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 778 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 779 {S32, S1}, {S64, S1}, {S16, S1}}) 780 .scalarize(0) 781 .clampScalar(0, S32, S64) 782 .widenScalarToNextPow2(1, 32); 783 784 // TODO: Split s1->s64 during regbankselect for VALU. 785 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 786 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 787 .lowerFor({{S32, S64}}) 788 .lowerIf(typeIs(1, S1)) 789 .customFor({{S64, S64}}); 790 if (ST.has16BitInsts()) 791 IToFP.legalFor({{S16, S16}}); 792 IToFP.clampScalar(1, S32, S64) 793 .minScalar(0, S32) 794 .scalarize(0) 795 .widenScalarToNextPow2(1); 796 797 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 798 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 799 .customFor({{S64, S64}}) 800 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 801 if (ST.has16BitInsts()) 802 FPToI.legalFor({{S16, S16}}); 803 else 804 FPToI.minScalar(1, S32); 805 806 FPToI.minScalar(0, S32) 807 .scalarize(0) 808 .lower(); 809 810 // Lower roundeven into G_FRINT 811 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 812 .scalarize(0) 813 .lower(); 814 815 if (ST.has16BitInsts()) { 816 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 817 .legalFor({S16, S32, S64}) 818 .clampScalar(0, S16, S64) 819 .scalarize(0); 820 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 821 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 822 .legalFor({S32, S64}) 823 .clampScalar(0, S32, S64) 824 .scalarize(0); 825 } else { 826 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 827 .legalFor({S32}) 828 .customFor({S64}) 829 .clampScalar(0, S32, S64) 830 .scalarize(0); 831 } 832 833 getActionDefinitionsBuilder(G_PTR_ADD) 834 .legalIf(all(isPointer(0), sameSize(0, 1))) 835 .scalarize(0) 836 .scalarSameSizeAs(1, 0); 837 838 getActionDefinitionsBuilder(G_PTRMASK) 839 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 840 .scalarSameSizeAs(1, 0) 841 .scalarize(0); 842 843 auto &CmpBuilder = 844 getActionDefinitionsBuilder(G_ICMP) 845 // The compare output type differs based on the register bank of the output, 846 // so make both s1 and s32 legal. 847 // 848 // Scalar compares producing output in scc will be promoted to s32, as that 849 // is the allocatable register type that will be needed for the copy from 850 // scc. This will be promoted during RegBankSelect, and we assume something 851 // before that won't try to use s32 result types. 
852 // 853 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 854 // bank. 855 .legalForCartesianProduct( 856 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 857 .legalForCartesianProduct( 858 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 859 if (ST.has16BitInsts()) { 860 CmpBuilder.legalFor({{S1, S16}}); 861 } 862 863 CmpBuilder 864 .widenScalarToNextPow2(1) 865 .clampScalar(1, S32, S64) 866 .scalarize(0) 867 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 868 869 getActionDefinitionsBuilder(G_FCMP) 870 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 871 .widenScalarToNextPow2(1) 872 .clampScalar(1, S32, S64) 873 .scalarize(0); 874 875 // FIXME: fpow has a selection pattern that should move to custom lowering. 876 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 877 if (ST.has16BitInsts()) 878 Exp2Ops.legalFor({S32, S16}); 879 else 880 Exp2Ops.legalFor({S32}); 881 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 882 Exp2Ops.scalarize(0); 883 884 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 885 if (ST.has16BitInsts()) 886 ExpOps.customFor({{S32}, {S16}}); 887 else 888 ExpOps.customFor({S32}); 889 ExpOps.clampScalar(0, MinScalarFPTy, S32) 890 .scalarize(0); 891 892 getActionDefinitionsBuilder(G_FPOWI) 893 .clampScalar(0, MinScalarFPTy, S32) 894 .lower(); 895 896 // The 64-bit versions produce 32-bit results, but only on the SALU. 897 getActionDefinitionsBuilder(G_CTPOP) 898 .legalFor({{S32, S32}, {S32, S64}}) 899 .clampScalar(0, S32, S32) 900 .clampScalar(1, S32, S64) 901 .scalarize(0) 902 .widenScalarToNextPow2(0, 32) 903 .widenScalarToNextPow2(1, 32); 904 905 // The hardware instructions return a different result on 0 than the generic 906 // instructions expect. The hardware produces -1, but these produce the 907 // bitwidth. 908 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 909 .scalarize(0) 910 .clampScalar(0, S32, S32) 911 .clampScalar(1, S32, S64) 912 .widenScalarToNextPow2(0, 32) 913 .widenScalarToNextPow2(1, 32) 914 .lower(); 915 916 // The 64-bit versions produce 32-bit results, but only on the SALU. 917 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 918 .legalFor({{S32, S32}, {S32, S64}}) 919 .clampScalar(0, S32, S32) 920 .clampScalar(1, S32, S64) 921 .scalarize(0) 922 .widenScalarToNextPow2(0, 32) 923 .widenScalarToNextPow2(1, 32); 924 925 getActionDefinitionsBuilder(G_BITREVERSE) 926 .legalFor({S32}) 927 .clampScalar(0, S32, S32) 928 .scalarize(0); 929 930 if (ST.has16BitInsts()) { 931 getActionDefinitionsBuilder(G_BSWAP) 932 .legalFor({S16, S32, V2S16}) 933 .clampMaxNumElements(0, S16, 2) 934 // FIXME: Fixing non-power-of-2 before clamp is workaround for 935 // narrowScalar limitation. 
936 .widenScalarToNextPow2(0) 937 .clampScalar(0, S16, S32) 938 .scalarize(0); 939 940 if (ST.hasVOP3PInsts()) { 941 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 942 .legalFor({S32, S16, V2S16}) 943 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 944 .clampMaxNumElements(0, S16, 2) 945 .minScalar(0, S16) 946 .widenScalarToNextPow2(0) 947 .scalarize(0) 948 .lower(); 949 } else { 950 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 951 .legalFor({S32, S16}) 952 .widenScalarToNextPow2(0) 953 .minScalar(0, S16) 954 .scalarize(0) 955 .lower(); 956 } 957 } else { 958 // TODO: Should have same legality without v_perm_b32 959 getActionDefinitionsBuilder(G_BSWAP) 960 .legalFor({S32}) 961 .lowerIf(scalarNarrowerThan(0, 32)) 962 // FIXME: Fixing non-power-of-2 before clamp is workaround for 963 // narrowScalar limitation. 964 .widenScalarToNextPow2(0) 965 .maxScalar(0, S32) 966 .scalarize(0) 967 .lower(); 968 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 970 .legalFor({S32}) 971 .minScalar(0, S32) 972 .widenScalarToNextPow2(0) 973 .scalarize(0) 974 .lower(); 975 } 976 977 getActionDefinitionsBuilder(G_INTTOPTR) 978 // List the common cases 979 .legalForCartesianProduct(AddrSpaces64, {S64}) 980 .legalForCartesianProduct(AddrSpaces32, {S32}) 981 .scalarize(0) 982 // Accept any address space as long as the size matches 983 .legalIf(sameSize(0, 1)) 984 .widenScalarIf(smallerThan(1, 0), 985 [](const LegalityQuery &Query) { 986 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 987 }) 988 .narrowScalarIf(largerThan(1, 0), 989 [](const LegalityQuery &Query) { 990 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 991 }); 992 993 getActionDefinitionsBuilder(G_PTRTOINT) 994 // List the common cases 995 .legalForCartesianProduct(AddrSpaces64, {S64}) 996 .legalForCartesianProduct(AddrSpaces32, {S32}) 997 .scalarize(0) 998 // Accept any address space as long as the size matches 999 .legalIf(sameSize(0, 1)) 1000 .widenScalarIf(smallerThan(0, 1), 1001 [](const LegalityQuery &Query) { 1002 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1003 }) 1004 .narrowScalarIf( 1005 largerThan(0, 1), 1006 [](const LegalityQuery &Query) { 1007 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1008 }); 1009 1010 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1011 .scalarize(0) 1012 .custom(); 1013 1014 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1015 bool IsLoad) -> bool { 1016 const LLT DstTy = Query.Types[0]; 1017 1018 // Split vector extloads. 1019 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1020 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1021 1022 if (MemSize < DstTy.getSizeInBits()) 1023 MemSize = std::max(MemSize, AlignBits); 1024 1025 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1026 return true; 1027 1028 const LLT PtrTy = Query.Types[1]; 1029 unsigned AS = PtrTy.getAddressSpace(); 1030 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1031 return true; 1032 1033 // Catch weird sized loads that don't evenly divide into the access sizes 1034 // TODO: May be able to widen depending on alignment etc. 1035 unsigned NumRegs = (MemSize + 31) / 32; 1036 if (NumRegs == 3) { 1037 if (!ST.hasDwordx3LoadStores()) 1038 return true; 1039 } else { 1040 // If the alignment allows, these should have been widened. 
1041 if (!isPowerOf2_32(NumRegs)) 1042 return true; 1043 } 1044 1045 if (AlignBits < MemSize) { 1046 const SITargetLowering *TLI = ST.getTargetLowering(); 1047 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1048 Align(AlignBits / 8)); 1049 } 1050 1051 return false; 1052 }; 1053 1054 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1055 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1056 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1057 1058 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1059 // LDS 1060 // TODO: Unsupported flat for SI. 1061 1062 for (unsigned Op : {G_LOAD, G_STORE}) { 1063 const bool IsStore = Op == G_STORE; 1064 1065 auto &Actions = getActionDefinitionsBuilder(Op); 1066 // Explicitly list some common cases. 1067 // TODO: Does this help compile time at all? 1068 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1069 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1070 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1071 {S64, GlobalPtr, 64, GlobalAlign32}, 1072 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1073 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1074 {S32, GlobalPtr, 8, GlobalAlign8}, 1075 {S32, GlobalPtr, 16, GlobalAlign16}, 1076 1077 {S32, LocalPtr, 32, 32}, 1078 {S64, LocalPtr, 64, 32}, 1079 {V2S32, LocalPtr, 64, 32}, 1080 {S32, LocalPtr, 8, 8}, 1081 {S32, LocalPtr, 16, 16}, 1082 {V2S16, LocalPtr, 32, 32}, 1083 1084 {S32, PrivatePtr, 32, 32}, 1085 {S32, PrivatePtr, 8, 8}, 1086 {S32, PrivatePtr, 16, 16}, 1087 {V2S16, PrivatePtr, 32, 32}, 1088 1089 {S32, ConstantPtr, 32, GlobalAlign32}, 1090 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1091 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1092 {S64, ConstantPtr, 64, GlobalAlign32}, 1093 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1094 Actions.legalIf( 1095 [=](const LegalityQuery &Query) -> bool { 1096 return isLoadStoreLegal(ST, Query, Op); 1097 }); 1098 1099 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1100 // 64-bits. 1101 // 1102 // TODO: Should generalize bitcast action into coerce, which will also cover 1103 // inserting addrspacecasts. 1104 Actions.customIf(typeIs(1, Constant32Ptr)); 1105 1106 // Turn any illegal element vectors into something easier to deal 1107 // with. These will ultimately produce 32-bit scalar shifts to extract the 1108 // parts anyway. 1109 // 1110 // For odd 16-bit element vectors, prefer to split those into pieces with 1111 // 16-bit vector parts. 1112 Actions.bitcastIf( 1113 [=](const LegalityQuery &Query) -> bool { 1114 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1115 Query.MMODescrs[0].SizeInBits); 1116 }, bitcastToRegisterType(0)); 1117 1118 if (!IsStore) { 1119 // Widen suitably aligned loads by loading extra bytes. The standard 1120 // legalization actions can't properly express widening memory operands. 
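      // shouldWidenLoad() restricts this to non-atomic accesses whose widened
      // size still fits the address space limit and is allowed by the target.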
1121 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1122 return shouldWidenLoad(ST, Query, G_LOAD); 1123 }); 1124 } 1125 1126 // FIXME: load/store narrowing should be moved to lower action 1127 Actions 1128 .narrowScalarIf( 1129 [=](const LegalityQuery &Query) -> bool { 1130 return !Query.Types[0].isVector() && 1131 needToSplitMemOp(Query, Op == G_LOAD); 1132 }, 1133 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1134 const LLT DstTy = Query.Types[0]; 1135 const LLT PtrTy = Query.Types[1]; 1136 1137 const unsigned DstSize = DstTy.getSizeInBits(); 1138 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1139 1140 // Split extloads. 1141 if (DstSize > MemSize) 1142 return std::make_pair(0, LLT::scalar(MemSize)); 1143 1144 if (!isPowerOf2_32(DstSize)) { 1145 // We're probably decomposing an odd sized store. Try to split 1146 // to the widest type. TODO: Account for alignment. As-is it 1147 // should be OK, since the new parts will be further legalized. 1148 unsigned FloorSize = PowerOf2Floor(DstSize); 1149 return std::make_pair(0, LLT::scalar(FloorSize)); 1150 } 1151 1152 if (DstSize > 32 && (DstSize % 32 != 0)) { 1153 // FIXME: Need a way to specify non-extload of larger size if 1154 // suitably aligned. 1155 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1156 } 1157 1158 unsigned MaxSize = maxSizeForAddrSpace(ST, 1159 PtrTy.getAddressSpace(), 1160 Op == G_LOAD); 1161 if (MemSize > MaxSize) 1162 return std::make_pair(0, LLT::scalar(MaxSize)); 1163 1164 unsigned Align = Query.MMODescrs[0].AlignInBits; 1165 return std::make_pair(0, LLT::scalar(Align)); 1166 }) 1167 .fewerElementsIf( 1168 [=](const LegalityQuery &Query) -> bool { 1169 return Query.Types[0].isVector() && 1170 needToSplitMemOp(Query, Op == G_LOAD); 1171 }, 1172 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1173 const LLT DstTy = Query.Types[0]; 1174 const LLT PtrTy = Query.Types[1]; 1175 1176 LLT EltTy = DstTy.getElementType(); 1177 unsigned MaxSize = maxSizeForAddrSpace(ST, 1178 PtrTy.getAddressSpace(), 1179 Op == G_LOAD); 1180 1181 // FIXME: Handle widened to power of 2 results better. This ends 1182 // up scalarizing. 1183 // FIXME: 3 element stores scalarized on SI 1184 1185 // Split if it's too large for the address space. 1186 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1187 unsigned NumElts = DstTy.getNumElements(); 1188 unsigned EltSize = EltTy.getSizeInBits(); 1189 1190 if (MaxSize % EltSize == 0) { 1191 return std::make_pair( 1192 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1193 } 1194 1195 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1196 1197 // FIXME: Refine when odd breakdowns handled 1198 // The scalars will need to be re-legalized. 1199 if (NumPieces == 1 || NumPieces >= NumElts || 1200 NumElts % NumPieces != 0) 1201 return std::make_pair(0, EltTy); 1202 1203 return std::make_pair(0, 1204 LLT::vector(NumElts / NumPieces, EltTy)); 1205 } 1206 1207 // FIXME: We could probably handle weird extending loads better. 1208 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1209 if (DstTy.getSizeInBits() > MemSize) 1210 return std::make_pair(0, EltTy); 1211 1212 unsigned EltSize = EltTy.getSizeInBits(); 1213 unsigned DstSize = DstTy.getSizeInBits(); 1214 if (!isPowerOf2_32(DstSize)) { 1215 // We're probably decomposing an odd sized store. Try to split 1216 // to the widest type. TODO: Account for alignment. As-is it 1217 // should be OK, since the new parts will be further legalized. 
1218 unsigned FloorSize = PowerOf2Floor(DstSize); 1219 return std::make_pair( 1220 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1221 } 1222 1223 // Need to split because of alignment. 1224 unsigned Align = Query.MMODescrs[0].AlignInBits; 1225 if (EltSize > Align && 1226 (EltSize / Align < DstTy.getNumElements())) { 1227 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1228 } 1229 1230 // May need relegalization for the scalars. 1231 return std::make_pair(0, EltTy); 1232 }) 1233 .lowerIfMemSizeNotPow2() 1234 .minScalar(0, S32); 1235 1236 if (IsStore) 1237 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1238 1239 Actions 1240 .widenScalarToNextPow2(0) 1241 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1242 .lower(); 1243 } 1244 1245 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1246 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1247 {S32, GlobalPtr, 16, 2 * 8}, 1248 {S32, LocalPtr, 8, 8}, 1249 {S32, LocalPtr, 16, 16}, 1250 {S32, PrivatePtr, 8, 8}, 1251 {S32, PrivatePtr, 16, 16}, 1252 {S32, ConstantPtr, 8, 8}, 1253 {S32, ConstantPtr, 16, 2 * 8}}); 1254 if (ST.hasFlatAddressSpace()) { 1255 ExtLoads.legalForTypesWithMemDesc( 1256 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1257 } 1258 1259 ExtLoads.clampScalar(0, S32, S32) 1260 .widenScalarToNextPow2(0) 1261 .unsupportedIfMemSizeNotPow2() 1262 .lower(); 1263 1264 auto &Atomics = getActionDefinitionsBuilder( 1265 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1266 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1267 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1268 G_ATOMICRMW_UMIN}) 1269 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1270 {S64, GlobalPtr}, {S64, LocalPtr}, 1271 {S32, RegionPtr}, {S64, RegionPtr}}); 1272 if (ST.hasFlatAddressSpace()) { 1273 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1274 } 1275 1276 if (ST.hasLDSFPAtomics()) { 1277 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1278 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1279 } 1280 1281 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1282 // demarshalling 1283 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1284 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1285 {S32, FlatPtr}, {S64, FlatPtr}}) 1286 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1287 {S32, RegionPtr}, {S64, RegionPtr}}); 1288 // TODO: Pointer types, any 32-bit or 64-bit vector 1289 1290 // Condition should be s32 for scalar, s1 for vector. 1291 getActionDefinitionsBuilder(G_SELECT) 1292 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1293 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1294 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1295 .clampScalar(0, S16, S64) 1296 .scalarize(1) 1297 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1298 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1299 .clampMaxNumElements(0, S32, 2) 1300 .clampMaxNumElements(0, LocalPtr, 2) 1301 .clampMaxNumElements(0, PrivatePtr, 2) 1302 .scalarize(0) 1303 .widenScalarToNextPow2(0) 1304 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1305 1306 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1307 // be more flexible with the shift amount type. 
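  // (4 bits for 16-bit shifts, 5 bits for 32-bit, 6 bits for 64-bit.)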
1308 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1309 .legalFor({{S32, S32}, {S64, S32}}); 1310 if (ST.has16BitInsts()) { 1311 if (ST.hasVOP3PInsts()) { 1312 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1313 .clampMaxNumElements(0, S16, 2); 1314 } else 1315 Shifts.legalFor({{S16, S16}}); 1316 1317 // TODO: Support 16-bit shift amounts for all types 1318 Shifts.widenScalarIf( 1319 [=](const LegalityQuery &Query) { 1320 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1321 // 32-bit amount. 1322 const LLT ValTy = Query.Types[0]; 1323 const LLT AmountTy = Query.Types[1]; 1324 return ValTy.getSizeInBits() <= 16 && 1325 AmountTy.getSizeInBits() < 16; 1326 }, changeTo(1, S16)); 1327 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1328 Shifts.clampScalar(1, S32, S32); 1329 Shifts.clampScalar(0, S16, S64); 1330 Shifts.widenScalarToNextPow2(0, 16); 1331 1332 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1333 .minScalar(0, S16) 1334 .scalarize(0) 1335 .lower(); 1336 } else { 1337 // Make sure we legalize the shift amount type first, as the general 1338 // expansion for the shifted type will produce much worse code if it hasn't 1339 // been truncated already. 1340 Shifts.clampScalar(1, S32, S32); 1341 Shifts.clampScalar(0, S32, S64); 1342 Shifts.widenScalarToNextPow2(0, 32); 1343 1344 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1345 .minScalar(0, S32) 1346 .scalarize(0) 1347 .lower(); 1348 } 1349 Shifts.scalarize(0); 1350 1351 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1352 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1353 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1354 unsigned IdxTypeIdx = 2; 1355 1356 getActionDefinitionsBuilder(Op) 1357 .customIf([=](const LegalityQuery &Query) { 1358 const LLT EltTy = Query.Types[EltTypeIdx]; 1359 const LLT VecTy = Query.Types[VecTypeIdx]; 1360 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1361 const unsigned EltSize = EltTy.getSizeInBits(); 1362 return (EltSize == 32 || EltSize == 64) && 1363 VecTy.getSizeInBits() % 32 == 0 && 1364 VecTy.getSizeInBits() <= MaxRegisterSize && 1365 IdxTy.getSizeInBits() == 32; 1366 }) 1367 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1368 bitcastToVectorElement32(VecTypeIdx)) 1369 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1370 .bitcastIf( 1371 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1372 [=](const LegalityQuery &Query) { 1373 // For > 64-bit element types, try to turn this into a 64-bit 1374 // element vector since we may be able to do better indexing 1375 // if this is scalar. If not, fall back to 32. 1376 const LLT EltTy = Query.Types[EltTypeIdx]; 1377 const LLT VecTy = Query.Types[VecTypeIdx]; 1378 const unsigned DstEltSize = EltTy.getSizeInBits(); 1379 const unsigned VecSize = VecTy.getSizeInBits(); 1380 1381 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1382 return std::make_pair( 1383 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1384 }) 1385 .clampScalar(EltTypeIdx, S32, S64) 1386 .clampScalar(VecTypeIdx, S32, S64) 1387 .clampScalar(IdxTypeIdx, S32, S32) 1388 .clampMaxNumElements(VecTypeIdx, S32, 32) 1389 // TODO: Clamp elements for 64-bit vectors? 1390 // It should only be necessary with variable indexes. 
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ?
1 : 0; 1481 1482 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1483 const LLT Ty = Query.Types[TypeIdx]; 1484 if (Ty.isVector()) { 1485 const LLT &EltTy = Ty.getElementType(); 1486 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1487 return true; 1488 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1489 return true; 1490 } 1491 return false; 1492 }; 1493 1494 auto &Builder = getActionDefinitionsBuilder(Op) 1495 .lowerFor({{S16, V2S16}}) 1496 .lowerIf([=](const LegalityQuery &Query) { 1497 const LLT BigTy = Query.Types[BigTyIdx]; 1498 return BigTy.getSizeInBits() == 32; 1499 }) 1500 // Try to widen to s16 first for small types. 1501 // TODO: Only do this on targets with legal s16 shifts 1502 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1503 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1504 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1505 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1506 elementTypeIs(1, S16)), 1507 changeTo(1, V2S16)) 1508 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1509 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1510 // valid. 1511 .clampScalar(LitTyIdx, S32, S512) 1512 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1513 // Break up vectors with weird elements into scalars 1514 .fewerElementsIf( 1515 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1516 scalarize(0)) 1517 .fewerElementsIf( 1518 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1519 scalarize(1)) 1520 .clampScalar(BigTyIdx, S32, MaxScalar); 1521 1522 if (Op == G_MERGE_VALUES) { 1523 Builder.widenScalarIf( 1524 // TODO: Use 16-bit shifts if legal for 8-bit values? 1525 [=](const LegalityQuery &Query) { 1526 const LLT Ty = Query.Types[LitTyIdx]; 1527 return Ty.getSizeInBits() < 32; 1528 }, 1529 changeTo(LitTyIdx, S32)); 1530 } 1531 1532 Builder.widenScalarIf( 1533 [=](const LegalityQuery &Query) { 1534 const LLT Ty = Query.Types[BigTyIdx]; 1535 return !isPowerOf2_32(Ty.getSizeInBits()) && 1536 Ty.getSizeInBits() % 16 != 0; 1537 }, 1538 [=](const LegalityQuery &Query) { 1539 // Pick the next power of 2, or a multiple of 64 over 128. 1540 // Whichever is smaller. 1541 const LLT &Ty = Query.Types[BigTyIdx]; 1542 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1543 if (NewSizeInBits >= 256) { 1544 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1545 if (RoundedTo < NewSizeInBits) 1546 NewSizeInBits = RoundedTo; 1547 } 1548 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1549 }) 1550 .legalIf([=](const LegalityQuery &Query) { 1551 const LLT &BigTy = Query.Types[BigTyIdx]; 1552 const LLT &LitTy = Query.Types[LitTyIdx]; 1553 1554 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1555 return false; 1556 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1557 return false; 1558 1559 return BigTy.getSizeInBits() % 16 == 0 && 1560 LitTy.getSizeInBits() % 16 == 0 && 1561 BigTy.getSizeInBits() <= MaxRegisterSize; 1562 }) 1563 // Any vectors left are the wrong size. Scalarize them. 1564 .scalarize(0) 1565 .scalarize(1); 1566 } 1567 1568 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1569 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
1677 return legalizeUDIV_UREM(MI, MRI, B); 1678 case TargetOpcode::G_SDIV: 1679 case TargetOpcode::G_SREM: 1680 return legalizeSDIV_SREM(MI, MRI, B); 1681 case TargetOpcode::G_ATOMIC_CMPXCHG: 1682 return legalizeAtomicCmpXChg(MI, MRI, B); 1683 case TargetOpcode::G_FLOG: 1684 return legalizeFlog(MI, B, numbers::ln2f); 1685 case TargetOpcode::G_FLOG10: 1686 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1687 case TargetOpcode::G_FEXP: 1688 return legalizeFExp(MI, B); 1689 case TargetOpcode::G_FPOW: 1690 return legalizeFPow(MI, B); 1691 case TargetOpcode::G_FFLOOR: 1692 return legalizeFFloor(MI, MRI, B); 1693 case TargetOpcode::G_BUILD_VECTOR: 1694 return legalizeBuildVector(MI, MRI, B); 1695 default: 1696 return false; 1697 } 1698 1699 llvm_unreachable("expected switch to return"); 1700 } 1701 1702 Register AMDGPULegalizerInfo::getSegmentAperture( 1703 unsigned AS, 1704 MachineRegisterInfo &MRI, 1705 MachineIRBuilder &B) const { 1706 MachineFunction &MF = B.getMF(); 1707 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1708 const LLT S32 = LLT::scalar(32); 1709 1710 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1711 1712 if (ST.hasApertureRegs()) { 1713 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1714 // getreg. 1715 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1716 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1717 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1718 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1719 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1720 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1721 unsigned Encoding = 1722 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1723 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1724 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1725 1726 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1727 1728 B.buildInstr(AMDGPU::S_GETREG_B32) 1729 .addDef(GetReg) 1730 .addImm(Encoding); 1731 MRI.setType(GetReg, S32); 1732 1733 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1734 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1735 } 1736 1737 Register QueuePtr = MRI.createGenericVirtualRegister( 1738 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1739 1740 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1741 return Register(); 1742 1743 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1744 // private_segment_aperture_base_hi. 1745 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1746 1747 // TODO: can we be smarter about machine pointer info? 
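// When aperture registers are unavailable, the 32-bit aperture value is
// loaded from the queue pointer below; the sequence is roughly (illustrative
// MIR, not exact output):
//   %queue:_(p4)     = <QUEUE_PTR kernel input>
//   %addr:_(p4)      = G_PTR_ADD %queue, StructOffset
//   %aperture:_(s32) = G_LOAD %addr :: (dereferenceable invariant load 4)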
1748 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1749 MachineMemOperand *MMO = MF.getMachineMemOperand( 1750 PtrInfo, 1751 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1752 MachineMemOperand::MOInvariant, 1753 4, commonAlignment(Align(64), StructOffset)); 1754 1755 Register LoadAddr; 1756 1757 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1758 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1759 } 1760 1761 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1762 MachineInstr &MI, MachineRegisterInfo &MRI, 1763 MachineIRBuilder &B) const { 1764 MachineFunction &MF = B.getMF(); 1765 1766 const LLT S32 = LLT::scalar(32); 1767 Register Dst = MI.getOperand(0).getReg(); 1768 Register Src = MI.getOperand(1).getReg(); 1769 1770 LLT DstTy = MRI.getType(Dst); 1771 LLT SrcTy = MRI.getType(Src); 1772 unsigned DestAS = DstTy.getAddressSpace(); 1773 unsigned SrcAS = SrcTy.getAddressSpace(); 1774 1775 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1776 // vector element. 1777 assert(!DstTy.isVector()); 1778 1779 const AMDGPUTargetMachine &TM 1780 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1781 1782 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1783 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1784 return true; 1785 } 1786 1787 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1788 // Truncate. 1789 B.buildExtract(Dst, Src, 0); 1790 MI.eraseFromParent(); 1791 return true; 1792 } 1793 1794 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1795 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1796 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1797 1798 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1799 // another. Merge operands are required to be the same type, but creating an 1800 // extra ptrtoint would be kind of pointless. 1801 auto HighAddr = B.buildConstant( 1802 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1803 B.buildMerge(Dst, {Src, HighAddr}); 1804 MI.eraseFromParent(); 1805 return true; 1806 } 1807 1808 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1809 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1810 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1811 unsigned NullVal = TM.getNullPointerValue(DestAS); 1812 1813 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1814 auto FlatNull = B.buildConstant(SrcTy, 0); 1815 1816 // Extract low 32-bits of the pointer. 1817 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1818 1819 auto CmpRes = 1820 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1821 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1822 1823 MI.eraseFromParent(); 1824 return true; 1825 } 1826 1827 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1828 return false; 1829 1830 if (!ST.hasFlatAddressSpace()) 1831 return false; 1832 1833 auto SegmentNull = 1834 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1835 auto FlatNull = 1836 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1837 1838 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1839 if (!ApertureReg.isValid()) 1840 return false; 1841 1842 auto CmpRes = 1843 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1844 1845 // Coerce the type of the low half of the result so we can use merge_values. 
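// Summarizing the instructions built below (illustrative):
//   flat = (src != segment_null) ? merge(ptrtoint(src), aperture) : flat_null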
1846 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1847 1848 // TODO: Should we allow mismatched types but matching sizes in merges to 1849 // avoid the ptrtoint? 1850 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1851 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1852 1853 MI.eraseFromParent(); 1854 return true; 1855 } 1856 1857 bool AMDGPULegalizerInfo::legalizeFrint( 1858 MachineInstr &MI, MachineRegisterInfo &MRI, 1859 MachineIRBuilder &B) const { 1860 Register Src = MI.getOperand(1).getReg(); 1861 LLT Ty = MRI.getType(Src); 1862 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1863 1864 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1865 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1866 1867 auto C1 = B.buildFConstant(Ty, C1Val); 1868 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1869 1870 // TODO: Should this propagate fast-math-flags? 1871 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1872 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1873 1874 auto C2 = B.buildFConstant(Ty, C2Val); 1875 auto Fabs = B.buildFAbs(Ty, Src); 1876 1877 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1878 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 bool AMDGPULegalizerInfo::legalizeFceil( 1884 MachineInstr &MI, MachineRegisterInfo &MRI, 1885 MachineIRBuilder &B) const { 1886 1887 const LLT S1 = LLT::scalar(1); 1888 const LLT S64 = LLT::scalar(64); 1889 1890 Register Src = MI.getOperand(1).getReg(); 1891 assert(MRI.getType(Src) == S64); 1892 1893 // result = trunc(src) 1894 // if (src > 0.0 && src != result) 1895 // result += 1.0 1896 1897 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1898 1899 const auto Zero = B.buildFConstant(S64, 0.0); 1900 const auto One = B.buildFConstant(S64, 1.0); 1901 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1902 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1903 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1904 auto Add = B.buildSelect(S64, And, One, Zero); 1905 1906 // TODO: Should this propagate fast-math-flags? 
1907 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
MI.eraseFromParent();
1908 return true;
1909 }
1910
1911 bool AMDGPULegalizerInfo::legalizeFrem(
1912 MachineInstr &MI, MachineRegisterInfo &MRI,
1913 MachineIRBuilder &B) const {
1914 Register DstReg = MI.getOperand(0).getReg();
1915 Register Src0Reg = MI.getOperand(1).getReg();
1916 Register Src1Reg = MI.getOperand(2).getReg();
1917 auto Flags = MI.getFlags();
1918 LLT Ty = MRI.getType(DstReg);
1919
1920 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1921 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1922 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1923 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1924 MI.eraseFromParent();
1925 return true;
1926 }
1927
1928 static MachineInstrBuilder extractF64Exponent(Register Hi,
1929 MachineIRBuilder &B) {
1930 const unsigned FractBits = 52;
1931 const unsigned ExpBits = 11;
1932 LLT S32 = LLT::scalar(32);
1933
1934 auto Const0 = B.buildConstant(S32, FractBits - 32);
1935 auto Const1 = B.buildConstant(S32, ExpBits);
1936
1937 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1938 .addUse(Hi)
1939 .addUse(Const0.getReg(0))
1940 .addUse(Const1.getReg(0));
1941
1942 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1943 }
1944
1945 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1946 MachineInstr &MI, MachineRegisterInfo &MRI,
1947 MachineIRBuilder &B) const {
1948 const LLT S1 = LLT::scalar(1);
1949 const LLT S32 = LLT::scalar(32);
1950 const LLT S64 = LLT::scalar(64);
1951
1952 Register Src = MI.getOperand(1).getReg();
1953 assert(MRI.getType(Src) == S64);
1954
1955 // TODO: Should this use extract since the low half is unused?
1956 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1957 Register Hi = Unmerge.getReg(1);
1958
1959 // Extract the upper half, since this is where we will find the sign and
1960 // exponent.
1961 auto Exp = extractF64Exponent(Hi, B);
1962
1963 const unsigned FractBits = 52;
1964
1965 // Extract the sign bit.
1966 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1967 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1968
1969 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1970
1971 const auto Zero32 = B.buildConstant(S32, 0);
1972
1973 // Extend back to 64-bits.
1974 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1975
1976 auto Shr = B.buildAShr(S64, FractMask, Exp);
1977 auto Not = B.buildNot(S64, Shr);
1978 auto Tmp0 = B.buildAnd(S64, Src, Not);
1979 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1980
1981 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1982 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1983
1984 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1985 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1986 MI.eraseFromParent();
1987 return true;
1988 }
1989
1990 bool AMDGPULegalizerInfo::legalizeITOFP(
1991 MachineInstr &MI, MachineRegisterInfo &MRI,
1992 MachineIRBuilder &B, bool Signed) const {
1993
1994 Register Dst = MI.getOperand(0).getReg();
1995 Register Src = MI.getOperand(1).getReg();
1996
1997 const LLT S64 = LLT::scalar(64);
1998 const LLT S32 = LLT::scalar(32);
1999
2000 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2001
2002 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2003
2004 auto CvtHi = Signed ?
2005 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2006 B.buildUITOFP(S64, Unmerge.getReg(1)); 2007 2008 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2009 2010 auto ThirtyTwo = B.buildConstant(S32, 32); 2011 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2012 .addUse(CvtHi.getReg(0)) 2013 .addUse(ThirtyTwo.getReg(0)); 2014 2015 // TODO: Should this propagate fast-math-flags? 2016 B.buildFAdd(Dst, LdExp, CvtLo); 2017 MI.eraseFromParent(); 2018 return true; 2019 } 2020 2021 // TODO: Copied from DAG implementation. Verify logic and document how this 2022 // actually works. 2023 bool AMDGPULegalizerInfo::legalizeFPTOI( 2024 MachineInstr &MI, MachineRegisterInfo &MRI, 2025 MachineIRBuilder &B, bool Signed) const { 2026 2027 Register Dst = MI.getOperand(0).getReg(); 2028 Register Src = MI.getOperand(1).getReg(); 2029 2030 const LLT S64 = LLT::scalar(64); 2031 const LLT S32 = LLT::scalar(32); 2032 2033 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2034 2035 unsigned Flags = MI.getFlags(); 2036 2037 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2038 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2039 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2040 2041 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2042 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2043 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2044 2045 auto Hi = Signed ? 2046 B.buildFPTOSI(S32, FloorMul) : 2047 B.buildFPTOUI(S32, FloorMul); 2048 auto Lo = B.buildFPTOUI(S32, Fma); 2049 2050 B.buildMerge(Dst, { Lo, Hi }); 2051 MI.eraseFromParent(); 2052 2053 return true; 2054 } 2055 2056 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2057 MachineInstr &MI) const { 2058 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2059 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2060 2061 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2062 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2063 2064 // With ieee_mode disabled, the instructions have the correct behavior 2065 // already for G_FMINNUM/G_FMAXNUM 2066 if (!MFI->getMode().IEEE) 2067 return !IsIEEEOp; 2068 2069 if (IsIEEEOp) 2070 return true; 2071 2072 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2073 } 2074 2075 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2076 MachineInstr &MI, MachineRegisterInfo &MRI, 2077 MachineIRBuilder &B) const { 2078 // TODO: Should move some of this into LegalizerHelper. 2079 2080 // TODO: Promote dynamic indexing of s16 to s32 2081 2082 // FIXME: Artifact combiner probably should have replaced the truncated 2083 // constant before this, so we shouldn't need 2084 // getConstantVRegValWithLookThrough. 2085 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2086 MI.getOperand(2).getReg(), MRI); 2087 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2088 return true; 2089 2090 Register Dst = MI.getOperand(0).getReg(); 2091 Register Vec = MI.getOperand(1).getReg(); 2092 2093 LLT VecTy = MRI.getType(Vec); 2094 LLT EltTy = VecTy.getElementType(); 2095 assert(EltTy == MRI.getType(Dst)); 2096 2097 if (IdxVal->Value < VecTy.getNumElements()) 2098 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2099 else 2100 B.buildUndef(Dst); 2101 2102 MI.eraseFromParent(); 2103 return true; 2104 } 2105 2106 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2107 MachineInstr &MI, MachineRegisterInfo &MRI, 2108 MachineIRBuilder &B) const { 2109 // TODO: Should move some of this into LegalizerHelper. 2110 2111 // TODO: Promote dynamic indexing of s16 to s32 2112 2113 // FIXME: Artifact combiner probably should have replaced the truncated 2114 // constant before this, so we shouldn't need 2115 // getConstantVRegValWithLookThrough. 2116 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2117 MI.getOperand(3).getReg(), MRI); 2118 if (!IdxVal) // Dynamic case will be selected to register indexing. 2119 return true; 2120 2121 Register Dst = MI.getOperand(0).getReg(); 2122 Register Vec = MI.getOperand(1).getReg(); 2123 Register Ins = MI.getOperand(2).getReg(); 2124 2125 LLT VecTy = MRI.getType(Vec); 2126 LLT EltTy = VecTy.getElementType(); 2127 assert(EltTy == MRI.getType(Ins)); 2128 2129 if (IdxVal->Value < VecTy.getNumElements()) 2130 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2131 else 2132 B.buildUndef(Dst); 2133 2134 MI.eraseFromParent(); 2135 return true; 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2139 MachineInstr &MI, MachineRegisterInfo &MRI, 2140 MachineIRBuilder &B) const { 2141 const LLT V2S16 = LLT::vector(2, 16); 2142 2143 Register Dst = MI.getOperand(0).getReg(); 2144 Register Src0 = MI.getOperand(1).getReg(); 2145 LLT DstTy = MRI.getType(Dst); 2146 LLT SrcTy = MRI.getType(Src0); 2147 2148 if (SrcTy == V2S16 && DstTy == V2S16 && 2149 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2150 return true; 2151 2152 MachineIRBuilder HelperBuilder(MI); 2153 GISelObserverWrapper DummyObserver; 2154 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2155 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeSinCos( 2159 MachineInstr &MI, MachineRegisterInfo &MRI, 2160 MachineIRBuilder &B) const { 2161 2162 Register DstReg = MI.getOperand(0).getReg(); 2163 Register SrcReg = MI.getOperand(1).getReg(); 2164 LLT Ty = MRI.getType(DstReg); 2165 unsigned Flags = MI.getFlags(); 2166 2167 Register TrigVal; 2168 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2169 if (ST.hasTrigReducedRange()) { 2170 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2171 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2172 .addUse(MulVal.getReg(0)) 2173 .setMIFlags(Flags).getReg(0); 2174 } else 2175 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2176 2177 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2178 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2179 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2180 .addUse(TrigVal) 2181 .setMIFlags(Flags); 2182 MI.eraseFromParent(); 2183 return true; 2184 } 2185 2186 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2187 MachineIRBuilder &B, 2188 const GlobalValue *GV, 2189 int64_t Offset, 2190 unsigned GAFlags) const { 2191 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2192 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2193 // to the following code sequence: 2194 // 2195 // For constant address space: 2196 // s_getpc_b64 s[0:1] 2197 // s_add_u32 s0, s0, $symbol 2198 // s_addc_u32 s1, s1, 0 2199 // 2200 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2201 // a fixup or relocation is emitted to replace $symbol with a literal 2202 // constant, which is a pc-relative offset from the encoding of the $symbol 2203 // operand to the global variable. 2204 // 2205 // For global address space: 2206 // s_getpc_b64 s[0:1] 2207 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2208 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2209 // 2210 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2211 // fixups or relocations are emitted to replace $symbol@*@lo and 2212 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2213 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2214 // operand to the global variable. 2215 // 2216 // What we want here is an offset from the value returned by s_getpc 2217 // (which is the address of the s_add_u32 instruction) to the global 2218 // variable, but since the encoding of $symbol starts 4 bytes after the start 2219 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2220 // small. This requires us to add 4 to the global variable offset in order to 2221 // compute the correct address. 2222 2223 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2224 2225 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2226 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2227 2228 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2229 .addDef(PCReg); 2230 2231 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2232 if (GAFlags == SIInstrInfo::MO_NONE) 2233 MIB.addImm(0); 2234 else 2235 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2236 2237 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2238 2239 if (PtrTy.getSizeInBits() == 32) 2240 B.buildExtract(DstReg, PCReg, 0); 2241 return true; 2242 } 2243 2244 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2245 MachineInstr &MI, MachineRegisterInfo &MRI, 2246 MachineIRBuilder &B) const { 2247 Register DstReg = MI.getOperand(0).getReg(); 2248 LLT Ty = MRI.getType(DstReg); 2249 unsigned AS = Ty.getAddressSpace(); 2250 2251 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2252 MachineFunction &MF = B.getMF(); 2253 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2254 2255 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2256 if (!MFI->isEntryFunction()) { 2257 const Function &Fn = MF.getFunction(); 2258 DiagnosticInfoUnsupported BadLDSDecl( 2259 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2260 DS_Warning); 2261 Fn.getContext().diagnose(BadLDSDecl); 2262 2263 // We currently don't have a way to correctly allocate LDS objects that 2264 // aren't directly associated with a kernel. We do force inlining of 2265 // functions that use local objects. However, if these dead functions are 2266 // not eliminated, we don't want a compile time error. Just emit a warning 2267 // and a trap, since there should be no callable path here. 2268 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2269 B.buildUndef(DstReg); 2270 MI.eraseFromParent(); 2271 return true; 2272 } 2273 2274 // TODO: We could emit code to handle the initialization somewhere. 2275 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2276 const SITargetLowering *TLI = ST.getTargetLowering(); 2277 if (!TLI->shouldUseLDSConstAddress(GV)) { 2278 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2279 return true; // Leave in place; 2280 } 2281 2282 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2283 Type *Ty = GV->getValueType(); 2284 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2285 // zero-sized type in other languages to declare the dynamic shared 2286 // memory which size is not known at the compile time. They will be 2287 // allocated by the runtime and placed directly after the static 2288 // allocated ones. They all share the same offset. 2289 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2290 // Adjust alignment for that dynamic shared memory array. 
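// The resulting address is just the total size of the statically allocated
// LDS (llvm.amdgcn.groupstaticsize) converted to a pointer, so the dynamic
// array begins immediately after the static allocations.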
2291 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2292 LLT S32 = LLT::scalar(32); 2293 auto Sz = 2294 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2295 B.buildIntToPtr(DstReg, Sz); 2296 MI.eraseFromParent(); 2297 return true; 2298 } 2299 } 2300 2301 B.buildConstant( 2302 DstReg, 2303 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2304 MI.eraseFromParent(); 2305 return true; 2306 } 2307 2308 const Function &Fn = MF.getFunction(); 2309 DiagnosticInfoUnsupported BadInit( 2310 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2311 Fn.getContext().diagnose(BadInit); 2312 return true; 2313 } 2314 2315 const SITargetLowering *TLI = ST.getTargetLowering(); 2316 2317 if (TLI->shouldEmitFixup(GV)) { 2318 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2319 MI.eraseFromParent(); 2320 return true; 2321 } 2322 2323 if (TLI->shouldEmitPCReloc(GV)) { 2324 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2325 MI.eraseFromParent(); 2326 return true; 2327 } 2328 2329 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2330 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2331 2332 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2333 MachinePointerInfo::getGOT(MF), 2334 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2335 MachineMemOperand::MOInvariant, 2336 8 /*Size*/, Align(8)); 2337 2338 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2339 2340 if (Ty.getSizeInBits() == 32) { 2341 // Truncate if this is a 32-bit constant adrdess. 2342 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2343 B.buildExtract(DstReg, Load, 0); 2344 } else 2345 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2346 2347 MI.eraseFromParent(); 2348 return true; 2349 } 2350 2351 static LLT widenToNextPowerOf2(LLT Ty) { 2352 if (Ty.isVector()) 2353 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2354 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2355 } 2356 2357 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2358 MachineInstr &MI) const { 2359 MachineIRBuilder &B = Helper.MIRBuilder; 2360 MachineRegisterInfo &MRI = *B.getMRI(); 2361 GISelChangeObserver &Observer = Helper.Observer; 2362 2363 Register PtrReg = MI.getOperand(1).getReg(); 2364 LLT PtrTy = MRI.getType(PtrReg); 2365 unsigned AddrSpace = PtrTy.getAddressSpace(); 2366 2367 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2368 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2369 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2370 Observer.changingInstr(MI); 2371 MI.getOperand(1).setReg(Cast.getReg(0)); 2372 Observer.changedInstr(MI); 2373 return true; 2374 } 2375 2376 Register ValReg = MI.getOperand(0).getReg(); 2377 LLT ValTy = MRI.getType(ValReg); 2378 2379 MachineMemOperand *MMO = *MI.memoperands_begin(); 2380 const unsigned ValSize = ValTy.getSizeInBits(); 2381 const unsigned MemSize = 8 * MMO->getSize(); 2382 const Align MemAlign = MMO->getAlign(); 2383 const unsigned AlignInBits = 8 * MemAlign.value(); 2384 2385 // Widen non-power-of-2 loads to the alignment if needed 2386 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2387 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2388 2389 // This was already the correct extending load result type, so just adjust 2390 // the memory type. 
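// e.g. (hypothetical) an extending load of 3 bytes into an s32 result: the
// result type is already 32 bits wide, so only the MMO size is bumped up to
// 4 bytes.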
2391 if (WideMemSize == ValSize) { 2392 MachineFunction &MF = B.getMF(); 2393 2394 MachineMemOperand *WideMMO = 2395 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2396 Observer.changingInstr(MI); 2397 MI.setMemRefs(MF, {WideMMO}); 2398 Observer.changedInstr(MI); 2399 return true; 2400 } 2401 2402 // Don't bother handling edge case that should probably never be produced. 2403 if (ValSize > WideMemSize) 2404 return false; 2405 2406 LLT WideTy = widenToNextPowerOf2(ValTy); 2407 2408 Register WideLoad; 2409 if (!WideTy.isVector()) { 2410 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2411 B.buildTrunc(ValReg, WideLoad).getReg(0); 2412 } else { 2413 // Extract the subvector. 2414 2415 if (isRegisterType(ValTy)) { 2416 // If this a case where G_EXTRACT is legal, use it. 2417 // (e.g. <3 x s32> -> <4 x s32>) 2418 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2419 B.buildExtract(ValReg, WideLoad, 0); 2420 } else { 2421 // For cases where the widened type isn't a nice register value, unmerge 2422 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2423 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2424 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2425 B.setInsertPt(B.getMBB(), MI.getIterator()); 2426 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2427 } 2428 } 2429 2430 MI.eraseFromParent(); 2431 return true; 2432 } 2433 2434 return false; 2435 } 2436 2437 bool AMDGPULegalizerInfo::legalizeFMad( 2438 MachineInstr &MI, MachineRegisterInfo &MRI, 2439 MachineIRBuilder &B) const { 2440 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2441 assert(Ty.isScalar()); 2442 2443 MachineFunction &MF = B.getMF(); 2444 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2445 2446 // TODO: Always legal with future ftz flag. 2447 // FIXME: Do we need just output? 
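// G_FMAD is only kept if the relevant denormal mode is flushing denormals,
// since the hardware mad/mac instructions do not respect them; otherwise it
// is expanded to G_FMUL + G_FADD via LegalizerHelper::lowerFMad below.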
2448 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2449 return true; 2450 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2451 return true; 2452 2453 MachineIRBuilder HelperBuilder(MI); 2454 GISelObserverWrapper DummyObserver; 2455 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2456 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2457 } 2458 2459 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2460 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2461 Register DstReg = MI.getOperand(0).getReg(); 2462 Register PtrReg = MI.getOperand(1).getReg(); 2463 Register CmpVal = MI.getOperand(2).getReg(); 2464 Register NewVal = MI.getOperand(3).getReg(); 2465 2466 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2467 "this should not have been custom lowered"); 2468 2469 LLT ValTy = MRI.getType(CmpVal); 2470 LLT VecTy = LLT::vector(2, ValTy); 2471 2472 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2473 2474 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2475 .addDef(DstReg) 2476 .addUse(PtrReg) 2477 .addUse(PackedVal) 2478 .setMemRefs(MI.memoperands()); 2479 2480 MI.eraseFromParent(); 2481 return true; 2482 } 2483 2484 bool AMDGPULegalizerInfo::legalizeFlog( 2485 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2486 Register Dst = MI.getOperand(0).getReg(); 2487 Register Src = MI.getOperand(1).getReg(); 2488 LLT Ty = B.getMRI()->getType(Dst); 2489 unsigned Flags = MI.getFlags(); 2490 2491 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2492 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2493 2494 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2495 MI.eraseFromParent(); 2496 return true; 2497 } 2498 2499 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2500 MachineIRBuilder &B) const { 2501 Register Dst = MI.getOperand(0).getReg(); 2502 Register Src = MI.getOperand(1).getReg(); 2503 unsigned Flags = MI.getFlags(); 2504 LLT Ty = B.getMRI()->getType(Dst); 2505 2506 auto K = B.buildFConstant(Ty, numbers::log2e); 2507 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2508 B.buildFExp2(Dst, Mul, Flags); 2509 MI.eraseFromParent(); 2510 return true; 2511 } 2512 2513 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2514 MachineIRBuilder &B) const { 2515 Register Dst = MI.getOperand(0).getReg(); 2516 Register Src0 = MI.getOperand(1).getReg(); 2517 Register Src1 = MI.getOperand(2).getReg(); 2518 unsigned Flags = MI.getFlags(); 2519 LLT Ty = B.getMRI()->getType(Dst); 2520 const LLT S16 = LLT::scalar(16); 2521 const LLT S32 = LLT::scalar(32); 2522 2523 if (Ty == S32) { 2524 auto Log = B.buildFLog2(S32, Src0, Flags); 2525 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2526 .addUse(Log.getReg(0)) 2527 .addUse(Src1) 2528 .setMIFlags(Flags); 2529 B.buildFExp2(Dst, Mul, Flags); 2530 } else if (Ty == S16) { 2531 // There's no f16 fmul_legacy, so we need to convert for it. 
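// i.e. roughly: dst = exp2(fptrunc(fmul_legacy(fpext(log2(x)), fpext(y))))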
2532 auto Log = B.buildFLog2(S16, Src0, Flags); 2533 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2534 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2535 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2536 .addUse(Ext0.getReg(0)) 2537 .addUse(Ext1.getReg(0)) 2538 .setMIFlags(Flags); 2539 2540 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2541 } else 2542 return false; 2543 2544 MI.eraseFromParent(); 2545 return true; 2546 } 2547 2548 // Find a source register, ignoring any possible source modifiers. 2549 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2550 Register ModSrc = OrigSrc; 2551 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2552 ModSrc = SrcFNeg->getOperand(1).getReg(); 2553 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2554 ModSrc = SrcFAbs->getOperand(1).getReg(); 2555 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2556 ModSrc = SrcFAbs->getOperand(1).getReg(); 2557 return ModSrc; 2558 } 2559 2560 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2561 MachineRegisterInfo &MRI, 2562 MachineIRBuilder &B) const { 2563 2564 const LLT S1 = LLT::scalar(1); 2565 const LLT S64 = LLT::scalar(64); 2566 Register Dst = MI.getOperand(0).getReg(); 2567 Register OrigSrc = MI.getOperand(1).getReg(); 2568 unsigned Flags = MI.getFlags(); 2569 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2570 "this should not have been custom lowered"); 2571 2572 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2573 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2574 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2575 // V_FRACT bug is: 2576 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2577 // 2578 // Convert floor(x) to (x - fract(x)) 2579 2580 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2581 .addUse(OrigSrc) 2582 .setMIFlags(Flags); 2583 2584 // Give source modifier matching some assistance before obscuring a foldable 2585 // pattern. 2586 2587 // TODO: We can avoid the neg on the fract? The input sign to fract 2588 // shouldn't matter? 2589 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2590 2591 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2592 2593 Register Min = MRI.createGenericVirtualRegister(S64); 2594 2595 // We don't need to concern ourselves with the snan handling difference, so 2596 // use the one which will directly select. 2597 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2598 if (MFI->getMode().IEEE) 2599 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2600 else 2601 B.buildFMinNum(Min, Fract, Const, Flags); 2602 2603 Register CorrectedFract = Min; 2604 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2605 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2606 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2607 } 2608 2609 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2610 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2611 2612 MI.eraseFromParent(); 2613 return true; 2614 } 2615 2616 // Turn an illegal packed v2s16 build vector into bit operations. 2617 // TODO: This should probably be a bitcast action in LegalizerHelper. 
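// The two s16 sources are packed by merging them into an s32 and bitcasting
// back, roughly (illustrative MIR):
//   %packed:_(s32)    = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %packed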
2618 bool AMDGPULegalizerInfo::legalizeBuildVector( 2619 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2620 Register Dst = MI.getOperand(0).getReg(); 2621 const LLT S32 = LLT::scalar(32); 2622 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2623 2624 Register Src0 = MI.getOperand(1).getReg(); 2625 Register Src1 = MI.getOperand(2).getReg(); 2626 assert(MRI.getType(Src0) == LLT::scalar(16)); 2627 2628 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2629 B.buildBitcast(Dst, Merge); 2630 2631 MI.eraseFromParent(); 2632 return true; 2633 } 2634 2635 // Return the use branch instruction, otherwise null if the usage is invalid. 2636 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2637 MachineRegisterInfo &MRI, 2638 MachineInstr *&Br, 2639 MachineBasicBlock *&UncondBrTarget) { 2640 Register CondDef = MI.getOperand(0).getReg(); 2641 if (!MRI.hasOneNonDBGUse(CondDef)) 2642 return nullptr; 2643 2644 MachineBasicBlock *Parent = MI.getParent(); 2645 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2646 if (UseMI.getParent() != Parent || 2647 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2648 return nullptr; 2649 2650 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2651 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2652 if (Next == Parent->end()) { 2653 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2654 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2655 return nullptr; 2656 UncondBrTarget = &*NextMBB; 2657 } else { 2658 if (Next->getOpcode() != AMDGPU::G_BR) 2659 return nullptr; 2660 Br = &*Next; 2661 UncondBrTarget = Br->getOperand(0).getMBB(); 2662 } 2663 2664 return &UseMI; 2665 } 2666 2667 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2668 const ArgDescriptor *Arg, 2669 const TargetRegisterClass *ArgRC, 2670 LLT ArgTy) const { 2671 MCRegister SrcReg = Arg->getRegister(); 2672 assert(SrcReg.isPhysical() && "Physical register expected"); 2673 assert(DstReg.isVirtual() && "Virtual register expected"); 2674 2675 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2676 ArgTy); 2677 if (Arg->isMasked()) { 2678 // TODO: Should we try to emit this once in the entry block? 
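// A masked argument (e.g. one of the packed work-item/work-group IDs) is
// recovered as (LiveIn >> Shift) & (Mask >> Shift), where
// Shift = countTrailingZeros(Mask), matching the G_LSHR/G_AND built below.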
2679 const LLT S32 = LLT::scalar(32); 2680 const unsigned Mask = Arg->getMask(); 2681 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2682 2683 Register AndMaskSrc = LiveIn; 2684 2685 if (Shift != 0) { 2686 auto ShiftAmt = B.buildConstant(S32, Shift); 2687 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2688 } 2689 2690 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2691 } else { 2692 B.buildCopy(DstReg, LiveIn); 2693 } 2694 2695 return true; 2696 } 2697 2698 bool AMDGPULegalizerInfo::loadInputValue( 2699 Register DstReg, MachineIRBuilder &B, 2700 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2701 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2702 const ArgDescriptor *Arg; 2703 const TargetRegisterClass *ArgRC; 2704 LLT ArgTy; 2705 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2706 2707 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2708 return false; // TODO: Handle these 2709 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2710 } 2711 2712 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2713 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2714 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2715 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2716 return false; 2717 2718 MI.eraseFromParent(); 2719 return true; 2720 } 2721 2722 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2723 MachineRegisterInfo &MRI, 2724 MachineIRBuilder &B) const { 2725 Register Dst = MI.getOperand(0).getReg(); 2726 LLT DstTy = MRI.getType(Dst); 2727 LLT S16 = LLT::scalar(16); 2728 LLT S32 = LLT::scalar(32); 2729 LLT S64 = LLT::scalar(64); 2730 2731 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2732 return true; 2733 2734 if (DstTy == S16) 2735 return legalizeFDIV16(MI, MRI, B); 2736 if (DstTy == S32) 2737 return legalizeFDIV32(MI, MRI, B); 2738 if (DstTy == S64) 2739 return legalizeFDIV64(MI, MRI, B); 2740 2741 return false; 2742 } 2743 2744 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2745 Register DstReg, 2746 Register X, 2747 Register Y, 2748 bool IsDiv) const { 2749 const LLT S1 = LLT::scalar(1); 2750 const LLT S32 = LLT::scalar(32); 2751 2752 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2753 // algorithm used here. 2754 2755 // Initial estimate of inv(y). 2756 auto FloatY = B.buildUITOFP(S32, Y); 2757 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2758 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2759 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2760 auto Z = B.buildFPTOUI(S32, ScaledY); 2761 2762 // One round of UNR. 2763 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2764 auto NegYZ = B.buildMul(S32, NegY, Z); 2765 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2766 2767 // Quotient/remainder estimate. 2768 auto Q = B.buildUMulH(S32, X, Z); 2769 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2770 2771 // First quotient/remainder refinement. 2772 auto One = B.buildConstant(S32, 1); 2773 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2774 if (IsDiv) 2775 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2776 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2777 2778 // Second quotient/remainder refinement. 
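// As in the first refinement: if the remainder is still >= Y, subtract Y once
// more and (for division) increment the quotient, writing the final value
// directly into DstReg.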
2779 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2780 if (IsDiv) 2781 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2782 else 2783 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2784 } 2785 2786 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2787 MachineRegisterInfo &MRI, 2788 MachineIRBuilder &B) const { 2789 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2790 Register DstReg = MI.getOperand(0).getReg(); 2791 Register Num = MI.getOperand(1).getReg(); 2792 Register Den = MI.getOperand(2).getReg(); 2793 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2794 MI.eraseFromParent(); 2795 return true; 2796 } 2797 2798 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2799 // 2800 // Return lo, hi of result 2801 // 2802 // %cvt.lo = G_UITOFP Val.lo 2803 // %cvt.hi = G_UITOFP Val.hi 2804 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2805 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2806 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2807 // %mul2 = G_FMUL %mul1, 2**(-32) 2808 // %trunc = G_INTRINSIC_TRUNC %mul2 2809 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2810 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2811 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2812 Register Val) { 2813 const LLT S32 = LLT::scalar(32); 2814 auto Unmerge = B.buildUnmerge(S32, Val); 2815 2816 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2817 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2818 2819 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2820 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2821 2822 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2823 auto Mul1 = 2824 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2825 2826 // 2**(-32) 2827 auto Mul2 = 2828 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2829 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2830 2831 // -(2**32) 2832 auto Mad2 = B.buildFMAD(S32, Trunc, 2833 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2834 2835 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2836 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2837 2838 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2839 } 2840 2841 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2842 Register DstReg, 2843 Register Numer, 2844 Register Denom, 2845 bool IsDiv) const { 2846 const LLT S32 = LLT::scalar(32); 2847 const LLT S64 = LLT::scalar(64); 2848 const LLT S1 = LLT::scalar(1); 2849 Register RcpLo, RcpHi; 2850 2851 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2852 2853 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2854 2855 auto Zero64 = B.buildConstant(S64, 0); 2856 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2857 2858 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2859 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2860 2861 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2862 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2863 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2864 2865 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2866 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2867 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2868 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2869 2870 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2871 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2872 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2873 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2874 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2875 2876 auto Zero32 = 
B.buildConstant(S32, 0); 2877 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2878 auto Add2_HiC = 2879 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2880 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2881 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2882 2883 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2884 Register NumerLo = UnmergeNumer.getReg(0); 2885 Register NumerHi = UnmergeNumer.getReg(1); 2886 2887 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2888 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2889 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2890 Register Mul3_Lo = UnmergeMul3.getReg(0); 2891 Register Mul3_Hi = UnmergeMul3.getReg(1); 2892 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2893 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2894 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2895 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2896 2897 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2898 Register DenomLo = UnmergeDenom.getReg(0); 2899 Register DenomHi = UnmergeDenom.getReg(1); 2900 2901 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2902 auto C1 = B.buildSExt(S32, CmpHi); 2903 2904 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2905 auto C2 = B.buildSExt(S32, CmpLo); 2906 2907 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2908 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2909 2910 // TODO: Here and below portions of the code can be enclosed into if/endif. 2911 // Currently control flow is unconditional and we have 4 selects after 2912 // potential endif to substitute PHIs. 2913 2914 // if C3 != 0 ... 2915 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2916 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2917 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2918 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2919 2920 auto One64 = B.buildConstant(S64, 1); 2921 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2922 2923 auto C4 = 2924 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2925 auto C5 = 2926 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2927 auto C6 = B.buildSelect( 2928 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2929 2930 // if (C6 != 0) 2931 auto Add4 = B.buildAdd(S64, Add3, One64); 2932 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2933 2934 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2935 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2936 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2937 2938 // endif C6 2939 // endif C3 2940 2941 if (IsDiv) { 2942 auto Sel1 = B.buildSelect( 2943 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2944 B.buildSelect(DstReg, 2945 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2946 } else { 2947 auto Sel2 = B.buildSelect( 2948 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2949 B.buildSelect(DstReg, 2950 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2951 } 2952 } 2953 2954 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2955 MachineRegisterInfo &MRI, 2956 MachineIRBuilder &B) const { 2957 const LLT S64 = LLT::scalar(64); 2958 const LLT S32 = LLT::scalar(32); 2959 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2960 Register DstReg = MI.getOperand(0).getReg(); 2961 Register Num 
= MI.getOperand(1).getReg(); 2962 Register Den = MI.getOperand(2).getReg(); 2963 LLT Ty = MRI.getType(DstReg); 2964 2965 if (Ty == S32) 2966 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2967 else if (Ty == S64) 2968 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2969 else 2970 return false; 2971 2972 MI.eraseFromParent(); 2973 return true; 2974 2975 } 2976 2977 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2978 MachineRegisterInfo &MRI, 2979 MachineIRBuilder &B) const { 2980 const LLT S64 = LLT::scalar(64); 2981 const LLT S32 = LLT::scalar(32); 2982 2983 Register DstReg = MI.getOperand(0).getReg(); 2984 const LLT Ty = MRI.getType(DstReg); 2985 if (Ty != S32 && Ty != S64) 2986 return false; 2987 2988 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2989 2990 Register LHS = MI.getOperand(1).getReg(); 2991 Register RHS = MI.getOperand(2).getReg(); 2992 2993 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2994 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2995 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2996 2997 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2998 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2999 3000 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3001 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3002 3003 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3004 if (Ty == S32) 3005 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3006 else 3007 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3008 3009 Register Sign; 3010 if (IsDiv) 3011 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3012 else 3013 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3014 3015 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3016 B.buildSub(DstReg, UDivRem, Sign); 3017 3018 MI.eraseFromParent(); 3019 return true; 3020 } 3021 3022 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3023 MachineRegisterInfo &MRI, 3024 MachineIRBuilder &B) const { 3025 Register Res = MI.getOperand(0).getReg(); 3026 Register LHS = MI.getOperand(1).getReg(); 3027 Register RHS = MI.getOperand(2).getReg(); 3028 3029 uint16_t Flags = MI.getFlags(); 3030 3031 LLT ResTy = MRI.getType(Res); 3032 LLT S32 = LLT::scalar(32); 3033 LLT S64 = LLT::scalar(64); 3034 3035 const MachineFunction &MF = B.getMF(); 3036 bool Unsafe = 3037 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 3038 3039 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 3040 return false; 3041 3042 if (!Unsafe && ResTy == S32 && 3043 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 3044 return false; 3045 3046 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3047 // 1 / x -> RCP(x) 3048 if (CLHS->isExactlyValue(1.0)) { 3049 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3050 .addUse(RHS) 3051 .setMIFlags(Flags); 3052 3053 MI.eraseFromParent(); 3054 return true; 3055 } 3056 3057 // -1 / x -> RCP( FNEG(x) ) 3058 if (CLHS->isExactlyValue(-1.0)) { 3059 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3060 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3061 .addUse(FNeg.getReg(0)) 3062 .setMIFlags(Flags); 3063 3064 MI.eraseFromParent(); 3065 return true; 3066 } 3067 } 3068 3069 // x / y -> x * (1.0 / y) 3070 if (Unsafe) { 3071 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3072 .addUse(RHS) 3073 .setMIFlags(Flags); 3074 B.buildFMul(Res, LHS, RCP, Flags); 3075 3076 MI.eraseFromParent(); 3077 return true; 3078 } 3079 3080 return false; 3081 } 3082 3083 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3084 MachineRegisterInfo &MRI, 3085 MachineIRBuilder &B) const { 3086 Register Res = MI.getOperand(0).getReg(); 3087 Register LHS = MI.getOperand(1).getReg(); 3088 Register RHS = MI.getOperand(2).getReg(); 3089 3090 uint16_t Flags = MI.getFlags(); 3091 3092 LLT S16 = LLT::scalar(16); 3093 LLT S32 = LLT::scalar(32); 3094 3095 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3096 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3097 3098 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3099 .addUse(RHSExt.getReg(0)) 3100 .setMIFlags(Flags); 3101 3102 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3103 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3104 3105 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3106 .addUse(RDst.getReg(0)) 3107 .addUse(RHS) 3108 .addUse(LHS) 3109 .setMIFlags(Flags); 3110 3111 MI.eraseFromParent(); 3112 return true; 3113 } 3114 3115 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3116 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3117 static void toggleSPDenormMode(bool Enable, 3118 MachineIRBuilder &B, 3119 const GCNSubtarget &ST, 3120 AMDGPU::SIModeRegisterDefaults Mode) { 3121 // Set SP denorm mode to this value. 3122 unsigned SPDenormMode = 3123 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3124 3125 if (ST.hasDenormModeInst()) { 3126 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3127 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3128 3129 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3130 B.buildInstr(AMDGPU::S_DENORM_MODE) 3131 .addImm(NewDenormModeValue); 3132 3133 } else { 3134 // Select FP32 bit field in mode register. 3135 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3136 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3137 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3138 3139 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3140 .addImm(SPDenormMode) 3141 .addImm(SPDenormModeBitField); 3142 } 3143 } 3144 3145 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3146 MachineRegisterInfo &MRI, 3147 MachineIRBuilder &B) const { 3148 Register Res = MI.getOperand(0).getReg(); 3149 Register LHS = MI.getOperand(1).getReg(); 3150 Register RHS = MI.getOperand(2).getReg(); 3151 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3152 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3153 3154 uint16_t Flags = MI.getFlags(); 3155 3156 LLT S32 = LLT::scalar(32); 3157 LLT S1 = LLT::scalar(1); 3158 3159 auto One = B.buildFConstant(S32, 1.0f); 3160 3161 auto DenominatorScaled = 3162 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3163 .addUse(LHS) 3164 .addUse(RHS) 3165 .addImm(0) 3166 .setMIFlags(Flags); 3167 auto NumeratorScaled = 3168 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3169 .addUse(LHS) 3170 .addUse(RHS) 3171 .addImm(1) 3172 .setMIFlags(Flags); 3173 3174 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3175 .addUse(DenominatorScaled.getReg(0)) 3176 .setMIFlags(Flags); 3177 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3178 3179 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3180 // aren't modeled as reading it. 
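// If FP32 denormals are flushed by default, temporarily enable them around
// the FMA-based refinement sequence and restore the original mode afterwards.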
3181 if (!Mode.allFP32Denormals()) 3182 toggleSPDenormMode(true, B, ST, Mode); 3183 3184 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3185 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3186 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3187 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3188 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3189 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3190 3191 if (!Mode.allFP32Denormals()) 3192 toggleSPDenormMode(false, B, ST, Mode); 3193 3194 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3195 .addUse(Fma4.getReg(0)) 3196 .addUse(Fma1.getReg(0)) 3197 .addUse(Fma3.getReg(0)) 3198 .addUse(NumeratorScaled.getReg(1)) 3199 .setMIFlags(Flags); 3200 3201 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3202 .addUse(Fmas.getReg(0)) 3203 .addUse(RHS) 3204 .addUse(LHS) 3205 .setMIFlags(Flags); 3206 3207 MI.eraseFromParent(); 3208 return true; 3209 } 3210 3211 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3212 MachineRegisterInfo &MRI, 3213 MachineIRBuilder &B) const { 3214 Register Res = MI.getOperand(0).getReg(); 3215 Register LHS = MI.getOperand(1).getReg(); 3216 Register RHS = MI.getOperand(2).getReg(); 3217 3218 uint16_t Flags = MI.getFlags(); 3219 3220 LLT S64 = LLT::scalar(64); 3221 LLT S1 = LLT::scalar(1); 3222 3223 auto One = B.buildFConstant(S64, 1.0); 3224 3225 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3226 .addUse(LHS) 3227 .addUse(RHS) 3228 .addImm(0) 3229 .setMIFlags(Flags); 3230 3231 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3232 3233 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3234 .addUse(DivScale0.getReg(0)) 3235 .setMIFlags(Flags); 3236 3237 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3238 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3239 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3240 3241 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3242 .addUse(LHS) 3243 .addUse(RHS) 3244 .addImm(1) 3245 .setMIFlags(Flags); 3246 3247 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3248 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3249 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3250 3251 Register Scale; 3252 if (!ST.hasUsableDivScaleConditionOutput()) { 3253 // Workaround a hardware bug on SI where the condition output from div_scale 3254 // is not usable. 
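// Recompute the scale condition manually: compare the high 32 bits of the
// numerator/denominator against the high halves of the div_scale results and
// xor the two comparisons, standing in for the unusable condition output.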
3255 3256 LLT S32 = LLT::scalar(32); 3257 3258 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3259 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3260 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3261 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3262 3263 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3264 Scale1Unmerge.getReg(1)); 3265 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3266 Scale0Unmerge.getReg(1)); 3267 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3268 } else { 3269 Scale = DivScale1.getReg(1); 3270 } 3271 3272 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3273 .addUse(Fma4.getReg(0)) 3274 .addUse(Fma3.getReg(0)) 3275 .addUse(Mul.getReg(0)) 3276 .addUse(Scale) 3277 .setMIFlags(Flags); 3278 3279 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3280 .addUse(Fmas.getReg(0)) 3281 .addUse(RHS) 3282 .addUse(LHS) 3283 .setMIFlags(Flags); 3284 3285 MI.eraseFromParent(); 3286 return true; 3287 } 3288 3289 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3290 MachineRegisterInfo &MRI, 3291 MachineIRBuilder &B) const { 3292 Register Res = MI.getOperand(0).getReg(); 3293 Register LHS = MI.getOperand(2).getReg(); 3294 Register RHS = MI.getOperand(3).getReg(); 3295 uint16_t Flags = MI.getFlags(); 3296 3297 LLT S32 = LLT::scalar(32); 3298 LLT S1 = LLT::scalar(1); 3299 3300 auto Abs = B.buildFAbs(S32, RHS, Flags); 3301 const APFloat C0Val(1.0f); 3302 3303 auto C0 = B.buildConstant(S32, 0x6f800000); 3304 auto C1 = B.buildConstant(S32, 0x2f800000); 3305 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3306 3307 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3308 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3309 3310 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3311 3312 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3313 .addUse(Mul0.getReg(0)) 3314 .setMIFlags(Flags); 3315 3316 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3317 3318 B.buildFMul(Res, Sel, Mul1, Flags); 3319 3320 MI.eraseFromParent(); 3321 return true; 3322 } 3323 3324 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3325 // FIXME: Why do we handle this one but not other removed instructions? 3326 // 3327 // Reciprocal square root. The clamp prevents infinite results, clamping 3328 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3329 // +-max_float. 3330 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3331 MachineRegisterInfo &MRI, 3332 MachineIRBuilder &B) const { 3333 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3334 return true; 3335 3336 Register Dst = MI.getOperand(0).getReg(); 3337 Register Src = MI.getOperand(2).getReg(); 3338 auto Flags = MI.getFlags(); 3339 3340 LLT Ty = MRI.getType(Dst); 3341 3342 const fltSemantics *FltSemantics; 3343 if (Ty == LLT::scalar(32)) 3344 FltSemantics = &APFloat::IEEEsingle(); 3345 else if (Ty == LLT::scalar(64)) 3346 FltSemantics = &APFloat::IEEEdouble(); 3347 else 3348 return false; 3349 3350 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3351 .addUse(Src) 3352 .setMIFlags(Flags); 3353 3354 // We don't need to concern ourselves with the snan handling difference, since 3355 // the rsq quieted (or not) so use the one which will directly select. 
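// i.e. clamp(rsq(x)) = fmax(fmin(rsq(x), +max_float), -max_float), using the
// IEEE or non-IEEE min/max variants depending on the function's FP mode.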
3356 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3357 const bool UseIEEE = MFI->getMode().IEEE; 3358 3359 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3360 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3361 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3362 3363 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3364 3365 if (UseIEEE) 3366 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3367 else 3368 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3369 MI.eraseFromParent(); 3370 return true; 3371 } 3372 3373 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3374 switch (IID) { 3375 case Intrinsic::amdgcn_ds_fadd: 3376 return AMDGPU::G_ATOMICRMW_FADD; 3377 case Intrinsic::amdgcn_ds_fmin: 3378 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3379 case Intrinsic::amdgcn_ds_fmax: 3380 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3381 default: 3382 llvm_unreachable("not a DS FP intrinsic"); 3383 } 3384 } 3385 3386 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3387 MachineInstr &MI, 3388 Intrinsic::ID IID) const { 3389 GISelChangeObserver &Observer = Helper.Observer; 3390 Observer.changingInstr(MI); 3391 3392 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3393 3394 // The remaining operands were used to set fields in the MemOperand on 3395 // construction. 3396 for (int I = 6; I > 3; --I) 3397 MI.RemoveOperand(I); 3398 3399 MI.RemoveOperand(1); // Remove the intrinsic ID. 3400 Observer.changedInstr(MI); 3401 return true; 3402 } 3403 3404 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3405 MachineRegisterInfo &MRI, 3406 MachineIRBuilder &B) const { 3407 uint64_t Offset = 3408 ST.getTargetLowering()->getImplicitParameterOffset( 3409 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3410 LLT DstTy = MRI.getType(DstReg); 3411 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3412 3413 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3414 if (!loadInputValue(KernargPtrReg, B, 3415 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3416 return false; 3417 3418 // FIXME: This should be nuw 3419 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3420 return true; 3421 } 3422 3423 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3424 MachineRegisterInfo &MRI, 3425 MachineIRBuilder &B) const { 3426 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3427 if (!MFI->isEntryFunction()) { 3428 return legalizePreloadedArgIntrin(MI, MRI, B, 3429 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3430 } 3431 3432 Register DstReg = MI.getOperand(0).getReg(); 3433 if (!getImplicitArgPtr(DstReg, MRI, B)) 3434 return false; 3435 3436 MI.eraseFromParent(); 3437 return true; 3438 } 3439 3440 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3441 MachineRegisterInfo &MRI, 3442 MachineIRBuilder &B, 3443 unsigned AddrSpace) const { 3444 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3445 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3446 Register Hi32 = Unmerge.getReg(1); 3447 3448 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3449 MI.eraseFromParent(); 3450 return true; 3451 } 3452 3453 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3454 // offset (the offset that is included in bounds checking and swizzling, to be 3455 // split between the instruction's voffset and immoffset fields) 
and soffset 3456 // (the offset that is excluded from bounds checking and swizzling, to go in 3457 // the instruction's soffset field). This function takes the first kind of 3458 // offset and figures out how to split it between voffset and immoffset. 3459 std::tuple<Register, unsigned, unsigned> 3460 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3461 Register OrigOffset) const { 3462 const unsigned MaxImm = 4095; 3463 Register BaseReg; 3464 unsigned TotalConstOffset; 3465 MachineInstr *OffsetDef; 3466 const LLT S32 = LLT::scalar(32); 3467 3468 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3469 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3470 3471 unsigned ImmOffset = TotalConstOffset; 3472 3473 // If the immediate value is too big for the immoffset field, put the value 3474 // and -4096 into the immoffset field so that the value that is copied/added 3475 // for the voffset field is a multiple of 4096, and it stands more chance 3476 // of being CSEd with the copy/add for another similar load/store. 3477 // However, do not do that rounding down to a multiple of 4096 if that is a 3478 // negative number, as it appears to be illegal to have a negative offset 3479 // in the vgpr, even if adding the immediate offset makes it positive. 3480 unsigned Overflow = ImmOffset & ~MaxImm; 3481 ImmOffset -= Overflow; 3482 if ((int32_t)Overflow < 0) { 3483 Overflow += ImmOffset; 3484 ImmOffset = 0; 3485 } 3486 3487 if (Overflow != 0) { 3488 if (!BaseReg) { 3489 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3490 } else { 3491 auto OverflowVal = B.buildConstant(S32, Overflow); 3492 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3493 } 3494 } 3495 3496 if (!BaseReg) 3497 BaseReg = B.buildConstant(S32, 0).getReg(0); 3498 3499 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3500 } 3501 3502 /// Handle register layout difference for f16 images for some subtargets. 3503 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3504 MachineRegisterInfo &MRI, 3505 Register Reg) const { 3506 if (!ST.hasUnpackedD16VMem()) 3507 return Reg; 3508 3509 const LLT S16 = LLT::scalar(16); 3510 const LLT S32 = LLT::scalar(32); 3511 LLT StoreVT = MRI.getType(Reg); 3512 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3513 3514 auto Unmerge = B.buildUnmerge(S16, Reg); 3515 3516 SmallVector<Register, 4> WideRegs; 3517 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3518 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3519 3520 int NumElts = StoreVT.getNumElements(); 3521 3522 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3523 } 3524 3525 Register AMDGPULegalizerInfo::fixStoreSourceType( 3526 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3527 MachineRegisterInfo *MRI = B.getMRI(); 3528 LLT Ty = MRI->getType(VData); 3529 3530 const LLT S16 = LLT::scalar(16); 3531 3532 // Fixup illegal register types for i8 stores. 
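// For example, an s8 or s16 store value is any-extended to s32 here so the
// source register has a legal 32-bit type; legalizeBufferStore then picks a
// byte/short store opcode based on the memory size, so the extra high bits
// are never written to memory.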
3533 if (Ty == LLT::scalar(8) || Ty == S16) { 3534 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3535 return AnyExt; 3536 } 3537 3538 if (Ty.isVector()) { 3539 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3540 if (IsFormat) 3541 return handleD16VData(B, *MRI, VData); 3542 } 3543 } 3544 3545 return VData; 3546 } 3547 3548 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3549 MachineRegisterInfo &MRI, 3550 MachineIRBuilder &B, 3551 bool IsTyped, 3552 bool IsFormat) const { 3553 Register VData = MI.getOperand(1).getReg(); 3554 LLT Ty = MRI.getType(VData); 3555 LLT EltTy = Ty.getScalarType(); 3556 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3557 const LLT S32 = LLT::scalar(32); 3558 3559 VData = fixStoreSourceType(B, VData, IsFormat); 3560 Register RSrc = MI.getOperand(2).getReg(); 3561 3562 MachineMemOperand *MMO = *MI.memoperands_begin(); 3563 const int MemSize = MMO->getSize(); 3564 3565 unsigned ImmOffset; 3566 unsigned TotalOffset; 3567 3568 // The typed intrinsics add an immediate after the registers. 3569 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3570 3571 // The struct intrinsic variants add one additional operand over raw. 3572 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3573 Register VIndex; 3574 int OpOffset = 0; 3575 if (HasVIndex) { 3576 VIndex = MI.getOperand(3).getReg(); 3577 OpOffset = 1; 3578 } 3579 3580 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3581 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3582 3583 unsigned Format = 0; 3584 if (IsTyped) { 3585 Format = MI.getOperand(5 + OpOffset).getImm(); 3586 ++OpOffset; 3587 } 3588 3589 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3590 3591 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3592 if (TotalOffset != 0) 3593 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3594 3595 unsigned Opc; 3596 if (IsTyped) { 3597 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3598 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3599 } else if (IsFormat) { 3600 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3601 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3602 } else { 3603 switch (MemSize) { 3604 case 1: 3605 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3606 break; 3607 case 2: 3608 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3609 break; 3610 default: 3611 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3612 break; 3613 } 3614 } 3615 3616 if (!VIndex) 3617 VIndex = B.buildConstant(S32, 0).getReg(0); 3618 3619 auto MIB = B.buildInstr(Opc) 3620 .addUse(VData) // vdata 3621 .addUse(RSrc) // rsrc 3622 .addUse(VIndex) // vindex 3623 .addUse(VOffset) // voffset 3624 .addUse(SOffset) // soffset 3625 .addImm(ImmOffset); // offset(imm) 3626 3627 if (IsTyped) 3628 MIB.addImm(Format); 3629 3630 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3631 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3632 .addMemOperand(MMO); 3633 3634 MI.eraseFromParent(); 3635 return true; 3636 } 3637 3638 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3639 MachineRegisterInfo &MRI, 3640 MachineIRBuilder &B, 3641 bool IsFormat, 3642 bool IsTyped) const { 3643 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
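// Operand layout, roughly: the raw form is (dst, intrinsic id, rsrc,
// voffset, soffset, aux), the struct form inserts a vindex after rsrc, and
// the typed (tbuffer) form adds a format immediate before aux. That is why
// NumVIndexOps below is 7 for plain loads and 8 for typed ones.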
3644 MachineMemOperand *MMO = *MI.memoperands_begin(); 3645 const int MemSize = MMO->getSize(); 3646 const LLT S32 = LLT::scalar(32); 3647 3648 Register Dst = MI.getOperand(0).getReg(); 3649 Register RSrc = MI.getOperand(2).getReg(); 3650 3651 // The typed intrinsics add an immediate after the registers. 3652 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3653 3654 // The struct intrinsic variants add one additional operand over raw. 3655 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3656 Register VIndex; 3657 int OpOffset = 0; 3658 if (HasVIndex) { 3659 VIndex = MI.getOperand(3).getReg(); 3660 OpOffset = 1; 3661 } 3662 3663 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3664 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3665 3666 unsigned Format = 0; 3667 if (IsTyped) { 3668 Format = MI.getOperand(5 + OpOffset).getImm(); 3669 ++OpOffset; 3670 } 3671 3672 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3673 unsigned ImmOffset; 3674 unsigned TotalOffset; 3675 3676 LLT Ty = MRI.getType(Dst); 3677 LLT EltTy = Ty.getScalarType(); 3678 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3679 const bool Unpacked = ST.hasUnpackedD16VMem(); 3680 3681 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3682 if (TotalOffset != 0) 3683 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3684 3685 unsigned Opc; 3686 3687 if (IsTyped) { 3688 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3689 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3690 } else if (IsFormat) { 3691 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3692 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3693 } else { 3694 switch (MemSize) { 3695 case 1: 3696 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3697 break; 3698 case 2: 3699 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3700 break; 3701 default: 3702 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3703 break; 3704 } 3705 } 3706 3707 Register LoadDstReg; 3708 3709 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3710 LLT UnpackedTy = Ty.changeElementSize(32); 3711 3712 if (IsExtLoad) 3713 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3714 else if (Unpacked && IsD16 && Ty.isVector()) 3715 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3716 else 3717 LoadDstReg = Dst; 3718 3719 if (!VIndex) 3720 VIndex = B.buildConstant(S32, 0).getReg(0); 3721 3722 auto MIB = B.buildInstr(Opc) 3723 .addDef(LoadDstReg) // vdata 3724 .addUse(RSrc) // rsrc 3725 .addUse(VIndex) // vindex 3726 .addUse(VOffset) // voffset 3727 .addUse(SOffset) // soffset 3728 .addImm(ImmOffset); // offset(imm) 3729 3730 if (IsTyped) 3731 MIB.addImm(Format); 3732 3733 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3734 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3735 .addMemOperand(MMO); 3736 3737 if (LoadDstReg != Dst) { 3738 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3739 3740 // Widen result for extending loads was widened. 
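// e.g. an s16 result loaded through BUFFER_LOAD_USHORT comes back in an s32
// register (LoadDstReg) and is truncated back to the original type here.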
3741 if (IsExtLoad) 3742 B.buildTrunc(Dst, LoadDstReg); 3743 else { 3744 // Repack to original 16-bit vector result 3745 // FIXME: G_TRUNC should work, but legalization currently fails 3746 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3747 SmallVector<Register, 4> Repack; 3748 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3749 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3750 B.buildMerge(Dst, Repack); 3751 } 3752 } 3753 3754 MI.eraseFromParent(); 3755 return true; 3756 } 3757 3758 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3759 MachineIRBuilder &B, 3760 bool IsInc) const { 3761 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3762 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3763 B.buildInstr(Opc) 3764 .addDef(MI.getOperand(0).getReg()) 3765 .addUse(MI.getOperand(2).getReg()) 3766 .addUse(MI.getOperand(3).getReg()) 3767 .cloneMemRefs(MI); 3768 MI.eraseFromParent(); 3769 return true; 3770 } 3771 3772 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3773 switch (IntrID) { 3774 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3775 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3776 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3777 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3778 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3779 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3780 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3781 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3782 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3783 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3784 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3785 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3786 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3787 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3788 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3789 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3790 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3791 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3792 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3793 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3794 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3795 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3796 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3797 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3798 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3799 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3800 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3801 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3802 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3803 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3804 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3805 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3806 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3807 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3808 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3809 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3810 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3811 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3812 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3813 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3814 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3815 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3816 default: 3817 llvm_unreachable("unhandled atomic opcode"); 3818 } 3819 } 3820 3821 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3822 MachineIRBuilder &B, 3823 Intrinsic::ID IID) const { 3824 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3825 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3826 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3827 3828 Register Dst; 3829 3830 int OpOffset = 0; 3831 if (HasReturn) { 3832 // A few FP atomics do not support return values. 3833 Dst = MI.getOperand(0).getReg(); 3834 } else { 3835 OpOffset = -1; 3836 } 3837 3838 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3839 Register CmpVal; 3840 3841 if (IsCmpSwap) { 3842 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3843 ++OpOffset; 3844 } 3845 3846 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3847 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3848 3849 // The struct intrinsic variants add one additional operand over raw. 3850 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3851 Register VIndex; 3852 if (HasVIndex) { 3853 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3854 ++OpOffset; 3855 } 3856 3857 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3858 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3859 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3860 3861 MachineMemOperand *MMO = *MI.memoperands_begin(); 3862 3863 unsigned ImmOffset; 3864 unsigned TotalOffset; 3865 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3866 if (TotalOffset != 0) 3867 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3868 3869 if (!VIndex) 3870 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3871 3872 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3873 3874 if (HasReturn) 3875 MIB.addDef(Dst); 3876 3877 MIB.addUse(VData); // vdata 3878 3879 if (IsCmpSwap) 3880 MIB.addReg(CmpVal); 3881 3882 MIB.addUse(RSrc) // rsrc 3883 .addUse(VIndex) // vindex 3884 .addUse(VOffset) // voffset 3885 .addUse(SOffset) // soffset 3886 .addImm(ImmOffset) // offset(imm) 3887 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3888 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3889 .addMemOperand(MMO); 3890 3891 MI.eraseFromParent(); 3892 return true; 3893 } 3894 3895 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3896 /// vector with s16 typed elements. 3897 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3898 SmallVectorImpl<Register> &PackedAddrs, 3899 int AddrIdx, int DimIdx, int EndIdx, 3900 int NumGradients) { 3901 const LLT S16 = LLT::scalar(16); 3902 const LLT V2S16 = LLT::vector(2, 16); 3903 3904 for (int I = AddrIdx; I < EndIdx; ++I) { 3905 MachineOperand &SrcOp = MI.getOperand(I); 3906 if (!SrcOp.isReg()) 3907 continue; // _L to _LZ may have eliminated this. 3908 3909 Register AddrReg = SrcOp.getReg(); 3910 3911 if (I < DimIdx) { 3912 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3913 PackedAddrs.push_back(AddrReg); 3914 } else { 3915 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3916 // derivatives dx/dh and dx/dv are packed with undef. 
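// e.g. a 2D sample with 16-bit coordinates packs (s, t) into one v2s16; a
// 3D sample packs (s, t) and then pairs the trailing r with undef.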
3917 if (((I + 1) >= EndIdx) ||
3918 ((NumGradients / 2) % 2 == 1 &&
3919 (I == DimIdx + (NumGradients / 2) - 1 ||
3920 I == DimIdx + NumGradients - 1)) ||
3921 // Check for _L to _LZ optimization
3922 !MI.getOperand(I + 1).isReg()) {
3923 PackedAddrs.push_back(
3924 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3925 .getReg(0));
3926 } else {
3927 PackedAddrs.push_back(
3928 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3929 .getReg(0));
3930 ++I;
3931 }
3932 }
3933 }
3934 }
3935
3936 /// Convert from separate vaddr components to a single vector address register,
3937 /// and replace the remaining operands with $noreg.
3938 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3939 int DimIdx, int NumVAddrs) {
3940 const LLT S32 = LLT::scalar(32);
3941
3942 SmallVector<Register, 8> AddrRegs;
3943 for (int I = 0; I != NumVAddrs; ++I) {
3944 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3945 if (SrcOp.isReg()) {
3946 AddrRegs.push_back(SrcOp.getReg());
3947 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3948 }
3949 }
3950
3951 int NumAddrRegs = AddrRegs.size();
3952 if (NumAddrRegs != 1) {
3953 // Round up to 8 elements for v5-v7
3954 // FIXME: Missing intermediate sized register classes and instructions.
3955 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3956 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3957 auto Undef = B.buildUndef(S32);
3958 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3959 NumAddrRegs = RoundedNumRegs;
3960 }
3961
3962 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3963 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3964 }
3965
3966 for (int I = 1; I != NumVAddrs; ++I) {
3967 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3968 if (SrcOp.isReg())
3969 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3970 }
3971 }
3972
3973 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3974 ///
3975 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3976 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3977 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3978 /// registers.
3979 ///
3980 /// We don't want to directly select image instructions just yet, but also want
3981 /// to expose all register repacking to the legalizer/combiners. We also don't
3982 /// want a selected instruction entering RegBankSelect. In order to avoid
3983 /// defining a multitude of intermediate image instructions, directly hack on
3984 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3985 /// now unnecessary arguments with $noreg.
3986 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3987 MachineInstr &MI, MachineIRBuilder &B,
3988 GISelChangeObserver &Observer,
3989 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3990
3991 const int NumDefs = MI.getNumExplicitDefs();
3992 bool IsTFE = NumDefs == 2;
3993 // We are only processing the operands of d16 image operations on subtargets
3994 // that use the unpacked register layout, or need to repack the TFE result.
3995
3996 // TODO: Do we need to guard against already legalized intrinsics?
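// As a concrete example of the repacking (a sketch, not an exhaustive list):
// a d16 load returning <4 x s16> on an unpacked-d16 subtarget is rewritten to
// produce <4 x s32>, and the components are truncated and repacked further
// down; packed-d16 subtargets instead keep the result in v2s16 pieces.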
3997 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3998 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3999 4000 MachineRegisterInfo *MRI = B.getMRI(); 4001 const LLT S32 = LLT::scalar(32); 4002 const LLT S16 = LLT::scalar(16); 4003 const LLT V2S16 = LLT::vector(2, 16); 4004 4005 // Index of first address argument 4006 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 4007 4008 int NumVAddrs, NumGradients; 4009 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 4010 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 4011 getDMaskIdx(BaseOpcode, NumDefs); 4012 unsigned DMask = 0; 4013 4014 // Check for 16 bit addresses and pack if true. 4015 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 4016 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 4017 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 4018 const bool IsG16 = GradTy == S16; 4019 const bool IsA16 = AddrTy == S16; 4020 4021 int DMaskLanes = 0; 4022 if (!BaseOpcode->Atomic) { 4023 DMask = MI.getOperand(DMaskIdx).getImm(); 4024 if (BaseOpcode->Gather4) { 4025 DMaskLanes = 4; 4026 } else if (DMask != 0) { 4027 DMaskLanes = countPopulation(DMask); 4028 } else if (!IsTFE && !BaseOpcode->Store) { 4029 // If dmask is 0, this is a no-op load. This can be eliminated. 4030 B.buildUndef(MI.getOperand(0)); 4031 MI.eraseFromParent(); 4032 return true; 4033 } 4034 } 4035 4036 Observer.changingInstr(MI); 4037 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4038 4039 unsigned NewOpcode = NumDefs == 0 ? 4040 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4041 4042 // Track that we legalized this 4043 MI.setDesc(B.getTII().get(NewOpcode)); 4044 4045 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4046 // dmask to be at least 1 otherwise the instruction will fail 4047 if (IsTFE && DMask == 0) { 4048 DMask = 0x1; 4049 DMaskLanes = 1; 4050 MI.getOperand(DMaskIdx).setImm(DMask); 4051 } 4052 4053 if (BaseOpcode->Atomic) { 4054 Register VData0 = MI.getOperand(2).getReg(); 4055 LLT Ty = MRI->getType(VData0); 4056 4057 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4058 if (Ty.isVector()) 4059 return false; 4060 4061 if (BaseOpcode->AtomicX2) { 4062 Register VData1 = MI.getOperand(3).getReg(); 4063 // The two values are packed in one register. 4064 LLT PackedTy = LLT::vector(2, Ty); 4065 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4066 MI.getOperand(2).setReg(Concat.getReg(0)); 4067 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4068 } 4069 } 4070 4071 int CorrectedNumVAddrs = NumVAddrs; 4072 4073 // Optimize _L to _LZ when _L is zero 4074 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4075 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 4076 const ConstantFP *ConstantLod; 4077 const int LodIdx = AddrIdx + NumVAddrs - 1; 4078 4079 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 4080 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4081 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4082 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 4083 LZMappingInfo->LZ, ImageDimIntr->Dim); 4084 4085 // The starting indexes should remain in the same place. 
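// e.g. an image.sample.l with a constant lod of 0 becomes image.sample.lz
// with the lod operand dropped, so the address operand count shrinks by one.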
4086 --NumVAddrs; 4087 --CorrectedNumVAddrs; 4088 4089 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 4090 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 4091 MI.RemoveOperand(LodIdx); 4092 } 4093 } 4094 } 4095 4096 // Optimize _mip away, when 'lod' is zero 4097 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 4098 int64_t ConstantLod; 4099 const int LodIdx = AddrIdx + NumVAddrs - 1; 4100 4101 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 4102 if (ConstantLod == 0) { 4103 // TODO: Change intrinsic opcode and remove operand instead or replacing 4104 // it with 0, as the _L to _LZ handling is done above. 4105 MI.getOperand(LodIdx).ChangeToImmediate(0); 4106 --CorrectedNumVAddrs; 4107 } 4108 } 4109 } 4110 4111 // Rewrite the addressing register layout before doing anything else. 4112 if (IsA16 || IsG16) { 4113 if (IsA16) { 4114 // Target must support the feature and gradients need to be 16 bit too 4115 if (!ST.hasA16() || !IsG16) 4116 return false; 4117 } else if (!ST.hasG16()) 4118 return false; 4119 4120 if (NumVAddrs > 1) { 4121 SmallVector<Register, 4> PackedRegs; 4122 // Don't compress addresses for G16 4123 const int PackEndIdx = 4124 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4125 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4126 PackEndIdx, NumGradients); 4127 4128 if (!IsA16) { 4129 // Add uncompressed address 4130 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4131 int AddrReg = MI.getOperand(I).getReg(); 4132 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4133 PackedRegs.push_back(AddrReg); 4134 } 4135 } 4136 4137 // See also below in the non-a16 branch 4138 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4139 4140 if (!UseNSA && PackedRegs.size() > 1) { 4141 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4142 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4143 PackedRegs[0] = Concat.getReg(0); 4144 PackedRegs.resize(1); 4145 } 4146 4147 const int NumPacked = PackedRegs.size(); 4148 for (int I = 0; I != NumVAddrs; ++I) { 4149 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4150 if (!SrcOp.isReg()) { 4151 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4152 continue; 4153 } 4154 4155 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4156 4157 if (I < NumPacked) 4158 SrcOp.setReg(PackedRegs[I]); 4159 else 4160 SrcOp.setReg(AMDGPU::NoRegister); 4161 } 4162 } 4163 } else { 4164 // If the register allocator cannot place the address registers contiguously 4165 // without introducing moves, then using the non-sequential address encoding 4166 // is always preferable, since it saves VALU instructions and is usually a 4167 // wash in terms of code size or even better. 4168 // 4169 // However, we currently have no way of hinting to the register allocator 4170 // that MIMG addresses should be placed contiguously when it is possible to 4171 // do so, so force non-NSA for the common 2-address case as a heuristic. 4172 // 4173 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4174 // allocation when possible. 4175 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4176 4177 if (!UseNSA && NumVAddrs > 1) 4178 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4179 } 4180 4181 int Flags = 0; 4182 if (IsA16) 4183 Flags |= 1; 4184 if (IsG16) 4185 Flags |= 2; 4186 MI.addOperand(MachineOperand::CreateImm(Flags)); 4187 4188 if (BaseOpcode->Store) { // No TFE for stores? 
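// Only 16-bit vector store data needs repacking, and only on subtargets
// using the unpacked d16 layout: handleD16VData any-extends each s16
// element to s32 (e.g. <4 x s16> becomes <4 x s32>).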
4189 // TODO: Handle dmask trim
4190 Register VData = MI.getOperand(1).getReg();
4191 LLT Ty = MRI->getType(VData);
4192 if (!Ty.isVector() || Ty.getElementType() != S16)
4193 return true;
4194
4195 Register RepackedReg = handleD16VData(B, *MRI, VData);
4196 if (RepackedReg != VData) {
4197 MI.getOperand(1).setReg(RepackedReg);
4198 }
4199
4200 return true;
4201 }
4202
4203 Register DstReg = MI.getOperand(0).getReg();
4204 LLT Ty = MRI->getType(DstReg);
4205 const LLT EltTy = Ty.getScalarType();
4206 const bool IsD16 = Ty.getScalarType() == S16;
4207 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4208
4209 // Confirm that the return type is large enough for the dmask specified
4210 if (NumElts < DMaskLanes)
4211 return false;
4212
4213 if (NumElts > 4 || DMaskLanes > 4)
4214 return false;
4215
4216 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4217 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4218
4219 // The raw dword aligned data component of the load. The only legal cases
4220 // where this matters should be when using the packed D16 format, for
4221 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4222 LLT RoundedTy;
4223
4224 // S32 vector to cover all data, plus TFE result element.
4225 LLT TFETy;
4226
4227 // Register type to use for each loaded component. Will be S32 or V2S16.
4228 LLT RegTy;
4229
4230 if (IsD16 && ST.hasUnpackedD16VMem()) {
4231 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4232 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4233 RegTy = S32;
4234 } else {
4235 unsigned EltSize = EltTy.getSizeInBits();
4236 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4237 unsigned RoundedSize = 32 * RoundedElts;
4238 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4239 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4240 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4241 }
4242
4243 // The return type does not need adjustment.
4244 // TODO: Should we change s16 case to s32 or <2 x s16>?
4245 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4246 return true;
4247
4248 Register Dst1Reg;
4249
4250 // Insert after the instruction.
4251 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4252
4253 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4254 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4255 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4256 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4257
4258 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4259
4260 MI.getOperand(0).setReg(NewResultReg);
4261
4262 // In the IR, TFE is supposed to be used with a 2 element struct return
4263 // type. The instruction really returns these two values in one contiguous
4264 // register, with one additional dword beyond the loaded data. Rewrite the
4265 // return type to use a single register result.
4266
4267 if (IsTFE) {
4268 Dst1Reg = MI.getOperand(1).getReg();
4269 if (MRI->getType(Dst1Reg) != S32)
4270 return false;
4271
4272 // TODO: Make sure the TFE operand bit is set.
4273 MI.RemoveOperand(1);
4274
4275 // Handle the easy case that requires no repack instructions.
4276 if (Ty == S32) {
4277 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4278 return true;
4279 }
4280 }
4281
4282 // Now figure out how to copy the new result register back into the old
4283 // result.
4284 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4285
4286 const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4287 4288 if (ResultNumRegs == 1) { 4289 assert(!IsTFE); 4290 ResultRegs[0] = NewResultReg; 4291 } else { 4292 // We have to repack into a new vector of some kind. 4293 for (int I = 0; I != NumDataRegs; ++I) 4294 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4295 B.buildUnmerge(ResultRegs, NewResultReg); 4296 4297 // Drop the final TFE element to get the data part. The TFE result is 4298 // directly written to the right place already. 4299 if (IsTFE) 4300 ResultRegs.resize(NumDataRegs); 4301 } 4302 4303 // For an s16 scalar result, we form an s32 result with a truncate regardless 4304 // of packed vs. unpacked. 4305 if (IsD16 && !Ty.isVector()) { 4306 B.buildTrunc(DstReg, ResultRegs[0]); 4307 return true; 4308 } 4309 4310 // Avoid a build/concat_vector of 1 entry. 4311 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4312 B.buildBitcast(DstReg, ResultRegs[0]); 4313 return true; 4314 } 4315 4316 assert(Ty.isVector()); 4317 4318 if (IsD16) { 4319 // For packed D16 results with TFE enabled, all the data components are 4320 // S32. Cast back to the expected type. 4321 // 4322 // TODO: We don't really need to use load s32 elements. We would only need one 4323 // cast for the TFE result if a multiple of v2s16 was used. 4324 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4325 for (Register &Reg : ResultRegs) 4326 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4327 } else if (ST.hasUnpackedD16VMem()) { 4328 for (Register &Reg : ResultRegs) 4329 Reg = B.buildTrunc(S16, Reg).getReg(0); 4330 } 4331 } 4332 4333 auto padWithUndef = [&](LLT Ty, int NumElts) { 4334 if (NumElts == 0) 4335 return; 4336 Register Undef = B.buildUndef(Ty).getReg(0); 4337 for (int I = 0; I != NumElts; ++I) 4338 ResultRegs.push_back(Undef); 4339 }; 4340 4341 // Pad out any elements eliminated due to the dmask. 4342 LLT ResTy = MRI->getType(ResultRegs[0]); 4343 if (!ResTy.isVector()) { 4344 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4345 B.buildBuildVector(DstReg, ResultRegs); 4346 return true; 4347 } 4348 4349 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4350 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4351 4352 // Deal with the one annoying legal case. 4353 const LLT V3S16 = LLT::vector(3, 16); 4354 if (Ty == V3S16) { 4355 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4356 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4357 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4358 return true; 4359 } 4360 4361 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4362 B.buildConcatVectors(DstReg, ResultRegs); 4363 return true; 4364 } 4365 4366 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4367 LegalizerHelper &Helper, MachineInstr &MI) const { 4368 MachineIRBuilder &B = Helper.MIRBuilder; 4369 GISelChangeObserver &Observer = Helper.Observer; 4370 4371 Register Dst = MI.getOperand(0).getReg(); 4372 LLT Ty = B.getMRI()->getType(Dst); 4373 unsigned Size = Ty.getSizeInBits(); 4374 MachineFunction &MF = B.getMF(); 4375 4376 Observer.changingInstr(MI); 4377 4378 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4379 Ty = getBitcastRegisterType(Ty); 4380 Helper.bitcastDst(MI, Ty, 0); 4381 Dst = MI.getOperand(0).getReg(); 4382 B.setInsertPt(B.getMBB(), MI); 4383 } 4384 4385 // FIXME: We don't really need this intermediate instruction. The intrinsic 4386 // should be fixed to have a memory operand. Since it's readnone, we're not 4387 // allowed to add one. 
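// The rewrite below: turn the intrinsic into G_AMDGPU_S_BUFFER_LOAD, attach
// an invariant, 4 byte aligned MMO, and round non-power-of-2 result sizes up
// (e.g. a 96-bit <3 x s32> result is widened to <4 x s32>, since there is no
// 96-bit scalar load).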
4388 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4389 MI.RemoveOperand(1); // Remove intrinsic ID 4390 4391 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4392 // TODO: Should this use datalayout alignment? 4393 const unsigned MemSize = (Size + 7) / 8; 4394 const Align MemAlign(4); 4395 MachineMemOperand *MMO = MF.getMachineMemOperand( 4396 MachinePointerInfo(), 4397 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4398 MachineMemOperand::MOInvariant, 4399 MemSize, MemAlign); 4400 MI.addMemOperand(MF, MMO); 4401 4402 // There are no 96-bit result scalar loads, but widening to 128-bit should 4403 // always be legal. We may need to restore this to a 96-bit result if it turns 4404 // out this needs to be converted to a vector load during RegBankSelect. 4405 if (!isPowerOf2_32(Size)) { 4406 if (Ty.isVector()) 4407 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4408 else 4409 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4410 } 4411 4412 Observer.changedInstr(MI); 4413 return true; 4414 } 4415 4416 // TODO: Move to selection 4417 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4418 MachineRegisterInfo &MRI, 4419 MachineIRBuilder &B) const { 4420 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4421 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4422 !ST.isTrapHandlerEnabled()) { 4423 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4424 } else { 4425 // Pass queue pointer to trap handler as input, and insert trap instruction 4426 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4427 MachineRegisterInfo &MRI = *B.getMRI(); 4428 4429 Register LiveIn = 4430 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4431 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4432 return false; 4433 4434 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4435 B.buildCopy(SGPR01, LiveIn); 4436 B.buildInstr(AMDGPU::S_TRAP) 4437 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4438 .addReg(SGPR01, RegState::Implicit); 4439 } 4440 4441 MI.eraseFromParent(); 4442 return true; 4443 } 4444 4445 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4446 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4447 // Is non-HSA path or trap-handler disabled? then, report a warning 4448 // accordingly 4449 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4450 !ST.isTrapHandlerEnabled()) { 4451 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4452 "debugtrap handler not supported", 4453 MI.getDebugLoc(), DS_Warning); 4454 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4455 Ctx.diagnose(NoTrap); 4456 } else { 4457 // Insert debug-trap instruction 4458 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4459 } 4460 4461 MI.eraseFromParent(); 4462 return true; 4463 } 4464 4465 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4466 MachineInstr &MI) const { 4467 MachineIRBuilder &B = Helper.MIRBuilder; 4468 MachineRegisterInfo &MRI = *B.getMRI(); 4469 4470 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
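// Roughly, for amdgcn.if/else/loop: the G_BRCOND that consumes the
// intrinsic's boolean result is rewritten into an SI_IF/SI_ELSE/SI_LOOP
// pseudo that branches to the old unconditional successor (the "flow"
// block), the following unconditional branch is retargeted at the original
// conditional successor, and the mask registers are constrained to the wave
// mask register class.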
4471 auto IntrID = MI.getIntrinsicID(); 4472 switch (IntrID) { 4473 case Intrinsic::amdgcn_if: 4474 case Intrinsic::amdgcn_else: { 4475 MachineInstr *Br = nullptr; 4476 MachineBasicBlock *UncondBrTarget = nullptr; 4477 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4478 const SIRegisterInfo *TRI 4479 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4480 4481 Register Def = MI.getOperand(1).getReg(); 4482 Register Use = MI.getOperand(3).getReg(); 4483 4484 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4485 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4486 if (IntrID == Intrinsic::amdgcn_if) { 4487 B.buildInstr(AMDGPU::SI_IF) 4488 .addDef(Def) 4489 .addUse(Use) 4490 .addMBB(UncondBrTarget); 4491 } else { 4492 B.buildInstr(AMDGPU::SI_ELSE) 4493 .addDef(Def) 4494 .addUse(Use) 4495 .addMBB(UncondBrTarget) 4496 .addImm(0); 4497 } 4498 4499 if (Br) { 4500 Br->getOperand(0).setMBB(CondBrTarget); 4501 } else { 4502 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4503 // since we're swapping branch targets it needs to be reinserted. 4504 // FIXME: IRTranslator should probably not do this 4505 B.buildBr(*CondBrTarget); 4506 } 4507 4508 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4509 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4510 MI.eraseFromParent(); 4511 BrCond->eraseFromParent(); 4512 return true; 4513 } 4514 4515 return false; 4516 } 4517 case Intrinsic::amdgcn_loop: { 4518 MachineInstr *Br = nullptr; 4519 MachineBasicBlock *UncondBrTarget = nullptr; 4520 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4521 const SIRegisterInfo *TRI 4522 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4523 4524 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4525 Register Reg = MI.getOperand(2).getReg(); 4526 4527 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4528 B.buildInstr(AMDGPU::SI_LOOP) 4529 .addUse(Reg) 4530 .addMBB(UncondBrTarget); 4531 4532 if (Br) 4533 Br->getOperand(0).setMBB(CondBrTarget); 4534 else 4535 B.buildBr(*CondBrTarget); 4536 4537 MI.eraseFromParent(); 4538 BrCond->eraseFromParent(); 4539 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4540 return true; 4541 } 4542 4543 return false; 4544 } 4545 case Intrinsic::amdgcn_kernarg_segment_ptr: 4546 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4547 // This only makes sense to call in a kernel, so just lower to null. 
4548 B.buildConstant(MI.getOperand(0).getReg(), 0); 4549 MI.eraseFromParent(); 4550 return true; 4551 } 4552 4553 return legalizePreloadedArgIntrin( 4554 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4555 case Intrinsic::amdgcn_implicitarg_ptr: 4556 return legalizeImplicitArgPtr(MI, MRI, B); 4557 case Intrinsic::amdgcn_workitem_id_x: 4558 return legalizePreloadedArgIntrin(MI, MRI, B, 4559 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4560 case Intrinsic::amdgcn_workitem_id_y: 4561 return legalizePreloadedArgIntrin(MI, MRI, B, 4562 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4563 case Intrinsic::amdgcn_workitem_id_z: 4564 return legalizePreloadedArgIntrin(MI, MRI, B, 4565 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4566 case Intrinsic::amdgcn_workgroup_id_x: 4567 return legalizePreloadedArgIntrin(MI, MRI, B, 4568 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4569 case Intrinsic::amdgcn_workgroup_id_y: 4570 return legalizePreloadedArgIntrin(MI, MRI, B, 4571 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4572 case Intrinsic::amdgcn_workgroup_id_z: 4573 return legalizePreloadedArgIntrin(MI, MRI, B, 4574 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4575 case Intrinsic::amdgcn_dispatch_ptr: 4576 return legalizePreloadedArgIntrin(MI, MRI, B, 4577 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4578 case Intrinsic::amdgcn_queue_ptr: 4579 return legalizePreloadedArgIntrin(MI, MRI, B, 4580 AMDGPUFunctionArgInfo::QUEUE_PTR); 4581 case Intrinsic::amdgcn_implicit_buffer_ptr: 4582 return legalizePreloadedArgIntrin( 4583 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4584 case Intrinsic::amdgcn_dispatch_id: 4585 return legalizePreloadedArgIntrin(MI, MRI, B, 4586 AMDGPUFunctionArgInfo::DISPATCH_ID); 4587 case Intrinsic::amdgcn_fdiv_fast: 4588 return legalizeFDIVFastIntrin(MI, MRI, B); 4589 case Intrinsic::amdgcn_is_shared: 4590 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4591 case Intrinsic::amdgcn_is_private: 4592 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4593 case Intrinsic::amdgcn_wavefrontsize: { 4594 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4595 MI.eraseFromParent(); 4596 return true; 4597 } 4598 case Intrinsic::amdgcn_s_buffer_load: 4599 return legalizeSBufferLoad(Helper, MI); 4600 case Intrinsic::amdgcn_raw_buffer_store: 4601 case Intrinsic::amdgcn_struct_buffer_store: 4602 return legalizeBufferStore(MI, MRI, B, false, false); 4603 case Intrinsic::amdgcn_raw_buffer_store_format: 4604 case Intrinsic::amdgcn_struct_buffer_store_format: 4605 return legalizeBufferStore(MI, MRI, B, false, true); 4606 case Intrinsic::amdgcn_raw_tbuffer_store: 4607 case Intrinsic::amdgcn_struct_tbuffer_store: 4608 return legalizeBufferStore(MI, MRI, B, true, true); 4609 case Intrinsic::amdgcn_raw_buffer_load: 4610 case Intrinsic::amdgcn_struct_buffer_load: 4611 return legalizeBufferLoad(MI, MRI, B, false, false); 4612 case Intrinsic::amdgcn_raw_buffer_load_format: 4613 case Intrinsic::amdgcn_struct_buffer_load_format: 4614 return legalizeBufferLoad(MI, MRI, B, true, false); 4615 case Intrinsic::amdgcn_raw_tbuffer_load: 4616 case Intrinsic::amdgcn_struct_tbuffer_load: 4617 return legalizeBufferLoad(MI, MRI, B, true, true); 4618 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4619 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4620 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4621 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4622 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4623 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4624 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4625 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4626 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4627 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4628 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4629 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4630 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4631 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4632 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4633 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4634 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4635 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4636 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4637 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4638 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4639 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4640 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4641 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4642 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4643 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4644 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4645 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4646 return legalizeBufferAtomic(MI, B, IntrID); 4647 case Intrinsic::amdgcn_atomic_inc: 4648 return legalizeAtomicIncDec(MI, B, true); 4649 case Intrinsic::amdgcn_atomic_dec: 4650 return legalizeAtomicIncDec(MI, B, false); 4651 case Intrinsic::trap: 4652 return legalizeTrapIntrinsic(MI, MRI, B); 4653 case Intrinsic::debugtrap: 4654 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4655 case Intrinsic::amdgcn_rsq_clamp: 4656 return legalizeRsqClampIntrinsic(MI, MRI, B); 4657 case Intrinsic::amdgcn_ds_fadd: 4658 case Intrinsic::amdgcn_ds_fmin: 4659 case Intrinsic::amdgcn_ds_fmax: 4660 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4661 default: { 4662 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4663 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4664 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4665 return true; 4666 } 4667 } 4668 4669 return true; 4670 } 4671