//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
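/// For example, <3 x s16> (48 bits) is widened to <4 x s16>, while <3 x s32>
/// is already a multiple of 32 bits and is left to other rules.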
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 67 return [=](const LegalityQuery &Query) { 68 const LLT Ty = Query.Types[TypeIdx]; 69 if (!Ty.isVector()) 70 return false; 71 72 const LLT EltTy = Ty.getElementType(); 73 const unsigned EltSize = EltTy.getSizeInBits(); 74 return Ty.getNumElements() % 2 != 0 && 75 EltSize > 1 && EltSize < 32 && 76 Ty.getSizeInBits() % 32 != 0; 77 }; 78 } 79 80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 return Ty.getSizeInBits() % 32 == 0; 84 }; 85 } 86 87 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getScalarType(); 91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 92 }; 93 } 94 95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 111 }; 112 } 113 114 // Increase the number of vector elements to reach the next multiple of 32-bit 115 // type. 116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 117 return [=](const LegalityQuery &Query) { 118 const LLT Ty = Query.Types[TypeIdx]; 119 120 const LLT EltTy = Ty.getElementType(); 121 const int Size = Ty.getSizeInBits(); 122 const int EltSize = EltTy.getSizeInBits(); 123 const int NextMul32 = (Size + 31) / 32; 124 125 assert(EltSize < 32); 126 127 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 128 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 129 }; 130 } 131 132 static LLT getBitcastRegisterType(const LLT Ty) { 133 const unsigned Size = Ty.getSizeInBits(); 134 135 LLT CoercedTy; 136 if (Size <= 32) { 137 // <2 x s8> -> s16 138 // <4 x s8> -> s32 139 return LLT::scalar(Size); 140 } 141 142 return LLT::scalarOrVector(Size / 32, 32); 143 } 144 145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 146 return [=](const LegalityQuery &Query) { 147 const LLT Ty = Query.Types[TypeIdx]; 148 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 149 }; 150 } 151 152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 153 return [=](const LegalityQuery &Query) { 154 const LLT Ty = Query.Types[TypeIdx]; 155 unsigned Size = Ty.getSizeInBits(); 156 assert(Size % 32 == 0); 157 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 158 }; 159 } 160 161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 162 return [=](const LegalityQuery &Query) { 163 const LLT QueryTy = Query.Types[TypeIdx]; 164 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 165 }; 166 } 167 168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 169 return [=](const LegalityQuery &Query) { 170 const LLT QueryTy = Query.Types[TypeIdx]; 171 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 172 }; 
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
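/// For example, an s96 load with 128-bit alignment may be widened to a 128-bit
/// access on subtargets without 96-bit loads, since the memory is known to be
/// dereferenceable up to the alignment.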
371 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 372 unsigned AlignInBits, unsigned AddrSpace, 373 unsigned Opcode) { 374 // We don't want to widen cases that are naturally legal. 375 if (isPowerOf2_32(SizeInBits)) 376 return false; 377 378 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 379 // end up widening these for a scalar load during RegBankSelect, since there 380 // aren't 96-bit scalar loads. 381 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 382 return false; 383 384 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 385 return false; 386 387 // A load is known dereferenceable up to the alignment, so it's legal to widen 388 // to it. 389 // 390 // TODO: Could check dereferenceable for less aligned cases. 391 unsigned RoundedSize = NextPowerOf2(SizeInBits); 392 if (AlignInBits < RoundedSize) 393 return false; 394 395 // Do not widen if it would introduce a slow unaligned load. 396 const SITargetLowering *TLI = ST.getTargetLowering(); 397 bool Fast = false; 398 return TLI->allowsMisalignedMemoryAccessesImpl( 399 RoundedSize, AddrSpace, Align(AlignInBits / 8), 400 MachineMemOperand::MOLoad, &Fast) && 401 Fast; 402 } 403 404 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 405 unsigned Opcode) { 406 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 407 return false; 408 409 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 410 Query.MMODescrs[0].AlignInBits, 411 Query.Types[1].getAddressSpace(), Opcode); 412 } 413 414 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 415 const GCNTargetMachine &TM) 416 : ST(ST_) { 417 using namespace TargetOpcode; 418 419 auto GetAddrSpacePtr = [&TM](unsigned AS) { 420 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 421 }; 422 423 const LLT S1 = LLT::scalar(1); 424 const LLT S16 = LLT::scalar(16); 425 const LLT S32 = LLT::scalar(32); 426 const LLT S64 = LLT::scalar(64); 427 const LLT S128 = LLT::scalar(128); 428 const LLT S256 = LLT::scalar(256); 429 const LLT S512 = LLT::scalar(512); 430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 431 432 const LLT V2S16 = LLT::vector(2, 16); 433 const LLT V4S16 = LLT::vector(4, 16); 434 435 const LLT V2S32 = LLT::vector(2, 32); 436 const LLT V3S32 = LLT::vector(3, 32); 437 const LLT V4S32 = LLT::vector(4, 32); 438 const LLT V5S32 = LLT::vector(5, 32); 439 const LLT V6S32 = LLT::vector(6, 32); 440 const LLT V7S32 = LLT::vector(7, 32); 441 const LLT V8S32 = LLT::vector(8, 32); 442 const LLT V9S32 = LLT::vector(9, 32); 443 const LLT V10S32 = LLT::vector(10, 32); 444 const LLT V11S32 = LLT::vector(11, 32); 445 const LLT V12S32 = LLT::vector(12, 32); 446 const LLT V13S32 = LLT::vector(13, 32); 447 const LLT V14S32 = LLT::vector(14, 32); 448 const LLT V15S32 = LLT::vector(15, 32); 449 const LLT V16S32 = LLT::vector(16, 32); 450 const LLT V32S32 = LLT::vector(32, 32); 451 452 const LLT V2S64 = LLT::vector(2, 64); 453 const LLT V3S64 = LLT::vector(3, 64); 454 const LLT V4S64 = LLT::vector(4, 64); 455 const LLT V5S64 = LLT::vector(5, 64); 456 const LLT V6S64 = LLT::vector(6, 64); 457 const LLT V7S64 = LLT::vector(7, 64); 458 const LLT V8S64 = LLT::vector(8, 64); 459 const LLT V16S64 = LLT::vector(16, 64); 460 461 std::initializer_list<LLT> AllS32Vectors = 462 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 463 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 464 std::initializer_list<LLT> AllS64Vectors = 465 {V2S64, V3S64, V4S64, V5S64, V6S64, 
V7S64, V8S64, V16S64}; 466 467 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 468 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 469 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 470 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 471 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 472 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 473 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 474 475 const LLT CodePtr = FlatPtr; 476 477 const std::initializer_list<LLT> AddrSpaces64 = { 478 GlobalPtr, ConstantPtr, FlatPtr 479 }; 480 481 const std::initializer_list<LLT> AddrSpaces32 = { 482 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 483 }; 484 485 const std::initializer_list<LLT> FPTypesBase = { 486 S32, S64 487 }; 488 489 const std::initializer_list<LLT> FPTypes16 = { 490 S32, S64, S16 491 }; 492 493 const std::initializer_list<LLT> FPTypesPK16 = { 494 S32, S64, S16, V2S16 495 }; 496 497 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 498 499 setAction({G_BRCOND, S1}, Legal); // VCC branches 500 setAction({G_BRCOND, S32}, Legal); // SCC branches 501 502 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 503 // elements for v3s16 504 getActionDefinitionsBuilder(G_PHI) 505 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 506 .legalFor(AllS32Vectors) 507 .legalFor(AllS64Vectors) 508 .legalFor(AddrSpaces64) 509 .legalFor(AddrSpaces32) 510 .legalIf(isPointer(0)) 511 .clampScalar(0, S16, S256) 512 .widenScalarToNextPow2(0, 32) 513 .clampMaxNumElements(0, S32, 16) 514 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 515 .scalarize(0); 516 517 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 518 // Full set of gfx9 features. 519 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 520 .legalFor({S32, S16, V2S16}) 521 .clampScalar(0, S16, S32) 522 .clampMaxNumElements(0, S16, 2) 523 .scalarize(0) 524 .widenScalarToNextPow2(0, 32); 525 526 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 527 .legalFor({S32, S16, V2S16}) // Clamp modifier 528 .minScalarOrElt(0, S16) 529 .clampMaxNumElements(0, S16, 2) 530 .scalarize(0) 531 .widenScalarToNextPow2(0, 32) 532 .lower(); 533 } else if (ST.has16BitInsts()) { 534 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 535 .legalFor({S32, S16}) 536 .clampScalar(0, S16, S32) 537 .scalarize(0) 538 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 539 540 // Technically the saturating operations require clamp bit support, but this 541 // was introduced at the same time as 16-bit operations. 542 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 543 .legalFor({S32, S16}) // Clamp modifier 544 .minScalar(0, S16) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 16) 547 .lower(); 548 549 // We're just lowering this, but it helps get a better result to try to 550 // coerce to the desired type first. 551 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 552 .minScalar(0, S16) 553 .scalarize(0) 554 .lower(); 555 } else { 556 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 557 .legalFor({S32}) 558 .clampScalar(0, S32, S32) 559 .scalarize(0); 560 561 if (ST.hasIntClamp()) { 562 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 563 .legalFor({S32}) // Clamp modifier. 564 .scalarize(0) 565 .minScalarOrElt(0, S32) 566 .lower(); 567 } else { 568 // Clamp bit support was added in VI, along with 16-bit operations. 
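      // Without the clamp bit, fall back to the generic saturating-add
      // expansion produced by lower().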
569 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 570 .minScalar(0, S32) 571 .scalarize(0) 572 .lower(); 573 } 574 575 // FIXME: DAG expansion gets better results. The widening uses the smaller 576 // range values and goes for the min/max lowering directly. 577 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 578 .minScalar(0, S32) 579 .scalarize(0) 580 .lower(); 581 } 582 583 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 584 .customFor({S32, S64}) 585 .clampScalar(0, S32, S64) 586 .widenScalarToNextPow2(0, 32) 587 .scalarize(0); 588 589 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 590 .legalFor({S32}) 591 .clampScalar(0, S32, S32) 592 .scalarize(0); 593 594 // Report legal for any types we can handle anywhere. For the cases only legal 595 // on the SALU, RegBankSelect will be able to re-legalize. 596 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 597 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 598 .clampScalar(0, S32, S64) 599 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 600 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 601 .widenScalarToNextPow2(0) 602 .scalarize(0); 603 604 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 605 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 606 .legalFor({{S32, S1}, {S32, S32}}) 607 .minScalar(0, S32) 608 // TODO: .scalarize(0) 609 .lower(); 610 611 getActionDefinitionsBuilder(G_BITCAST) 612 // Don't worry about the size constraint. 613 .legalIf(all(isRegisterType(0), isRegisterType(1))) 614 .lower(); 615 616 617 getActionDefinitionsBuilder(G_CONSTANT) 618 .legalFor({S1, S32, S64, S16, GlobalPtr, 619 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 620 .legalIf(isPointer(0)) 621 .clampScalar(0, S32, S64) 622 .widenScalarToNextPow2(0); 623 624 getActionDefinitionsBuilder(G_FCONSTANT) 625 .legalFor({S32, S64, S16}) 626 .clampScalar(0, S16, S64); 627 628 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 629 .legalIf(isRegisterType(0)) 630 // s1 and s16 are special cases because they have legal operations on 631 // them, but don't really occupy registers in the normal way. 632 .legalFor({S1, S16}) 633 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 634 .clampScalarOrElt(0, S32, MaxScalar) 635 .widenScalarToNextPow2(0, 32) 636 .clampMaxNumElements(0, S32, 16); 637 638 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 639 640 // If the amount is divergent, we have to do a wave reduction to get the 641 // maximum value, so this is expanded during RegBankSelect. 
642 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 643 .legalFor({{PrivatePtr, S32}}); 644 645 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 646 .customIf(typeIsNot(0, PrivatePtr)); 647 648 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 649 650 auto &FPOpActions = getActionDefinitionsBuilder( 651 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 652 .legalFor({S32, S64}); 653 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 654 .customFor({S32, S64}); 655 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 656 .customFor({S32, S64}); 657 658 if (ST.has16BitInsts()) { 659 if (ST.hasVOP3PInsts()) 660 FPOpActions.legalFor({S16, V2S16}); 661 else 662 FPOpActions.legalFor({S16}); 663 664 TrigActions.customFor({S16}); 665 FDIVActions.customFor({S16}); 666 } 667 668 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 669 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 670 671 if (ST.hasVOP3PInsts()) { 672 MinNumMaxNum.customFor(FPTypesPK16) 673 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 674 .clampMaxNumElements(0, S16, 2) 675 .clampScalar(0, S16, S64) 676 .scalarize(0); 677 } else if (ST.has16BitInsts()) { 678 MinNumMaxNum.customFor(FPTypes16) 679 .clampScalar(0, S16, S64) 680 .scalarize(0); 681 } else { 682 MinNumMaxNum.customFor(FPTypesBase) 683 .clampScalar(0, S32, S64) 684 .scalarize(0); 685 } 686 687 if (ST.hasVOP3PInsts()) 688 FPOpActions.clampMaxNumElements(0, S16, 2); 689 690 FPOpActions 691 .scalarize(0) 692 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 693 694 TrigActions 695 .scalarize(0) 696 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 697 698 FDIVActions 699 .scalarize(0) 700 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 701 702 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 703 .legalFor(FPTypesPK16) 704 .clampMaxNumElements(0, S16, 2) 705 .scalarize(0) 706 .clampScalar(0, S16, S64); 707 708 if (ST.has16BitInsts()) { 709 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 710 .legalFor({S32, S64, S16}) 711 .scalarize(0) 712 .clampScalar(0, S16, S64); 713 } else { 714 getActionDefinitionsBuilder(G_FSQRT) 715 .legalFor({S32, S64}) 716 .scalarize(0) 717 .clampScalar(0, S32, S64); 718 719 if (ST.hasFractBug()) { 720 getActionDefinitionsBuilder(G_FFLOOR) 721 .customFor({S64}) 722 .legalFor({S32, S64}) 723 .scalarize(0) 724 .clampScalar(0, S32, S64); 725 } else { 726 getActionDefinitionsBuilder(G_FFLOOR) 727 .legalFor({S32, S64}) 728 .scalarize(0) 729 .clampScalar(0, S32, S64); 730 } 731 } 732 733 getActionDefinitionsBuilder(G_FPTRUNC) 734 .legalFor({{S32, S64}, {S16, S32}}) 735 .scalarize(0) 736 .lower(); 737 738 getActionDefinitionsBuilder(G_FPEXT) 739 .legalFor({{S64, S32}, {S32, S16}}) 740 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 741 .scalarize(0); 742 743 getActionDefinitionsBuilder(G_FSUB) 744 // Use actual fsub instruction 745 .legalFor({S32}) 746 // Must use fadd + fneg 747 .lowerFor({S64, S16, V2S16}) 748 .scalarize(0) 749 .clampScalar(0, S32, S64); 750 751 // Whether this is legal depends on the floating point mode for the function. 
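  // (Presumably legality hinges on whether denormals are flushed; the custom
  // action defers that per-function decision to legalizeFMad instead of
  // encoding it in the static rules.)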
752 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 753 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 754 FMad.customFor({S32, S16}); 755 else if (ST.hasMadMacF32Insts()) 756 FMad.customFor({S32}); 757 else if (ST.hasMadF16()) 758 FMad.customFor({S16}); 759 FMad.scalarize(0) 760 .lower(); 761 762 auto &FRem = getActionDefinitionsBuilder(G_FREM); 763 if (ST.has16BitInsts()) { 764 FRem.customFor({S16, S32, S64}); 765 } else { 766 FRem.minScalar(0, S32) 767 .customFor({S32, S64}); 768 } 769 FRem.scalarize(0); 770 771 // TODO: Do we need to clamp maximum bitwidth? 772 getActionDefinitionsBuilder(G_TRUNC) 773 .legalIf(isScalar(0)) 774 .legalFor({{V2S16, V2S32}}) 775 .clampMaxNumElements(0, S16, 2) 776 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 777 // situations (like an invalid implicit use), we don't want to infinite loop 778 // in the legalizer. 779 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 780 .alwaysLegal(); 781 782 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 783 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 784 {S32, S1}, {S64, S1}, {S16, S1}}) 785 .scalarize(0) 786 .clampScalar(0, S32, S64) 787 .widenScalarToNextPow2(1, 32); 788 789 // TODO: Split s1->s64 during regbankselect for VALU. 790 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 791 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 792 .lowerFor({{S32, S64}}) 793 .lowerIf(typeIs(1, S1)) 794 .customFor({{S64, S64}}); 795 if (ST.has16BitInsts()) 796 IToFP.legalFor({{S16, S16}}); 797 IToFP.clampScalar(1, S32, S64) 798 .minScalar(0, S32) 799 .scalarize(0) 800 .widenScalarToNextPow2(1); 801 802 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 803 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 804 .customFor({{S64, S64}}) 805 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 806 if (ST.has16BitInsts()) 807 FPToI.legalFor({{S16, S16}}); 808 else 809 FPToI.minScalar(1, S32); 810 811 FPToI.minScalar(0, S32) 812 .scalarize(0) 813 .lower(); 814 815 // Lower roundeven into G_FRINT 816 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 817 .scalarize(0) 818 .lower(); 819 820 if (ST.has16BitInsts()) { 821 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 822 .legalFor({S16, S32, S64}) 823 .clampScalar(0, S16, S64) 824 .scalarize(0); 825 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 826 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 827 .legalFor({S32, S64}) 828 .clampScalar(0, S32, S64) 829 .scalarize(0); 830 } else { 831 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 832 .legalFor({S32}) 833 .customFor({S64}) 834 .clampScalar(0, S32, S64) 835 .scalarize(0); 836 } 837 838 getActionDefinitionsBuilder(G_PTR_ADD) 839 .legalIf(all(isPointer(0), sameSize(0, 1))) 840 .scalarize(0) 841 .scalarSameSizeAs(1, 0); 842 843 getActionDefinitionsBuilder(G_PTRMASK) 844 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 845 .scalarSameSizeAs(1, 0) 846 .scalarize(0); 847 848 auto &CmpBuilder = 849 getActionDefinitionsBuilder(G_ICMP) 850 // The compare output type differs based on the register bank of the output, 851 // so make both s1 and s32 legal. 852 // 853 // Scalar compares producing output in scc will be promoted to s32, as that 854 // is the allocatable register type that will be needed for the copy from 855 // scc. This will be promoted during RegBankSelect, and we assume something 856 // before that won't try to use s32 result types. 
857 // 858 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 859 // bank. 860 .legalForCartesianProduct( 861 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 862 .legalForCartesianProduct( 863 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 864 if (ST.has16BitInsts()) { 865 CmpBuilder.legalFor({{S1, S16}}); 866 } 867 868 CmpBuilder 869 .widenScalarToNextPow2(1) 870 .clampScalar(1, S32, S64) 871 .scalarize(0) 872 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 873 874 getActionDefinitionsBuilder(G_FCMP) 875 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 876 .widenScalarToNextPow2(1) 877 .clampScalar(1, S32, S64) 878 .scalarize(0); 879 880 // FIXME: fpow has a selection pattern that should move to custom lowering. 881 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 882 if (ST.has16BitInsts()) 883 Exp2Ops.legalFor({S32, S16}); 884 else 885 Exp2Ops.legalFor({S32}); 886 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 887 Exp2Ops.scalarize(0); 888 889 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 890 if (ST.has16BitInsts()) 891 ExpOps.customFor({{S32}, {S16}}); 892 else 893 ExpOps.customFor({S32}); 894 ExpOps.clampScalar(0, MinScalarFPTy, S32) 895 .scalarize(0); 896 897 getActionDefinitionsBuilder(G_FPOWI) 898 .clampScalar(0, MinScalarFPTy, S32) 899 .lower(); 900 901 // The 64-bit versions produce 32-bit results, but only on the SALU. 902 getActionDefinitionsBuilder(G_CTPOP) 903 .legalFor({{S32, S32}, {S32, S64}}) 904 .clampScalar(0, S32, S32) 905 .clampScalar(1, S32, S64) 906 .scalarize(0) 907 .widenScalarToNextPow2(0, 32) 908 .widenScalarToNextPow2(1, 32); 909 910 // The hardware instructions return a different result on 0 than the generic 911 // instructions expect. The hardware produces -1, but these produce the 912 // bitwidth. 913 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 914 .scalarize(0) 915 .clampScalar(0, S32, S32) 916 .clampScalar(1, S32, S64) 917 .widenScalarToNextPow2(0, 32) 918 .widenScalarToNextPow2(1, 32) 919 .lower(); 920 921 // The 64-bit versions produce 32-bit results, but only on the SALU. 922 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 923 .legalFor({{S32, S32}, {S32, S64}}) 924 .clampScalar(0, S32, S32) 925 .clampScalar(1, S32, S64) 926 .scalarize(0) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32); 929 930 getActionDefinitionsBuilder(G_BITREVERSE) 931 .legalFor({S32}) 932 .clampScalar(0, S32, S32) 933 .scalarize(0); 934 935 if (ST.has16BitInsts()) { 936 getActionDefinitionsBuilder(G_BSWAP) 937 .legalFor({S16, S32, V2S16}) 938 .clampMaxNumElements(0, S16, 2) 939 // FIXME: Fixing non-power-of-2 before clamp is workaround for 940 // narrowScalar limitation. 
941 .widenScalarToNextPow2(0) 942 .clampScalar(0, S16, S32) 943 .scalarize(0); 944 945 if (ST.hasVOP3PInsts()) { 946 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 947 .legalFor({S32, S16, V2S16}) 948 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 949 .clampMaxNumElements(0, S16, 2) 950 .minScalar(0, S16) 951 .widenScalarToNextPow2(0) 952 .scalarize(0) 953 .lower(); 954 } else { 955 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 956 .legalFor({S32, S16}) 957 .widenScalarToNextPow2(0) 958 .minScalar(0, S16) 959 .scalarize(0) 960 .lower(); 961 } 962 } else { 963 // TODO: Should have same legality without v_perm_b32 964 getActionDefinitionsBuilder(G_BSWAP) 965 .legalFor({S32}) 966 .lowerIf(scalarNarrowerThan(0, 32)) 967 // FIXME: Fixing non-power-of-2 before clamp is workaround for 968 // narrowScalar limitation. 969 .widenScalarToNextPow2(0) 970 .maxScalar(0, S32) 971 .scalarize(0) 972 .lower(); 973 974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 975 .legalFor({S32}) 976 .minScalar(0, S32) 977 .widenScalarToNextPow2(0) 978 .scalarize(0) 979 .lower(); 980 } 981 982 getActionDefinitionsBuilder(G_INTTOPTR) 983 // List the common cases 984 .legalForCartesianProduct(AddrSpaces64, {S64}) 985 .legalForCartesianProduct(AddrSpaces32, {S32}) 986 .scalarize(0) 987 // Accept any address space as long as the size matches 988 .legalIf(sameSize(0, 1)) 989 .widenScalarIf(smallerThan(1, 0), 990 [](const LegalityQuery &Query) { 991 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 992 }) 993 .narrowScalarIf(largerThan(1, 0), 994 [](const LegalityQuery &Query) { 995 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 996 }); 997 998 getActionDefinitionsBuilder(G_PTRTOINT) 999 // List the common cases 1000 .legalForCartesianProduct(AddrSpaces64, {S64}) 1001 .legalForCartesianProduct(AddrSpaces32, {S32}) 1002 .scalarize(0) 1003 // Accept any address space as long as the size matches 1004 .legalIf(sameSize(0, 1)) 1005 .widenScalarIf(smallerThan(0, 1), 1006 [](const LegalityQuery &Query) { 1007 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1008 }) 1009 .narrowScalarIf( 1010 largerThan(0, 1), 1011 [](const LegalityQuery &Query) { 1012 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1013 }); 1014 1015 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1016 .scalarize(0) 1017 .custom(); 1018 1019 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1020 bool IsLoad) -> bool { 1021 const LLT DstTy = Query.Types[0]; 1022 1023 // Split vector extloads. 1024 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1025 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1026 1027 if (MemSize < DstTy.getSizeInBits()) 1028 MemSize = std::max(MemSize, AlignBits); 1029 1030 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1031 return true; 1032 1033 const LLT PtrTy = Query.Types[1]; 1034 unsigned AS = PtrTy.getAddressSpace(); 1035 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1036 return true; 1037 1038 // Catch weird sized loads that don't evenly divide into the access sizes 1039 // TODO: May be able to widen depending on alignment etc. 1040 unsigned NumRegs = (MemSize + 31) / 32; 1041 if (NumRegs == 3) { 1042 if (!ST.hasDwordx3LoadStores()) 1043 return true; 1044 } else { 1045 // If the alignment allows, these should have been widened. 
1046 if (!isPowerOf2_32(NumRegs)) 1047 return true; 1048 } 1049 1050 if (AlignBits < MemSize) { 1051 const SITargetLowering *TLI = ST.getTargetLowering(); 1052 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1053 Align(AlignBits / 8)); 1054 } 1055 1056 return false; 1057 }; 1058 1059 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1060 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1061 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1062 1063 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1064 // LDS 1065 // TODO: Unsupported flat for SI. 1066 1067 for (unsigned Op : {G_LOAD, G_STORE}) { 1068 const bool IsStore = Op == G_STORE; 1069 1070 auto &Actions = getActionDefinitionsBuilder(Op); 1071 // Explicitly list some common cases. 1072 // TODO: Does this help compile time at all? 1073 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1074 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1075 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1076 {S64, GlobalPtr, 64, GlobalAlign32}, 1077 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1078 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1079 {S32, GlobalPtr, 8, GlobalAlign8}, 1080 {S32, GlobalPtr, 16, GlobalAlign16}, 1081 1082 {S32, LocalPtr, 32, 32}, 1083 {S64, LocalPtr, 64, 32}, 1084 {V2S32, LocalPtr, 64, 32}, 1085 {S32, LocalPtr, 8, 8}, 1086 {S32, LocalPtr, 16, 16}, 1087 {V2S16, LocalPtr, 32, 32}, 1088 1089 {S32, PrivatePtr, 32, 32}, 1090 {S32, PrivatePtr, 8, 8}, 1091 {S32, PrivatePtr, 16, 16}, 1092 {V2S16, PrivatePtr, 32, 32}, 1093 1094 {S32, ConstantPtr, 32, GlobalAlign32}, 1095 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1096 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1097 {S64, ConstantPtr, 64, GlobalAlign32}, 1098 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1099 Actions.legalIf( 1100 [=](const LegalityQuery &Query) -> bool { 1101 return isLoadStoreLegal(ST, Query, Op); 1102 }); 1103 1104 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1105 // 64-bits. 1106 // 1107 // TODO: Should generalize bitcast action into coerce, which will also cover 1108 // inserting addrspacecasts. 1109 Actions.customIf(typeIs(1, Constant32Ptr)); 1110 1111 // Turn any illegal element vectors into something easier to deal 1112 // with. These will ultimately produce 32-bit scalar shifts to extract the 1113 // parts anyway. 1114 // 1115 // For odd 16-bit element vectors, prefer to split those into pieces with 1116 // 16-bit vector parts. 1117 Actions.bitcastIf( 1118 [=](const LegalityQuery &Query) -> bool { 1119 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1120 Query.MMODescrs[0].SizeInBits); 1121 }, bitcastToRegisterType(0)); 1122 1123 if (!IsStore) { 1124 // Widen suitably aligned loads by loading extra bytes. The standard 1125 // legalization actions can't properly express widening memory operands. 
1126 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1127 return shouldWidenLoad(ST, Query, G_LOAD); 1128 }); 1129 } 1130 1131 // FIXME: load/store narrowing should be moved to lower action 1132 Actions 1133 .narrowScalarIf( 1134 [=](const LegalityQuery &Query) -> bool { 1135 return !Query.Types[0].isVector() && 1136 needToSplitMemOp(Query, Op == G_LOAD); 1137 }, 1138 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1139 const LLT DstTy = Query.Types[0]; 1140 const LLT PtrTy = Query.Types[1]; 1141 1142 const unsigned DstSize = DstTy.getSizeInBits(); 1143 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1144 1145 // Split extloads. 1146 if (DstSize > MemSize) 1147 return std::make_pair(0, LLT::scalar(MemSize)); 1148 1149 if (!isPowerOf2_32(DstSize)) { 1150 // We're probably decomposing an odd sized store. Try to split 1151 // to the widest type. TODO: Account for alignment. As-is it 1152 // should be OK, since the new parts will be further legalized. 1153 unsigned FloorSize = PowerOf2Floor(DstSize); 1154 return std::make_pair(0, LLT::scalar(FloorSize)); 1155 } 1156 1157 if (DstSize > 32 && (DstSize % 32 != 0)) { 1158 // FIXME: Need a way to specify non-extload of larger size if 1159 // suitably aligned. 1160 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1161 } 1162 1163 unsigned MaxSize = maxSizeForAddrSpace(ST, 1164 PtrTy.getAddressSpace(), 1165 Op == G_LOAD); 1166 if (MemSize > MaxSize) 1167 return std::make_pair(0, LLT::scalar(MaxSize)); 1168 1169 unsigned Align = Query.MMODescrs[0].AlignInBits; 1170 return std::make_pair(0, LLT::scalar(Align)); 1171 }) 1172 .fewerElementsIf( 1173 [=](const LegalityQuery &Query) -> bool { 1174 return Query.Types[0].isVector() && 1175 needToSplitMemOp(Query, Op == G_LOAD); 1176 }, 1177 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1178 const LLT DstTy = Query.Types[0]; 1179 const LLT PtrTy = Query.Types[1]; 1180 1181 LLT EltTy = DstTy.getElementType(); 1182 unsigned MaxSize = maxSizeForAddrSpace(ST, 1183 PtrTy.getAddressSpace(), 1184 Op == G_LOAD); 1185 1186 // FIXME: Handle widened to power of 2 results better. This ends 1187 // up scalarizing. 1188 // FIXME: 3 element stores scalarized on SI 1189 1190 // Split if it's too large for the address space. 1191 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1192 unsigned NumElts = DstTy.getNumElements(); 1193 unsigned EltSize = EltTy.getSizeInBits(); 1194 1195 if (MaxSize % EltSize == 0) { 1196 return std::make_pair( 1197 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1198 } 1199 1200 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1201 1202 // FIXME: Refine when odd breakdowns handled 1203 // The scalars will need to be re-legalized. 1204 if (NumPieces == 1 || NumPieces >= NumElts || 1205 NumElts % NumPieces != 0) 1206 return std::make_pair(0, EltTy); 1207 1208 return std::make_pair(0, 1209 LLT::vector(NumElts / NumPieces, EltTy)); 1210 } 1211 1212 // FIXME: We could probably handle weird extending loads better. 1213 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1214 if (DstTy.getSizeInBits() > MemSize) 1215 return std::make_pair(0, EltTy); 1216 1217 unsigned EltSize = EltTy.getSizeInBits(); 1218 unsigned DstSize = DstTy.getSizeInBits(); 1219 if (!isPowerOf2_32(DstSize)) { 1220 // We're probably decomposing an odd sized store. Try to split 1221 // to the widest type. TODO: Account for alignment. As-is it 1222 // should be OK, since the new parts will be further legalized. 
1223 unsigned FloorSize = PowerOf2Floor(DstSize); 1224 return std::make_pair( 1225 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1226 } 1227 1228 // Need to split because of alignment. 1229 unsigned Align = Query.MMODescrs[0].AlignInBits; 1230 if (EltSize > Align && 1231 (EltSize / Align < DstTy.getNumElements())) { 1232 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1233 } 1234 1235 // May need relegalization for the scalars. 1236 return std::make_pair(0, EltTy); 1237 }) 1238 .lowerIfMemSizeNotPow2() 1239 .minScalar(0, S32); 1240 1241 if (IsStore) 1242 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1243 1244 Actions 1245 .widenScalarToNextPow2(0) 1246 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1247 .lower(); 1248 } 1249 1250 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1251 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1252 {S32, GlobalPtr, 16, 2 * 8}, 1253 {S32, LocalPtr, 8, 8}, 1254 {S32, LocalPtr, 16, 16}, 1255 {S32, PrivatePtr, 8, 8}, 1256 {S32, PrivatePtr, 16, 16}, 1257 {S32, ConstantPtr, 8, 8}, 1258 {S32, ConstantPtr, 16, 2 * 8}}); 1259 if (ST.hasFlatAddressSpace()) { 1260 ExtLoads.legalForTypesWithMemDesc( 1261 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1262 } 1263 1264 ExtLoads.clampScalar(0, S32, S32) 1265 .widenScalarToNextPow2(0) 1266 .unsupportedIfMemSizeNotPow2() 1267 .lower(); 1268 1269 auto &Atomics = getActionDefinitionsBuilder( 1270 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1271 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1272 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1273 G_ATOMICRMW_UMIN}) 1274 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1275 {S64, GlobalPtr}, {S64, LocalPtr}, 1276 {S32, RegionPtr}, {S64, RegionPtr}}); 1277 if (ST.hasFlatAddressSpace()) { 1278 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1279 } 1280 1281 if (ST.hasLDSFPAtomics()) { 1282 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1283 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1284 } 1285 1286 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1287 // demarshalling 1288 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1289 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1290 {S32, FlatPtr}, {S64, FlatPtr}}) 1291 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1292 {S32, RegionPtr}, {S64, RegionPtr}}); 1293 // TODO: Pointer types, any 32-bit or 64-bit vector 1294 1295 // Condition should be s32 for scalar, s1 for vector. 1296 getActionDefinitionsBuilder(G_SELECT) 1297 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1298 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1299 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1300 .clampScalar(0, S16, S64) 1301 .scalarize(1) 1302 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1303 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1304 .clampMaxNumElements(0, S32, 2) 1305 .clampMaxNumElements(0, LocalPtr, 2) 1306 .clampMaxNumElements(0, PrivatePtr, 2) 1307 .scalarize(0) 1308 .widenScalarToNextPow2(0) 1309 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1310 1311 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1312 // be more flexible with the shift amount type. 
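  // (A 16/32/64-bit shift only reads the low 4/5/6 bits of its amount, so a
  // narrower amount type would be sufficient in principle.)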
1313 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1314 .legalFor({{S32, S32}, {S64, S32}}); 1315 if (ST.has16BitInsts()) { 1316 if (ST.hasVOP3PInsts()) { 1317 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1318 .clampMaxNumElements(0, S16, 2); 1319 } else 1320 Shifts.legalFor({{S16, S16}}); 1321 1322 // TODO: Support 16-bit shift amounts for all types 1323 Shifts.widenScalarIf( 1324 [=](const LegalityQuery &Query) { 1325 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1326 // 32-bit amount. 1327 const LLT ValTy = Query.Types[0]; 1328 const LLT AmountTy = Query.Types[1]; 1329 return ValTy.getSizeInBits() <= 16 && 1330 AmountTy.getSizeInBits() < 16; 1331 }, changeTo(1, S16)); 1332 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1333 Shifts.clampScalar(1, S32, S32); 1334 Shifts.clampScalar(0, S16, S64); 1335 Shifts.widenScalarToNextPow2(0, 16); 1336 1337 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1338 .minScalar(0, S16) 1339 .scalarize(0) 1340 .lower(); 1341 } else { 1342 // Make sure we legalize the shift amount type first, as the general 1343 // expansion for the shifted type will produce much worse code if it hasn't 1344 // been truncated already. 1345 Shifts.clampScalar(1, S32, S32); 1346 Shifts.clampScalar(0, S32, S64); 1347 Shifts.widenScalarToNextPow2(0, 32); 1348 1349 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1350 .minScalar(0, S32) 1351 .scalarize(0) 1352 .lower(); 1353 } 1354 Shifts.scalarize(0); 1355 1356 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1357 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1358 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1359 unsigned IdxTypeIdx = 2; 1360 1361 getActionDefinitionsBuilder(Op) 1362 .customIf([=](const LegalityQuery &Query) { 1363 const LLT EltTy = Query.Types[EltTypeIdx]; 1364 const LLT VecTy = Query.Types[VecTypeIdx]; 1365 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1366 const unsigned EltSize = EltTy.getSizeInBits(); 1367 return (EltSize == 32 || EltSize == 64) && 1368 VecTy.getSizeInBits() % 32 == 0 && 1369 VecTy.getSizeInBits() <= MaxRegisterSize && 1370 IdxTy.getSizeInBits() == 32; 1371 }) 1372 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1373 bitcastToVectorElement32(VecTypeIdx)) 1374 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1375 .bitcastIf( 1376 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1377 [=](const LegalityQuery &Query) { 1378 // For > 64-bit element types, try to turn this into a 64-bit 1379 // element vector since we may be able to do better indexing 1380 // if this is scalar. If not, fall back to 32. 1381 const LLT EltTy = Query.Types[EltTypeIdx]; 1382 const LLT VecTy = Query.Types[VecTypeIdx]; 1383 const unsigned DstEltSize = EltTy.getSizeInBits(); 1384 const unsigned VecSize = VecTy.getSizeInBits(); 1385 1386 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1387 return std::make_pair( 1388 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1389 }) 1390 .clampScalar(EltTypeIdx, S32, S64) 1391 .clampScalar(VecTypeIdx, S32, S64) 1392 .clampScalar(IdxTypeIdx, S32, S32) 1393 .clampMaxNumElements(VecTypeIdx, S32, 32) 1394 // TODO: Clamp elements for 64-bit vectors? 1395 // It should only be necessary with variable indexes. 
1396 // As a last resort, lower to the stack 1397 .lower(); 1398 } 1399 1400 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1401 .unsupportedIf([=](const LegalityQuery &Query) { 1402 const LLT &EltTy = Query.Types[1].getElementType(); 1403 return Query.Types[0] != EltTy; 1404 }); 1405 1406 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1407 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1408 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1409 1410 // FIXME: Doesn't handle extract of illegal sizes. 1411 getActionDefinitionsBuilder(Op) 1412 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1413 // FIXME: Multiples of 16 should not be legal. 1414 .legalIf([=](const LegalityQuery &Query) { 1415 const LLT BigTy = Query.Types[BigTyIdx]; 1416 const LLT LitTy = Query.Types[LitTyIdx]; 1417 return (BigTy.getSizeInBits() % 32 == 0) && 1418 (LitTy.getSizeInBits() % 16 == 0); 1419 }) 1420 .widenScalarIf( 1421 [=](const LegalityQuery &Query) { 1422 const LLT BigTy = Query.Types[BigTyIdx]; 1423 return (BigTy.getScalarSizeInBits() < 16); 1424 }, 1425 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1426 .widenScalarIf( 1427 [=](const LegalityQuery &Query) { 1428 const LLT LitTy = Query.Types[LitTyIdx]; 1429 return (LitTy.getScalarSizeInBits() < 16); 1430 }, 1431 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1432 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1433 .widenScalarToNextPow2(BigTyIdx, 32); 1434 1435 } 1436 1437 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1438 .legalForCartesianProduct(AllS32Vectors, {S32}) 1439 .legalForCartesianProduct(AllS64Vectors, {S64}) 1440 .clampNumElements(0, V16S32, V32S32) 1441 .clampNumElements(0, V2S64, V16S64) 1442 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1443 1444 if (ST.hasScalarPackInsts()) { 1445 BuildVector 1446 // FIXME: Should probably widen s1 vectors straight to s32 1447 .minScalarOrElt(0, S16) 1448 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1449 .minScalar(1, S32); 1450 1451 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1452 .legalFor({V2S16, S32}) 1453 .lower(); 1454 BuildVector.minScalarOrElt(0, S32); 1455 } else { 1456 BuildVector.customFor({V2S16, S16}); 1457 BuildVector.minScalarOrElt(0, S32); 1458 1459 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1460 .customFor({V2S16, S32}) 1461 .lower(); 1462 } 1463 1464 BuildVector.legalIf(isRegisterType(0)); 1465 1466 // FIXME: Clamp maximum size 1467 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1468 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1469 .clampMaxNumElements(0, S32, 32) 1470 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1471 .clampMaxNumElements(0, S16, 64); 1472 1473 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1474 // pre-legalize. 1475 if (ST.hasVOP3PInsts()) { 1476 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1477 .customFor({V2S16, V2S16}) 1478 .lower(); 1479 } else 1480 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1481 1482 // Merge/Unmerge 1483 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1484 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1485 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 
1 : 0; 1486 1487 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1488 const LLT Ty = Query.Types[TypeIdx]; 1489 if (Ty.isVector()) { 1490 const LLT &EltTy = Ty.getElementType(); 1491 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1492 return true; 1493 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1494 return true; 1495 } 1496 return false; 1497 }; 1498 1499 auto &Builder = getActionDefinitionsBuilder(Op) 1500 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1501 .lowerFor({{S16, V2S16}}) 1502 .lowerIf([=](const LegalityQuery &Query) { 1503 const LLT BigTy = Query.Types[BigTyIdx]; 1504 return BigTy.getSizeInBits() == 32; 1505 }) 1506 // Try to widen to s16 first for small types. 1507 // TODO: Only do this on targets with legal s16 shifts 1508 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1509 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1510 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1511 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1512 elementTypeIs(1, S16)), 1513 changeTo(1, V2S16)) 1514 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1515 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1516 // valid. 1517 .clampScalar(LitTyIdx, S32, S512) 1518 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1519 // Break up vectors with weird elements into scalars 1520 .fewerElementsIf( 1521 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1522 scalarize(0)) 1523 .fewerElementsIf( 1524 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1525 scalarize(1)) 1526 .clampScalar(BigTyIdx, S32, MaxScalar); 1527 1528 if (Op == G_MERGE_VALUES) { 1529 Builder.widenScalarIf( 1530 // TODO: Use 16-bit shifts if legal for 8-bit values? 1531 [=](const LegalityQuery &Query) { 1532 const LLT Ty = Query.Types[LitTyIdx]; 1533 return Ty.getSizeInBits() < 32; 1534 }, 1535 changeTo(LitTyIdx, S32)); 1536 } 1537 1538 Builder.widenScalarIf( 1539 [=](const LegalityQuery &Query) { 1540 const LLT Ty = Query.Types[BigTyIdx]; 1541 return !isPowerOf2_32(Ty.getSizeInBits()) && 1542 Ty.getSizeInBits() % 16 != 0; 1543 }, 1544 [=](const LegalityQuery &Query) { 1545 // Pick the next power of 2, or a multiple of 64 over 128. 1546 // Whichever is smaller. 1547 const LLT &Ty = Query.Types[BigTyIdx]; 1548 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1549 if (NewSizeInBits >= 256) { 1550 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1551 if (RoundedTo < NewSizeInBits) 1552 NewSizeInBits = RoundedTo; 1553 } 1554 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1555 }) 1556 // Any vectors left are the wrong size. Scalarize them. 1557 .scalarize(0) 1558 .scalarize(1); 1559 } 1560 1561 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1562 // RegBankSelect. 1563 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1564 .legalFor({{S32}, {S64}}); 1565 1566 if (ST.hasVOP3PInsts()) { 1567 SextInReg.lowerFor({{V2S16}}) 1568 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1569 // get more vector shift opportunities, since we'll get those when 1570 // expanded. 1571 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1572 } else if (ST.has16BitInsts()) { 1573 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1574 } else { 1575 // Prefer to promote to s32 before lowering if we don't have 16-bit 1576 // shifts. 
This avoid a lot of intermediate truncate and extend operations. 1577 SextInReg.lowerFor({{S32}, {S64}}); 1578 } 1579 1580 SextInReg 1581 .scalarize(0) 1582 .clampScalar(0, S32, S64) 1583 .lower(); 1584 1585 getActionDefinitionsBuilder(G_FSHR) 1586 .legalFor({{S32, S32}}) 1587 .scalarize(0) 1588 .lower(); 1589 1590 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1591 .legalFor({S64}); 1592 1593 getActionDefinitionsBuilder(G_FENCE) 1594 .alwaysLegal(); 1595 1596 getActionDefinitionsBuilder({ 1597 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1598 G_FCOPYSIGN, 1599 1600 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1601 G_ATOMICRMW_NAND, 1602 G_ATOMICRMW_FSUB, 1603 G_READ_REGISTER, 1604 G_WRITE_REGISTER, 1605 1606 G_SADDO, G_SSUBO, 1607 1608 // TODO: Implement 1609 G_FMINIMUM, G_FMAXIMUM, 1610 G_FSHL 1611 }).lower(); 1612 1613 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1614 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1615 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1616 .unsupported(); 1617 1618 computeTables(); 1619 verify(*ST.getInstrInfo()); 1620 } 1621 1622 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1623 MachineInstr &MI) const { 1624 MachineIRBuilder &B = Helper.MIRBuilder; 1625 MachineRegisterInfo &MRI = *B.getMRI(); 1626 1627 switch (MI.getOpcode()) { 1628 case TargetOpcode::G_ADDRSPACE_CAST: 1629 return legalizeAddrSpaceCast(MI, MRI, B); 1630 case TargetOpcode::G_FRINT: 1631 return legalizeFrint(MI, MRI, B); 1632 case TargetOpcode::G_FCEIL: 1633 return legalizeFceil(MI, MRI, B); 1634 case TargetOpcode::G_FREM: 1635 return legalizeFrem(MI, MRI, B); 1636 case TargetOpcode::G_INTRINSIC_TRUNC: 1637 return legalizeIntrinsicTrunc(MI, MRI, B); 1638 case TargetOpcode::G_SITOFP: 1639 return legalizeITOFP(MI, MRI, B, true); 1640 case TargetOpcode::G_UITOFP: 1641 return legalizeITOFP(MI, MRI, B, false); 1642 case TargetOpcode::G_FPTOSI: 1643 return legalizeFPTOI(MI, MRI, B, true); 1644 case TargetOpcode::G_FPTOUI: 1645 return legalizeFPTOI(MI, MRI, B, false); 1646 case TargetOpcode::G_FMINNUM: 1647 case TargetOpcode::G_FMAXNUM: 1648 case TargetOpcode::G_FMINNUM_IEEE: 1649 case TargetOpcode::G_FMAXNUM_IEEE: 1650 return legalizeMinNumMaxNum(Helper, MI); 1651 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1652 return legalizeExtractVectorElt(MI, MRI, B); 1653 case TargetOpcode::G_INSERT_VECTOR_ELT: 1654 return legalizeInsertVectorElt(MI, MRI, B); 1655 case TargetOpcode::G_SHUFFLE_VECTOR: 1656 return legalizeShuffleVector(MI, MRI, B); 1657 case TargetOpcode::G_FSIN: 1658 case TargetOpcode::G_FCOS: 1659 return legalizeSinCos(MI, MRI, B); 1660 case TargetOpcode::G_GLOBAL_VALUE: 1661 return legalizeGlobalValue(MI, MRI, B); 1662 case TargetOpcode::G_LOAD: 1663 return legalizeLoad(Helper, MI); 1664 case TargetOpcode::G_FMAD: 1665 return legalizeFMad(MI, MRI, B); 1666 case TargetOpcode::G_FDIV: 1667 return legalizeFDIV(MI, MRI, B); 1668 case TargetOpcode::G_UDIV: 1669 case TargetOpcode::G_UREM: 1670 return legalizeUDIV_UREM(MI, MRI, B); 1671 case TargetOpcode::G_SDIV: 1672 case TargetOpcode::G_SREM: 1673 return legalizeSDIV_SREM(MI, MRI, B); 1674 case TargetOpcode::G_ATOMIC_CMPXCHG: 1675 return legalizeAtomicCmpXChg(MI, MRI, B); 1676 case TargetOpcode::G_FLOG: 1677 return legalizeFlog(MI, B, numbers::ln2f); 1678 case TargetOpcode::G_FLOG10: 1679 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1680 case TargetOpcode::G_FEXP: 1681 return legalizeFExp(MI, B); 1682 case TargetOpcode::G_FPOW: 1683 return legalizeFPow(MI, B); 1684 case TargetOpcode::G_FFLOOR: 1685 
return legalizeFFloor(MI, MRI, B); 1686 case TargetOpcode::G_BUILD_VECTOR: 1687 return legalizeBuildVector(MI, MRI, B); 1688 default: 1689 return false; 1690 } 1691 1692 llvm_unreachable("expected switch to return"); 1693 } 1694 1695 Register AMDGPULegalizerInfo::getSegmentAperture( 1696 unsigned AS, 1697 MachineRegisterInfo &MRI, 1698 MachineIRBuilder &B) const { 1699 MachineFunction &MF = B.getMF(); 1700 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1701 const LLT S32 = LLT::scalar(32); 1702 1703 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1704 1705 if (ST.hasApertureRegs()) { 1706 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1707 // getreg. 1708 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1709 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1710 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1711 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1712 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1713 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1714 unsigned Encoding = 1715 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1716 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1717 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1718 1719 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1720 1721 B.buildInstr(AMDGPU::S_GETREG_B32) 1722 .addDef(GetReg) 1723 .addImm(Encoding); 1724 MRI.setType(GetReg, S32); 1725 1726 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1727 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1728 } 1729 1730 Register QueuePtr = MRI.createGenericVirtualRegister( 1731 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1732 1733 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1734 return Register(); 1735 1736 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1737 // private_segment_aperture_base_hi. 1738 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1739 1740 // TODO: can we be smarter about machine pointer info? 1741 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1742 MachineMemOperand *MMO = MF.getMachineMemOperand( 1743 PtrInfo, 1744 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1745 MachineMemOperand::MOInvariant, 1746 4, commonAlignment(Align(64), StructOffset)); 1747 1748 Register LoadAddr; 1749 1750 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1751 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1752 } 1753 1754 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1755 MachineInstr &MI, MachineRegisterInfo &MRI, 1756 MachineIRBuilder &B) const { 1757 MachineFunction &MF = B.getMF(); 1758 1759 const LLT S32 = LLT::scalar(32); 1760 Register Dst = MI.getOperand(0).getReg(); 1761 Register Src = MI.getOperand(1).getReg(); 1762 1763 LLT DstTy = MRI.getType(Dst); 1764 LLT SrcTy = MRI.getType(Src); 1765 unsigned DestAS = DstTy.getAddressSpace(); 1766 unsigned SrcAS = SrcTy.getAddressSpace(); 1767 1768 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1769 // vector element. 1770 assert(!DstTy.isVector()); 1771 1772 const AMDGPUTargetMachine &TM 1773 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1774 1775 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1776 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1777 return true; 1778 } 1779 1780 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1781 // Truncate. 
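// Casting to the 32-bit constant address space keeps only the low 32 bits of
// the 64-bit source pointer, so a G_EXTRACT at bit offset 0 is sufficient.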
1782 B.buildExtract(Dst, Src, 0); 1783 MI.eraseFromParent(); 1784 return true; 1785 } 1786 1787 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1788 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1789 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1790 1791 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1792 // another. Merge operands are required to be the same type, but creating an 1793 // extra ptrtoint would be kind of pointless. 1794 auto HighAddr = B.buildConstant( 1795 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1796 B.buildMerge(Dst, {Src, HighAddr}); 1797 MI.eraseFromParent(); 1798 return true; 1799 } 1800 1801 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1802 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1803 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1804 unsigned NullVal = TM.getNullPointerValue(DestAS); 1805 1806 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1807 auto FlatNull = B.buildConstant(SrcTy, 0); 1808 1809 // Extract low 32-bits of the pointer. 1810 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1811 1812 auto CmpRes = 1813 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1814 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1815 1816 MI.eraseFromParent(); 1817 return true; 1818 } 1819 1820 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1821 return false; 1822 1823 if (!ST.hasFlatAddressSpace()) 1824 return false; 1825 1826 auto SegmentNull = 1827 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1828 auto FlatNull = 1829 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1830 1831 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1832 if (!ApertureReg.isValid()) 1833 return false; 1834 1835 auto CmpRes = 1836 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1837 1838 // Coerce the type of the low half of the result so we can use merge_values. 1839 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1840 1841 // TODO: Should we allow mismatched types but matching sizes in merges to 1842 // avoid the ptrtoint? 1843 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1844 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1845 1846 MI.eraseFromParent(); 1847 return true; 1848 } 1849 1850 bool AMDGPULegalizerInfo::legalizeFrint( 1851 MachineInstr &MI, MachineRegisterInfo &MRI, 1852 MachineIRBuilder &B) const { 1853 Register Src = MI.getOperand(1).getReg(); 1854 LLT Ty = MRI.getType(Src); 1855 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1856 1857 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1858 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1859 1860 auto C1 = B.buildFConstant(Ty, C1Val); 1861 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1862 1863 // TODO: Should this propagate fast-math-flags? 
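// Rounding by magic constant: adding and then subtracting 2^52 (copysigned
// from the source) forces the fraction bits to be rounded off, i.e. roughly
// rint(x) = (x + c) - c with c = copysign(0x1.0p+52, x). Inputs with
// |x| > 0x1.fffffffffffffp+51 are already integral, so the select below
// returns them unchanged.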
1864 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1865 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1866 1867 auto C2 = B.buildFConstant(Ty, C2Val); 1868 auto Fabs = B.buildFAbs(Ty, Src); 1869 1870 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1871 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1872 MI.eraseFromParent(); 1873 return true; 1874 } 1875 1876 bool AMDGPULegalizerInfo::legalizeFceil( 1877 MachineInstr &MI, MachineRegisterInfo &MRI, 1878 MachineIRBuilder &B) const { 1879 1880 const LLT S1 = LLT::scalar(1); 1881 const LLT S64 = LLT::scalar(64); 1882 1883 Register Src = MI.getOperand(1).getReg(); 1884 assert(MRI.getType(Src) == S64); 1885 1886 // result = trunc(src) 1887 // if (src > 0.0 && src != result) 1888 // result += 1.0 1889 1890 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1891 1892 const auto Zero = B.buildFConstant(S64, 0.0); 1893 const auto One = B.buildFConstant(S64, 1.0); 1894 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1895 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1896 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1897 auto Add = B.buildSelect(S64, And, One, Zero); 1898 1899 // TODO: Should this propagate fast-math-flags? 1900 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1901 return true; 1902 } 1903 1904 bool AMDGPULegalizerInfo::legalizeFrem( 1905 MachineInstr &MI, MachineRegisterInfo &MRI, 1906 MachineIRBuilder &B) const { 1907 Register DstReg = MI.getOperand(0).getReg(); 1908 Register Src0Reg = MI.getOperand(1).getReg(); 1909 Register Src1Reg = MI.getOperand(2).getReg(); 1910 auto Flags = MI.getFlags(); 1911 LLT Ty = MRI.getType(DstReg); 1912 1913 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1914 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1915 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1916 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1917 MI.eraseFromParent(); 1918 return true; 1919 } 1920 1921 static MachineInstrBuilder extractF64Exponent(Register Hi, 1922 MachineIRBuilder &B) { 1923 const unsigned FractBits = 52; 1924 const unsigned ExpBits = 11; 1925 LLT S32 = LLT::scalar(32); 1926 1927 auto Const0 = B.buildConstant(S32, FractBits - 32); 1928 auto Const1 = B.buildConstant(S32, ExpBits); 1929 1930 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1931 .addUse(Hi) 1932 .addUse(Const0.getReg(0)) 1933 .addUse(Const1.getReg(0)); 1934 1935 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1936 } 1937 1938 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1939 MachineInstr &MI, MachineRegisterInfo &MRI, 1940 MachineIRBuilder &B) const { 1941 const LLT S1 = LLT::scalar(1); 1942 const LLT S32 = LLT::scalar(32); 1943 const LLT S64 = LLT::scalar(64); 1944 1945 Register Src = MI.getOperand(1).getReg(); 1946 assert(MRI.getType(Src) == S64); 1947 1948 // TODO: Should this use extract since the low half is unused? 1949 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1950 Register Hi = Unmerge.getReg(1); 1951 1952 // Extract the upper half, since this is where we will find the sign and 1953 // exponent. 1954 auto Exp = extractF64Exponent(Hi, B); 1955 1956 const unsigned FractBits = 52; 1957 1958 // Extract the sign bit. 1959 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1960 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1961 1962 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1963 1964 const auto Zero32 = B.buildConstant(S32, 0); 1965 1966 // Extend back to 64-bits. 
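// Truncation works on the mask built below: for an unbiased exponent Exp in
// [0, 51] the low 52 - Exp fraction bits are fractional, ashr'ing FractMask
// by Exp produces exactly that mask, and clearing those bits in the source
// truncates toward zero. Exp < 0 means |x| < 1 so only the sign is kept;
// Exp > 51 means the value is already an integer and is returned as-is.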
1967 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1968 1969 auto Shr = B.buildAShr(S64, FractMask, Exp); 1970 auto Not = B.buildNot(S64, Shr); 1971 auto Tmp0 = B.buildAnd(S64, Src, Not); 1972 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1973 1974 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1975 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1976 1977 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1978 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 bool AMDGPULegalizerInfo::legalizeITOFP( 1984 MachineInstr &MI, MachineRegisterInfo &MRI, 1985 MachineIRBuilder &B, bool Signed) const { 1986 1987 Register Dst = MI.getOperand(0).getReg(); 1988 Register Src = MI.getOperand(1).getReg(); 1989 1990 const LLT S64 = LLT::scalar(64); 1991 const LLT S32 = LLT::scalar(32); 1992 1993 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1994 1995 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1996 1997 auto CvtHi = Signed ? 1998 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1999 B.buildUITOFP(S64, Unmerge.getReg(1)); 2000 2001 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2002 2003 auto ThirtyTwo = B.buildConstant(S32, 32); 2004 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2005 .addUse(CvtHi.getReg(0)) 2006 .addUse(ThirtyTwo.getReg(0)); 2007 2008 // TODO: Should this propagate fast-math-flags? 2009 B.buildFAdd(Dst, LdExp, CvtLo); 2010 MI.eraseFromParent(); 2011 return true; 2012 } 2013 2014 // TODO: Copied from DAG implementation. Verify logic and document how this 2015 // actually works. 2016 bool AMDGPULegalizerInfo::legalizeFPTOI( 2017 MachineInstr &MI, MachineRegisterInfo &MRI, 2018 MachineIRBuilder &B, bool Signed) const { 2019 2020 Register Dst = MI.getOperand(0).getReg(); 2021 Register Src = MI.getOperand(1).getReg(); 2022 2023 const LLT S64 = LLT::scalar(64); 2024 const LLT S32 = LLT::scalar(32); 2025 2026 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2027 2028 unsigned Flags = MI.getFlags(); 2029 2030 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2031 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2032 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2033 2034 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2035 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2036 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2037 2038 auto Hi = Signed ? 
2039 B.buildFPTOSI(S32, FloorMul) : 2040 B.buildFPTOUI(S32, FloorMul); 2041 auto Lo = B.buildFPTOUI(S32, Fma); 2042 2043 B.buildMerge(Dst, { Lo, Hi }); 2044 MI.eraseFromParent(); 2045 2046 return true; 2047 } 2048 2049 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2050 MachineInstr &MI) const { 2051 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2052 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2053 2054 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2055 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2056 2057 // With ieee_mode disabled, the instructions have the correct behavior 2058 // already for G_FMINNUM/G_FMAXNUM 2059 if (!MFI->getMode().IEEE) 2060 return !IsIEEEOp; 2061 2062 if (IsIEEEOp) 2063 return true; 2064 2065 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2066 } 2067 2068 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2069 MachineInstr &MI, MachineRegisterInfo &MRI, 2070 MachineIRBuilder &B) const { 2071 // TODO: Should move some of this into LegalizerHelper. 2072 2073 // TODO: Promote dynamic indexing of s16 to s32 2074 2075 // FIXME: Artifact combiner probably should have replaced the truncated 2076 // constant before this, so we shouldn't need 2077 // getConstantVRegValWithLookThrough. 2078 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2079 MI.getOperand(2).getReg(), MRI); 2080 if (!IdxVal) // Dynamic case will be selected to register indexing. 2081 return true; 2082 2083 Register Dst = MI.getOperand(0).getReg(); 2084 Register Vec = MI.getOperand(1).getReg(); 2085 2086 LLT VecTy = MRI.getType(Vec); 2087 LLT EltTy = VecTy.getElementType(); 2088 assert(EltTy == MRI.getType(Dst)); 2089 2090 if (IdxVal->Value < VecTy.getNumElements()) 2091 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2092 else 2093 B.buildUndef(Dst); 2094 2095 MI.eraseFromParent(); 2096 return true; 2097 } 2098 2099 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2100 MachineInstr &MI, MachineRegisterInfo &MRI, 2101 MachineIRBuilder &B) const { 2102 // TODO: Should move some of this into LegalizerHelper. 2103 2104 // TODO: Promote dynamic indexing of s16 to s32 2105 2106 // FIXME: Artifact combiner probably should have replaced the truncated 2107 // constant before this, so we shouldn't need 2108 // getConstantVRegValWithLookThrough. 2109 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2110 MI.getOperand(3).getReg(), MRI); 2111 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2112 return true; 2113 2114 Register Dst = MI.getOperand(0).getReg(); 2115 Register Vec = MI.getOperand(1).getReg(); 2116 Register Ins = MI.getOperand(2).getReg(); 2117 2118 LLT VecTy = MRI.getType(Vec); 2119 LLT EltTy = VecTy.getElementType(); 2120 assert(EltTy == MRI.getType(Ins)); 2121 2122 if (IdxVal->Value < VecTy.getNumElements()) 2123 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2124 else 2125 B.buildUndef(Dst); 2126 2127 MI.eraseFromParent(); 2128 return true; 2129 } 2130 2131 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2132 MachineInstr &MI, MachineRegisterInfo &MRI, 2133 MachineIRBuilder &B) const { 2134 const LLT V2S16 = LLT::vector(2, 16); 2135 2136 Register Dst = MI.getOperand(0).getReg(); 2137 Register Src0 = MI.getOperand(1).getReg(); 2138 LLT DstTy = MRI.getType(Dst); 2139 LLT SrcTy = MRI.getType(Src0); 2140 2141 if (SrcTy == V2S16 && DstTy == V2S16 && 2142 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2143 return true; 2144 2145 MachineIRBuilder HelperBuilder(MI); 2146 GISelObserverWrapper DummyObserver; 2147 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2148 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2149 } 2150 2151 bool AMDGPULegalizerInfo::legalizeSinCos( 2152 MachineInstr &MI, MachineRegisterInfo &MRI, 2153 MachineIRBuilder &B) const { 2154 2155 Register DstReg = MI.getOperand(0).getReg(); 2156 Register SrcReg = MI.getOperand(1).getReg(); 2157 LLT Ty = MRI.getType(DstReg); 2158 unsigned Flags = MI.getFlags(); 2159 2160 Register TrigVal; 2161 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2162 if (ST.hasTrigReducedRange()) { 2163 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2164 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2165 .addUse(MulVal.getReg(0)) 2166 .setMIFlags(Flags).getReg(0); 2167 } else 2168 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2169 2170 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2171 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2172 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2173 .addUse(TrigVal) 2174 .setMIFlags(Flags); 2175 MI.eraseFromParent(); 2176 return true; 2177 } 2178 2179 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2180 MachineIRBuilder &B, 2181 const GlobalValue *GV, 2182 int64_t Offset, 2183 unsigned GAFlags) const { 2184 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2185 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2186 // to the following code sequence: 2187 // 2188 // For constant address space: 2189 // s_getpc_b64 s[0:1] 2190 // s_add_u32 s0, s0, $symbol 2191 // s_addc_u32 s1, s1, 0 2192 // 2193 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2194 // a fixup or relocation is emitted to replace $symbol with a literal 2195 // constant, which is a pc-relative offset from the encoding of the $symbol 2196 // operand to the global variable. 
2197 // 2198 // For global address space: 2199 // s_getpc_b64 s[0:1] 2200 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2201 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2202 // 2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2204 // fixups or relocations are emitted to replace $symbol@*@lo and 2205 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2206 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2207 // operand to the global variable. 2208 // 2209 // What we want here is an offset from the value returned by s_getpc 2210 // (which is the address of the s_add_u32 instruction) to the global 2211 // variable, but since the encoding of $symbol starts 4 bytes after the start 2212 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2213 // small. This requires us to add 4 to the global variable offset in order to 2214 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2215 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2216 // instruction. 2217 2218 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2219 2220 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2221 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2222 2223 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2224 .addDef(PCReg); 2225 2226 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2227 if (GAFlags == SIInstrInfo::MO_NONE) 2228 MIB.addImm(0); 2229 else 2230 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2231 2232 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2233 2234 if (PtrTy.getSizeInBits() == 32) 2235 B.buildExtract(DstReg, PCReg, 0); 2236 return true; 2237 } 2238 2239 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2240 MachineInstr &MI, MachineRegisterInfo &MRI, 2241 MachineIRBuilder &B) const { 2242 Register DstReg = MI.getOperand(0).getReg(); 2243 LLT Ty = MRI.getType(DstReg); 2244 unsigned AS = Ty.getAddressSpace(); 2245 2246 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2247 MachineFunction &MF = B.getMF(); 2248 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2249 2250 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2251 if (!MFI->isEntryFunction()) { 2252 const Function &Fn = MF.getFunction(); 2253 DiagnosticInfoUnsupported BadLDSDecl( 2254 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2255 DS_Warning); 2256 Fn.getContext().diagnose(BadLDSDecl); 2257 2258 // We currently don't have a way to correctly allocate LDS objects that 2259 // aren't directly associated with a kernel. We do force inlining of 2260 // functions that use local objects. However, if these dead functions are 2261 // not eliminated, we don't want a compile time error. Just emit a warning 2262 // and a trap, since there should be no callable path here. 2263 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2264 B.buildUndef(DstReg); 2265 MI.eraseFromParent(); 2266 return true; 2267 } 2268 2269 // TODO: We could emit code to handle the initialization somewhere. 
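// LDS globals are not allocated or relocated by the loader; each variable
// used by a kernel is instead assigned a static byte offset within the group
// segment, so its "address" is simply that offset built as a constant below
// (see allocateLDSGlobal).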
2270 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2271 const SITargetLowering *TLI = ST.getTargetLowering(); 2272 if (!TLI->shouldUseLDSConstAddress(GV)) { 2273 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2274 return true; // Leave in place. 2275 } 2276 2277 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2278 Type *Ty = GV->getValueType(); 2279 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2280 // zero-sized type in other languages to declare the dynamic shared 2281 // memory whose size is not known at compile time. They will be 2282 // allocated by the runtime and placed directly after the statically 2283 // allocated ones. They all share the same offset. 2284 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2285 // Adjust alignment for that dynamic shared memory array. 2286 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2287 LLT S32 = LLT::scalar(32); 2288 auto Sz = 2289 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2290 B.buildIntToPtr(DstReg, Sz); 2291 MI.eraseFromParent(); 2292 return true; 2293 } 2294 } 2295 2296 B.buildConstant( 2297 DstReg, 2298 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2299 MI.eraseFromParent(); 2300 return true; 2301 } 2302 2303 const Function &Fn = MF.getFunction(); 2304 DiagnosticInfoUnsupported BadInit( 2305 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2306 Fn.getContext().diagnose(BadInit); 2307 return true; 2308 } 2309 2310 const SITargetLowering *TLI = ST.getTargetLowering(); 2311 2312 if (TLI->shouldEmitFixup(GV)) { 2313 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2314 MI.eraseFromParent(); 2315 return true; 2316 } 2317 2318 if (TLI->shouldEmitPCReloc(GV)) { 2319 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2320 MI.eraseFromParent(); 2321 return true; 2322 } 2323 2324 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2325 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2326 2327 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2328 MachinePointerInfo::getGOT(MF), 2329 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2330 MachineMemOperand::MOInvariant, 2331 8 /*Size*/, Align(8)); 2332 2333 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2334 2335 if (Ty.getSizeInBits() == 32) { 2336 // Truncate if this is a 32-bit constant address.
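// The GOT entry itself is always a 64-bit constant-address pointer; for a
// 32-bit result, load the full entry and keep only its low half.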
2337 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2338 B.buildExtract(DstReg, Load, 0); 2339 } else 2340 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2341 2342 MI.eraseFromParent(); 2343 return true; 2344 } 2345 2346 static LLT widenToNextPowerOf2(LLT Ty) { 2347 if (Ty.isVector()) 2348 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2349 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2350 } 2351 2352 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2353 MachineInstr &MI) const { 2354 MachineIRBuilder &B = Helper.MIRBuilder; 2355 MachineRegisterInfo &MRI = *B.getMRI(); 2356 GISelChangeObserver &Observer = Helper.Observer; 2357 2358 Register PtrReg = MI.getOperand(1).getReg(); 2359 LLT PtrTy = MRI.getType(PtrReg); 2360 unsigned AddrSpace = PtrTy.getAddressSpace(); 2361 2362 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2363 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2364 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2365 Observer.changingInstr(MI); 2366 MI.getOperand(1).setReg(Cast.getReg(0)); 2367 Observer.changedInstr(MI); 2368 return true; 2369 } 2370 2371 Register ValReg = MI.getOperand(0).getReg(); 2372 LLT ValTy = MRI.getType(ValReg); 2373 2374 MachineMemOperand *MMO = *MI.memoperands_begin(); 2375 const unsigned ValSize = ValTy.getSizeInBits(); 2376 const unsigned MemSize = 8 * MMO->getSize(); 2377 const Align MemAlign = MMO->getAlign(); 2378 const unsigned AlignInBits = 8 * MemAlign.value(); 2379 2380 // Widen non-power-of-2 loads to the alignment if needed 2381 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2382 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2383 2384 // This was already the correct extending load result type, so just adjust 2385 // the memory type. 2386 if (WideMemSize == ValSize) { 2387 MachineFunction &MF = B.getMF(); 2388 2389 MachineMemOperand *WideMMO = 2390 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2391 Observer.changingInstr(MI); 2392 MI.setMemRefs(MF, {WideMMO}); 2393 Observer.changedInstr(MI); 2394 return true; 2395 } 2396 2397 // Don't bother handling an edge case that should probably never be produced. 2398 if (ValSize > WideMemSize) 2399 return false; 2400 2401 LLT WideTy = widenToNextPowerOf2(ValTy); 2402 2403 Register WideLoad; 2404 if (!WideTy.isVector()) { 2405 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2406 B.buildTrunc(ValReg, WideLoad).getReg(0); 2407 } else { 2408 // Extract the subvector. 2409 2410 if (isRegisterType(ValTy)) { 2411 // If this is a case where G_EXTRACT is legal, use it. 2412 // (e.g. <3 x s32> -> <4 x s32>) 2413 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2414 B.buildExtract(ValReg, WideLoad, 0); 2415 } else { 2416 // For cases where the widened type isn't a nice register value, unmerge 2417 // from a widened register (e.g.
<3 x s16> -> <4 x s16>) 2418 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2419 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2420 B.setInsertPt(B.getMBB(), MI.getIterator()); 2421 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2422 } 2423 } 2424 2425 MI.eraseFromParent(); 2426 return true; 2427 } 2428 2429 return false; 2430 } 2431 2432 bool AMDGPULegalizerInfo::legalizeFMad( 2433 MachineInstr &MI, MachineRegisterInfo &MRI, 2434 MachineIRBuilder &B) const { 2435 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2436 assert(Ty.isScalar()); 2437 2438 MachineFunction &MF = B.getMF(); 2439 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2440 2441 // TODO: Always legal with future ftz flag. 2442 // FIXME: Do we need just output? 2443 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2444 return true; 2445 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2446 return true; 2447 2448 MachineIRBuilder HelperBuilder(MI); 2449 GISelObserverWrapper DummyObserver; 2450 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2451 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2452 } 2453 2454 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2455 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2456 Register DstReg = MI.getOperand(0).getReg(); 2457 Register PtrReg = MI.getOperand(1).getReg(); 2458 Register CmpVal = MI.getOperand(2).getReg(); 2459 Register NewVal = MI.getOperand(3).getReg(); 2460 2461 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2462 "this should not have been custom lowered"); 2463 2464 LLT ValTy = MRI.getType(CmpVal); 2465 LLT VecTy = LLT::vector(2, ValTy); 2466 2467 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2468 2469 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2470 .addDef(DstReg) 2471 .addUse(PtrReg) 2472 .addUse(PackedVal) 2473 .setMemRefs(MI.memoperands()); 2474 2475 MI.eraseFromParent(); 2476 return true; 2477 } 2478 2479 bool AMDGPULegalizerInfo::legalizeFlog( 2480 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2481 Register Dst = MI.getOperand(0).getReg(); 2482 Register Src = MI.getOperand(1).getReg(); 2483 LLT Ty = B.getMRI()->getType(Dst); 2484 unsigned Flags = MI.getFlags(); 2485 2486 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2487 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2488 2489 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2490 MI.eraseFromParent(); 2491 return true; 2492 } 2493 2494 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2495 MachineIRBuilder &B) const { 2496 Register Dst = MI.getOperand(0).getReg(); 2497 Register Src = MI.getOperand(1).getReg(); 2498 unsigned Flags = MI.getFlags(); 2499 LLT Ty = B.getMRI()->getType(Dst); 2500 2501 auto K = B.buildFConstant(Ty, numbers::log2e); 2502 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2503 B.buildFExp2(Dst, Mul, Flags); 2504 MI.eraseFromParent(); 2505 return true; 2506 } 2507 2508 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2509 MachineIRBuilder &B) const { 2510 Register Dst = MI.getOperand(0).getReg(); 2511 Register Src0 = MI.getOperand(1).getReg(); 2512 Register Src1 = MI.getOperand(2).getReg(); 2513 unsigned Flags = MI.getFlags(); 2514 LLT Ty = B.getMRI()->getType(Dst); 2515 const LLT S16 = LLT::scalar(16); 2516 const LLT S32 = LLT::scalar(32); 2517 2518 if (Ty == S32) { 2519 auto Log = B.buildFLog2(S32, Src0, Flags); 2520 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2521 .addUse(Log.getReg(0)) 2522 .addUse(Src1) 2523 .setMIFlags(Flags); 2524 B.buildFExp2(Dst, Mul, Flags); 2525 } else if (Ty == S16) { 2526 // There's no f16 fmul_legacy, so we need to convert for it. 2527 auto Log = B.buildFLog2(S16, Src0, Flags); 2528 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2529 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2530 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2531 .addUse(Ext0.getReg(0)) 2532 .addUse(Ext1.getReg(0)) 2533 .setMIFlags(Flags); 2534 2535 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2536 } else 2537 return false; 2538 2539 MI.eraseFromParent(); 2540 return true; 2541 } 2542 2543 // Find a source register, ignoring any possible source modifiers. 2544 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2545 Register ModSrc = OrigSrc; 2546 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2547 ModSrc = SrcFNeg->getOperand(1).getReg(); 2548 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2549 ModSrc = SrcFAbs->getOperand(1).getReg(); 2550 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2551 ModSrc = SrcFAbs->getOperand(1).getReg(); 2552 return ModSrc; 2553 } 2554 2555 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2556 MachineRegisterInfo &MRI, 2557 MachineIRBuilder &B) const { 2558 2559 const LLT S1 = LLT::scalar(1); 2560 const LLT S64 = LLT::scalar(64); 2561 Register Dst = MI.getOperand(0).getReg(); 2562 Register OrigSrc = MI.getOperand(1).getReg(); 2563 unsigned Flags = MI.getFlags(); 2564 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2565 "this should not have been custom lowered"); 2566 2567 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2568 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2569 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2570 // V_FRACT bug is: 2571 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2572 // 2573 // Convert floor(x) to (x - fract(x)) 2574 2575 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2576 .addUse(OrigSrc) 2577 .setMIFlags(Flags); 2578 2579 // Give source modifier matching some assistance before obscuring a foldable 2580 // pattern. 2581 2582 // TODO: We can avoid the neg on the fract? The input sign to fract 2583 // shouldn't matter? 2584 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2585 2586 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2587 2588 Register Min = MRI.createGenericVirtualRegister(S64); 2589 2590 // We don't need to concern ourselves with the snan handling difference, so 2591 // use the one which will directly select. 
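// 0x3fefffffffffffff is the largest double below 1.0, i.e. the
// 0.99999999999999999 clamp from the workaround formula above. IEEE mode only
// changes which min flavor can be selected directly.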
2592 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2593 if (MFI->getMode().IEEE) 2594 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2595 else 2596 B.buildFMinNum(Min, Fract, Const, Flags); 2597 2598 Register CorrectedFract = Min; 2599 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2600 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2601 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2602 } 2603 2604 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2605 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2606 2607 MI.eraseFromParent(); 2608 return true; 2609 } 2610 2611 // Turn an illegal packed v2s16 build vector into bit operations. 2612 // TODO: This should probably be a bitcast action in LegalizerHelper. 2613 bool AMDGPULegalizerInfo::legalizeBuildVector( 2614 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2615 Register Dst = MI.getOperand(0).getReg(); 2616 const LLT S32 = LLT::scalar(32); 2617 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2618 2619 Register Src0 = MI.getOperand(1).getReg(); 2620 Register Src1 = MI.getOperand(2).getReg(); 2621 assert(MRI.getType(Src0) == LLT::scalar(16)); 2622 2623 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2624 B.buildBitcast(Dst, Merge); 2625 2626 MI.eraseFromParent(); 2627 return true; 2628 } 2629 2630 // Check that this is a G_XOR x, -1 2631 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2632 if (MI.getOpcode() != TargetOpcode::G_XOR) 2633 return false; 2634 auto ConstVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 2635 return ConstVal && *ConstVal == -1; 2636 } 2637 2638 // Return the use branch instruction, otherwise null if the usage is invalid. 2639 static MachineInstr * 2640 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2641 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2642 Register CondDef = MI.getOperand(0).getReg(); 2643 if (!MRI.hasOneNonDBGUse(CondDef)) 2644 return nullptr; 2645 2646 MachineBasicBlock *Parent = MI.getParent(); 2647 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2648 2649 if (isNot(MRI, *UseMI)) { 2650 Register NegatedCond = UseMI->getOperand(0).getReg(); 2651 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2652 return nullptr; 2653 2654 // We're deleting the def of this value, so we need to remove it. 2655 UseMI->eraseFromParent(); 2656 2657 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2658 Negated = true; 2659 } 2660 2661 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2662 return nullptr; 2663 2664 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2665 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2666 if (Next == Parent->end()) { 2667 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2668 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2669 return nullptr; 2670 UncondBrTarget = &*NextMBB; 2671 } else { 2672 if (Next->getOpcode() != AMDGPU::G_BR) 2673 return nullptr; 2674 Br = &*Next; 2675 UncondBrTarget = Br->getOperand(0).getMBB(); 2676 } 2677 2678 return UseMI; 2679 } 2680 2681 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2682 const ArgDescriptor *Arg, 2683 const TargetRegisterClass *ArgRC, 2684 LLT ArgTy) const { 2685 MCRegister SrcReg = Arg->getRegister(); 2686 assert(SrcReg.isPhysical() && "Physical register expected"); 2687 assert(DstReg.isVirtual() && "Virtual register expected"); 2688 2689 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2690 ArgTy); 2691 if (Arg->isMasked()) { 2692 // TODO: Should we try to emit this once in the entry block? 2693 const LLT S32 = LLT::scalar(32); 2694 const unsigned Mask = Arg->getMask(); 2695 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2696 2697 Register AndMaskSrc = LiveIn; 2698 2699 if (Shift != 0) { 2700 auto ShiftAmt = B.buildConstant(S32, Shift); 2701 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2702 } 2703 2704 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2705 } else { 2706 B.buildCopy(DstReg, LiveIn); 2707 } 2708 2709 return true; 2710 } 2711 2712 bool AMDGPULegalizerInfo::loadInputValue( 2713 Register DstReg, MachineIRBuilder &B, 2714 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2715 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2716 const ArgDescriptor *Arg; 2717 const TargetRegisterClass *ArgRC; 2718 LLT ArgTy; 2719 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2720 2721 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2722 return false; // TODO: Handle these 2723 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2724 } 2725 2726 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2727 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2728 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2729 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2730 return false; 2731 2732 MI.eraseFromParent(); 2733 return true; 2734 } 2735 2736 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2737 MachineRegisterInfo &MRI, 2738 MachineIRBuilder &B) const { 2739 Register Dst = MI.getOperand(0).getReg(); 2740 LLT DstTy = MRI.getType(Dst); 2741 LLT S16 = LLT::scalar(16); 2742 LLT S32 = LLT::scalar(32); 2743 LLT S64 = LLT::scalar(64); 2744 2745 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2746 return true; 2747 2748 if (DstTy == S16) 2749 return legalizeFDIV16(MI, MRI, B); 2750 if (DstTy == S32) 2751 return legalizeFDIV32(MI, MRI, B); 2752 if (DstTy == S64) 2753 return legalizeFDIV64(MI, MRI, B); 2754 2755 return false; 2756 } 2757 2758 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2759 Register DstReg, 2760 Register X, 2761 Register Y, 2762 bool IsDiv) const { 2763 const LLT S1 = LLT::scalar(1); 2764 const LLT S32 = LLT::scalar(32); 2765 2766 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2767 // algorithm used here. 2768 2769 // Initial estimate of inv(y). 2770 auto FloatY = B.buildUITOFP(S32, Y); 2771 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2772 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2773 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2774 auto Z = B.buildFPTOUI(S32, ScaledY); 2775 2776 // One round of UNR. 
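// (UNR = unsigned Newton-Raphson.) With z ~= 2^32 / y, one refinement step is
//   z' = z + umulh(z, -y * z)
// which is z * (2 - y * z / 2^32) evaluated in 32-bit fixed point.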
2777 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2778 auto NegYZ = B.buildMul(S32, NegY, Z); 2779 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2780 2781 // Quotient/remainder estimate. 2782 auto Q = B.buildUMulH(S32, X, Z); 2783 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2784 2785 // First quotient/remainder refinement. 2786 auto One = B.buildConstant(S32, 1); 2787 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2788 if (IsDiv) 2789 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2790 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2791 2792 // Second quotient/remainder refinement. 2793 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2794 if (IsDiv) 2795 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2796 else 2797 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2798 } 2799 2800 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2801 MachineRegisterInfo &MRI, 2802 MachineIRBuilder &B) const { 2803 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2804 Register DstReg = MI.getOperand(0).getReg(); 2805 Register Num = MI.getOperand(1).getReg(); 2806 Register Den = MI.getOperand(2).getReg(); 2807 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2808 MI.eraseFromParent(); 2809 return true; 2810 } 2811 2812 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2813 // 2814 // Return lo, hi of result 2815 // 2816 // %cvt.lo = G_UITOFP Val.lo 2817 // %cvt.hi = G_UITOFP Val.hi 2818 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2819 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2820 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2821 // %mul2 = G_FMUL %mul1, 2**(-32) 2822 // %trunc = G_INTRINSIC_TRUNC %mul2 2823 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2824 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2825 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2826 Register Val) { 2827 const LLT S32 = LLT::scalar(32); 2828 auto Unmerge = B.buildUnmerge(S32, Val); 2829 2830 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2831 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2832 2833 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2834 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2835 2836 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2837 auto Mul1 = 2838 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2839 2840 // 2**(-32) 2841 auto Mul2 = 2842 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2843 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2844 2845 // -(2**32) 2846 auto Mad2 = B.buildFMAD(S32, Trunc, 2847 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2848 2849 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2850 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2851 2852 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2853 } 2854 2855 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2856 Register DstReg, 2857 Register Numer, 2858 Register Denom, 2859 bool IsDiv) const { 2860 const LLT S32 = LLT::scalar(32); 2861 const LLT S64 = LLT::scalar(64); 2862 const LLT S1 = LLT::scalar(1); 2863 Register RcpLo, RcpHi; 2864 2865 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2866 2867 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2868 2869 auto Zero64 = B.buildConstant(S64, 0); 2870 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2871 2872 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2873 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2874 2875 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2876 Register
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2877 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2878 2879 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2880 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2881 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2882 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2883 2884 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2885 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2886 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2887 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2888 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2889 2890 auto Zero32 = B.buildConstant(S32, 0); 2891 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2892 auto Add2_HiC = 2893 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2894 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2895 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2896 2897 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2898 Register NumerLo = UnmergeNumer.getReg(0); 2899 Register NumerHi = UnmergeNumer.getReg(1); 2900 2901 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2902 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2903 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2904 Register Mul3_Lo = UnmergeMul3.getReg(0); 2905 Register Mul3_Hi = UnmergeMul3.getReg(1); 2906 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2907 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2908 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2909 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2910 2911 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2912 Register DenomLo = UnmergeDenom.getReg(0); 2913 Register DenomHi = UnmergeDenom.getReg(1); 2914 2915 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2916 auto C1 = B.buildSExt(S32, CmpHi); 2917 2918 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2919 auto C2 = B.buildSExt(S32, CmpLo); 2920 2921 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2922 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2923 2924 // TODO: Here and below portions of the code can be enclosed into if/endif. 2925 // Currently control flow is unconditional and we have 4 selects after 2926 // potential endif to substitute PHIs. 2927 2928 // if C3 != 0 ... 
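// Both corrections are computed speculatively: Sub2/Add3 below are the
// "remainder - denominator" / "quotient + 1" values that the selects at the
// end pick from when C3 (and then C6) say the estimate was too small.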
2929 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2930 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2931 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2932 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2933 2934 auto One64 = B.buildConstant(S64, 1); 2935 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2936 2937 auto C4 = 2938 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2939 auto C5 = 2940 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2941 auto C6 = B.buildSelect( 2942 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2943 2944 // if (C6 != 0) 2945 auto Add4 = B.buildAdd(S64, Add3, One64); 2946 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2947 2948 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2949 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2950 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2951 2952 // endif C6 2953 // endif C3 2954 2955 if (IsDiv) { 2956 auto Sel1 = B.buildSelect( 2957 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2958 B.buildSelect(DstReg, 2959 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2960 } else { 2961 auto Sel2 = B.buildSelect( 2962 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2963 B.buildSelect(DstReg, 2964 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2965 } 2966 } 2967 2968 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2969 MachineRegisterInfo &MRI, 2970 MachineIRBuilder &B) const { 2971 const LLT S64 = LLT::scalar(64); 2972 const LLT S32 = LLT::scalar(32); 2973 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2974 Register DstReg = MI.getOperand(0).getReg(); 2975 Register Num = MI.getOperand(1).getReg(); 2976 Register Den = MI.getOperand(2).getReg(); 2977 LLT Ty = MRI.getType(DstReg); 2978 2979 if (Ty == S32) 2980 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2981 else if (Ty == S64) 2982 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2983 else 2984 return false; 2985 2986 MI.eraseFromParent(); 2987 return true; 2988 2989 } 2990 2991 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2992 MachineRegisterInfo &MRI, 2993 MachineIRBuilder &B) const { 2994 const LLT S64 = LLT::scalar(64); 2995 const LLT S32 = LLT::scalar(32); 2996 2997 Register DstReg = MI.getOperand(0).getReg(); 2998 const LLT Ty = MRI.getType(DstReg); 2999 if (Ty != S32 && Ty != S64) 3000 return false; 3001 3002 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3003 3004 Register LHS = MI.getOperand(1).getReg(); 3005 Register RHS = MI.getOperand(2).getReg(); 3006 3007 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3008 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3009 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3010 3011 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3012 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3013 3014 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3015 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3016 3017 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3018 if (Ty == S32) 3019 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3020 else 3021 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3022 3023 Register Sign; 3024 if (IsDiv) 3025 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3026 else 3027 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3028 3029 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3030 B.buildSub(DstReg, UDivRem, Sign); 3031 3032 MI.eraseFromParent(); 3033 return true; 3034 } 3035 3036 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3037 MachineRegisterInfo &MRI, 3038 MachineIRBuilder &B) const { 3039 Register Res = MI.getOperand(0).getReg(); 3040 Register LHS = MI.getOperand(1).getReg(); 3041 Register RHS = MI.getOperand(2).getReg(); 3042 3043 uint16_t Flags = MI.getFlags(); 3044 3045 LLT ResTy = MRI.getType(Res); 3046 LLT S32 = LLT::scalar(32); 3047 LLT S64 = LLT::scalar(64); 3048 3049 const MachineFunction &MF = B.getMF(); 3050 bool Unsafe = 3051 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 3052 3053 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 3054 return false; 3055 3056 if (!Unsafe && ResTy == S32 && 3057 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 3058 return false; 3059 3060 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3061 // 1 / x -> RCP(x) 3062 if (CLHS->isExactlyValue(1.0)) { 3063 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3064 .addUse(RHS) 3065 .setMIFlags(Flags); 3066 3067 MI.eraseFromParent(); 3068 return true; 3069 } 3070 3071 // -1 / x -> RCP( FNEG(x) ) 3072 if (CLHS->isExactlyValue(-1.0)) { 3073 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3074 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3075 .addUse(FNeg.getReg(0)) 3076 .setMIFlags(Flags); 3077 3078 MI.eraseFromParent(); 3079 return true; 3080 } 3081 } 3082 3083 // x / y -> x * (1.0 / y) 3084 if (Unsafe) { 3085 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3086 .addUse(RHS) 3087 .setMIFlags(Flags); 3088 B.buildFMul(Res, LHS, RCP, Flags); 3089 3090 MI.eraseFromParent(); 3091 return true; 3092 } 3093 3094 return false; 3095 } 3096 3097 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3098 MachineRegisterInfo &MRI, 3099 MachineIRBuilder &B) const { 3100 Register Res = MI.getOperand(0).getReg(); 3101 Register LHS = MI.getOperand(1).getReg(); 3102 Register RHS = MI.getOperand(2).getReg(); 3103 3104 uint16_t Flags = MI.getFlags(); 3105 3106 LLT S16 = LLT::scalar(16); 3107 LLT S32 = LLT::scalar(32); 3108 3109 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3110 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3111 3112 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3113 .addUse(RHSExt.getReg(0)) 3114 .setMIFlags(Flags); 3115 3116 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3117 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3118 3119 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3120 .addUse(RDst.getReg(0)) 3121 .addUse(RHS) 3122 .addUse(LHS) 3123 .setMIFlags(Flags); 3124 3125 MI.eraseFromParent(); 3126 return true; 3127 } 3128 3129 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3130 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3131 static void toggleSPDenormMode(bool Enable, 3132 MachineIRBuilder &B, 3133 const GCNSubtarget &ST, 3134 AMDGPU::SIModeRegisterDefaults Mode) { 3135 // Set SP denorm mode to this value. 3136 unsigned SPDenormMode = 3137 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3138 3139 if (ST.hasDenormModeInst()) { 3140 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
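// S_DENORM_MODE takes the combined 4-bit FP_DENORM value: the FP32 control in
// the low two bits and the FP64/FP16 control in the next two, hence the shift
// by 2 below.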
3141 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3142 3143 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3144 B.buildInstr(AMDGPU::S_DENORM_MODE) 3145 .addImm(NewDenormModeValue); 3146 3147 } else { 3148 // Select FP32 bit field in mode register. 3149 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3150 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3151 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3152 3153 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3154 .addImm(SPDenormMode) 3155 .addImm(SPDenormModeBitField); 3156 } 3157 } 3158 3159 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3160 MachineRegisterInfo &MRI, 3161 MachineIRBuilder &B) const { 3162 Register Res = MI.getOperand(0).getReg(); 3163 Register LHS = MI.getOperand(1).getReg(); 3164 Register RHS = MI.getOperand(2).getReg(); 3165 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3166 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3167 3168 uint16_t Flags = MI.getFlags(); 3169 3170 LLT S32 = LLT::scalar(32); 3171 LLT S1 = LLT::scalar(1); 3172 3173 auto One = B.buildFConstant(S32, 1.0f); 3174 3175 auto DenominatorScaled = 3176 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3177 .addUse(LHS) 3178 .addUse(RHS) 3179 .addImm(0) 3180 .setMIFlags(Flags); 3181 auto NumeratorScaled = 3182 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3183 .addUse(LHS) 3184 .addUse(RHS) 3185 .addImm(1) 3186 .setMIFlags(Flags); 3187 3188 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3189 .addUse(DenominatorScaled.getReg(0)) 3190 .setMIFlags(Flags); 3191 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3192 3193 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3194 // aren't modeled as reading it. 
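// If the function's FP mode flushes FP32 denormals, temporarily enable them
// around the FMA refinement chain (the scaled intermediates may depend on
// denormal values) and restore the original mode before div_fmas.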
3195 if (!Mode.allFP32Denormals()) 3196 toggleSPDenormMode(true, B, ST, Mode); 3197 3198 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3199 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3200 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3201 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3202 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3203 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3204 3205 if (!Mode.allFP32Denormals()) 3206 toggleSPDenormMode(false, B, ST, Mode); 3207 3208 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3209 .addUse(Fma4.getReg(0)) 3210 .addUse(Fma1.getReg(0)) 3211 .addUse(Fma3.getReg(0)) 3212 .addUse(NumeratorScaled.getReg(1)) 3213 .setMIFlags(Flags); 3214 3215 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3216 .addUse(Fmas.getReg(0)) 3217 .addUse(RHS) 3218 .addUse(LHS) 3219 .setMIFlags(Flags); 3220 3221 MI.eraseFromParent(); 3222 return true; 3223 } 3224 3225 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3226 MachineRegisterInfo &MRI, 3227 MachineIRBuilder &B) const { 3228 Register Res = MI.getOperand(0).getReg(); 3229 Register LHS = MI.getOperand(1).getReg(); 3230 Register RHS = MI.getOperand(2).getReg(); 3231 3232 uint16_t Flags = MI.getFlags(); 3233 3234 LLT S64 = LLT::scalar(64); 3235 LLT S1 = LLT::scalar(1); 3236 3237 auto One = B.buildFConstant(S64, 1.0); 3238 3239 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3240 .addUse(LHS) 3241 .addUse(RHS) 3242 .addImm(0) 3243 .setMIFlags(Flags); 3244 3245 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3246 3247 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3248 .addUse(DivScale0.getReg(0)) 3249 .setMIFlags(Flags); 3250 3251 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3252 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3253 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3254 3255 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3256 .addUse(LHS) 3257 .addUse(RHS) 3258 .addImm(1) 3259 .setMIFlags(Flags); 3260 3261 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3262 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3263 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3264 3265 Register Scale; 3266 if (!ST.hasUsableDivScaleConditionOutput()) { 3267 // Workaround a hardware bug on SI where the condition output from div_scale 3268 // is not usable. 
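// Reconstruct the flag from the operands instead: compare the high halves of
// the scaled values against the original numerator/denominator and xor the
// two compares, which is intended to recover the "was scaling applied" bit
// div_scale would normally report.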
3269 3270 LLT S32 = LLT::scalar(32); 3271 3272 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3273 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3274 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3275 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3276 3277 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3278 Scale1Unmerge.getReg(1)); 3279 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3280 Scale0Unmerge.getReg(1)); 3281 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3282 } else { 3283 Scale = DivScale1.getReg(1); 3284 } 3285 3286 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3287 .addUse(Fma4.getReg(0)) 3288 .addUse(Fma3.getReg(0)) 3289 .addUse(Mul.getReg(0)) 3290 .addUse(Scale) 3291 .setMIFlags(Flags); 3292 3293 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3294 .addUse(Fmas.getReg(0)) 3295 .addUse(RHS) 3296 .addUse(LHS) 3297 .setMIFlags(Flags); 3298 3299 MI.eraseFromParent(); 3300 return true; 3301 } 3302 3303 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3304 MachineRegisterInfo &MRI, 3305 MachineIRBuilder &B) const { 3306 Register Res = MI.getOperand(0).getReg(); 3307 Register LHS = MI.getOperand(2).getReg(); 3308 Register RHS = MI.getOperand(3).getReg(); 3309 uint16_t Flags = MI.getFlags(); 3310 3311 LLT S32 = LLT::scalar(32); 3312 LLT S1 = LLT::scalar(1); 3313 3314 auto Abs = B.buildFAbs(S32, RHS, Flags); 3315 const APFloat C0Val(1.0f); 3316 3317 auto C0 = B.buildConstant(S32, 0x6f800000); 3318 auto C1 = B.buildConstant(S32, 0x2f800000); 3319 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3320 3321 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3322 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3323 3324 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3325 3326 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3327 .addUse(Mul0.getReg(0)) 3328 .setMIFlags(Flags); 3329 3330 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3331 3332 B.buildFMul(Res, Sel, Mul1, Flags); 3333 3334 MI.eraseFromParent(); 3335 return true; 3336 } 3337 3338 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3339 // FIXME: Why do we handle this one but not other removed instructions? 3340 // 3341 // Reciprocal square root. The clamp prevents infinite results, clamping 3342 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3343 // +-max_float. 3344 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3345 MachineRegisterInfo &MRI, 3346 MachineIRBuilder &B) const { 3347 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3348 return true; 3349 3350 Register Dst = MI.getOperand(0).getReg(); 3351 Register Src = MI.getOperand(2).getReg(); 3352 auto Flags = MI.getFlags(); 3353 3354 LLT Ty = MRI.getType(Dst); 3355 3356 const fltSemantics *FltSemantics; 3357 if (Ty == LLT::scalar(32)) 3358 FltSemantics = &APFloat::IEEEsingle(); 3359 else if (Ty == LLT::scalar(64)) 3360 FltSemantics = &APFloat::IEEEdouble(); 3361 else 3362 return false; 3363 3364 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3365 .addUse(Src) 3366 .setMIFlags(Flags); 3367 3368 // We don't need to concern ourselves with the snan handling difference, since 3369 // the rsq quieted (or not) so use the one which will directly select. 
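// Clamp to +/-max_float as described above: min against +largest, then max
// against -largest, using the IEEE or non-IEEE flavor matching the function's
// FP mode.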
3370 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3371 const bool UseIEEE = MFI->getMode().IEEE; 3372 3373 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3374 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3375 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3376 3377 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3378 3379 if (UseIEEE) 3380 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3381 else 3382 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3383 MI.eraseFromParent(); 3384 return true; 3385 } 3386 3387 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3388 switch (IID) { 3389 case Intrinsic::amdgcn_ds_fadd: 3390 return AMDGPU::G_ATOMICRMW_FADD; 3391 case Intrinsic::amdgcn_ds_fmin: 3392 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3393 case Intrinsic::amdgcn_ds_fmax: 3394 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3395 default: 3396 llvm_unreachable("not a DS FP intrinsic"); 3397 } 3398 } 3399 3400 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3401 MachineInstr &MI, 3402 Intrinsic::ID IID) const { 3403 GISelChangeObserver &Observer = Helper.Observer; 3404 Observer.changingInstr(MI); 3405 3406 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3407 3408 // The remaining operands were used to set fields in the MemOperand on 3409 // construction. 3410 for (int I = 6; I > 3; --I) 3411 MI.RemoveOperand(I); 3412 3413 MI.RemoveOperand(1); // Remove the intrinsic ID. 3414 Observer.changedInstr(MI); 3415 return true; 3416 } 3417 3418 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3419 MachineRegisterInfo &MRI, 3420 MachineIRBuilder &B) const { 3421 uint64_t Offset = 3422 ST.getTargetLowering()->getImplicitParameterOffset( 3423 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3424 LLT DstTy = MRI.getType(DstReg); 3425 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3426 3427 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3428 if (!loadInputValue(KernargPtrReg, B, 3429 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3430 return false; 3431 3432 // FIXME: This should be nuw 3433 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3434 return true; 3435 } 3436 3437 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3438 MachineRegisterInfo &MRI, 3439 MachineIRBuilder &B) const { 3440 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3441 if (!MFI->isEntryFunction()) { 3442 return legalizePreloadedArgIntrin(MI, MRI, B, 3443 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3444 } 3445 3446 Register DstReg = MI.getOperand(0).getReg(); 3447 if (!getImplicitArgPtr(DstReg, MRI, B)) 3448 return false; 3449 3450 MI.eraseFromParent(); 3451 return true; 3452 } 3453 3454 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3455 MachineRegisterInfo &MRI, 3456 MachineIRBuilder &B, 3457 unsigned AddrSpace) const { 3458 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3459 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3460 Register Hi32 = Unmerge.getReg(1); 3461 3462 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3463 MI.eraseFromParent(); 3464 return true; 3465 } 3466 3467 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3468 // offset (the offset that is included in bounds checking and swizzling, to be 3469 // split between the instruction's voffset and immoffset fields) 
and soffset 3470 // (the offset that is excluded from bounds checking and swizzling, to go in 3471 // the instruction's soffset field). This function takes the first kind of 3472 // offset and figures out how to split it between voffset and immoffset. 3473 std::tuple<Register, unsigned, unsigned> 3474 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3475 Register OrigOffset) const { 3476 const unsigned MaxImm = 4095; 3477 Register BaseReg; 3478 unsigned TotalConstOffset; 3479 MachineInstr *OffsetDef; 3480 const LLT S32 = LLT::scalar(32); 3481 3482 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3483 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3484 3485 unsigned ImmOffset = TotalConstOffset; 3486 3487 // If the immediate value is too big for the immoffset field, put the value 3488 // and -4096 into the immoffset field so that the value that is copied/added 3489 // for the voffset field is a multiple of 4096, and it stands more chance 3490 // of being CSEd with the copy/add for another similar load/store. 3491 // However, do not do that rounding down to a multiple of 4096 if that is a 3492 // negative number, as it appears to be illegal to have a negative offset 3493 // in the vgpr, even if adding the immediate offset makes it positive. 3494 unsigned Overflow = ImmOffset & ~MaxImm; 3495 ImmOffset -= Overflow; 3496 if ((int32_t)Overflow < 0) { 3497 Overflow += ImmOffset; 3498 ImmOffset = 0; 3499 } 3500 3501 if (Overflow != 0) { 3502 if (!BaseReg) { 3503 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3504 } else { 3505 auto OverflowVal = B.buildConstant(S32, Overflow); 3506 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3507 } 3508 } 3509 3510 if (!BaseReg) 3511 BaseReg = B.buildConstant(S32, 0).getReg(0); 3512 3513 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3514 } 3515 3516 /// Handle register layout difference for f16 images for some subtargets. 3517 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3518 MachineRegisterInfo &MRI, 3519 Register Reg) const { 3520 if (!ST.hasUnpackedD16VMem()) 3521 return Reg; 3522 3523 const LLT S16 = LLT::scalar(16); 3524 const LLT S32 = LLT::scalar(32); 3525 LLT StoreVT = MRI.getType(Reg); 3526 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3527 3528 auto Unmerge = B.buildUnmerge(S16, Reg); 3529 3530 SmallVector<Register, 4> WideRegs; 3531 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3532 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3533 3534 int NumElts = StoreVT.getNumElements(); 3535 3536 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3537 } 3538 3539 Register AMDGPULegalizerInfo::fixStoreSourceType( 3540 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3541 MachineRegisterInfo *MRI = B.getMRI(); 3542 LLT Ty = MRI->getType(VData); 3543 3544 const LLT S16 = LLT::scalar(16); 3545 3546 // Fixup illegal register types for i8 stores. 
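  // For example, an s8 or s16 store source is any-extended to s32 here; the
  // buffer store opcode chosen later (BYTE/SHORT) only writes the low bits, so
  // the high bits of the extended value do not matter.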
3547 if (Ty == LLT::scalar(8) || Ty == S16) { 3548 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3549 return AnyExt; 3550 } 3551 3552 if (Ty.isVector()) { 3553 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3554 if (IsFormat) 3555 return handleD16VData(B, *MRI, VData); 3556 } 3557 } 3558 3559 return VData; 3560 } 3561 3562 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3563 MachineRegisterInfo &MRI, 3564 MachineIRBuilder &B, 3565 bool IsTyped, 3566 bool IsFormat) const { 3567 Register VData = MI.getOperand(1).getReg(); 3568 LLT Ty = MRI.getType(VData); 3569 LLT EltTy = Ty.getScalarType(); 3570 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3571 const LLT S32 = LLT::scalar(32); 3572 3573 VData = fixStoreSourceType(B, VData, IsFormat); 3574 Register RSrc = MI.getOperand(2).getReg(); 3575 3576 MachineMemOperand *MMO = *MI.memoperands_begin(); 3577 const int MemSize = MMO->getSize(); 3578 3579 unsigned ImmOffset; 3580 unsigned TotalOffset; 3581 3582 // The typed intrinsics add an immediate after the registers. 3583 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3584 3585 // The struct intrinsic variants add one additional operand over raw. 3586 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3587 Register VIndex; 3588 int OpOffset = 0; 3589 if (HasVIndex) { 3590 VIndex = MI.getOperand(3).getReg(); 3591 OpOffset = 1; 3592 } 3593 3594 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3595 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3596 3597 unsigned Format = 0; 3598 if (IsTyped) { 3599 Format = MI.getOperand(5 + OpOffset).getImm(); 3600 ++OpOffset; 3601 } 3602 3603 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3604 3605 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3606 if (TotalOffset != 0) 3607 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3608 3609 unsigned Opc; 3610 if (IsTyped) { 3611 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3612 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3613 } else if (IsFormat) { 3614 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3615 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3616 } else { 3617 switch (MemSize) { 3618 case 1: 3619 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3620 break; 3621 case 2: 3622 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3623 break; 3624 default: 3625 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3626 break; 3627 } 3628 } 3629 3630 if (!VIndex) 3631 VIndex = B.buildConstant(S32, 0).getReg(0); 3632 3633 auto MIB = B.buildInstr(Opc) 3634 .addUse(VData) // vdata 3635 .addUse(RSrc) // rsrc 3636 .addUse(VIndex) // vindex 3637 .addUse(VOffset) // voffset 3638 .addUse(SOffset) // soffset 3639 .addImm(ImmOffset); // offset(imm) 3640 3641 if (IsTyped) 3642 MIB.addImm(Format); 3643 3644 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3645 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3646 .addMemOperand(MMO); 3647 3648 MI.eraseFromParent(); 3649 return true; 3650 } 3651 3652 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3653 MachineRegisterInfo &MRI, 3654 MachineIRBuilder &B, 3655 bool IsFormat, 3656 bool IsTyped) const { 3657 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
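  // Operand layout handled below (struct variants insert vindex after rsrc;
  // typed variants add a format immediate before the aux operand):
  //   raw:    dst, intrin-id, rsrc,         voffset, soffset, [format,] aux
  //   struct: dst, intrin-id, rsrc, vindex, voffset, soffset, [format,] aux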
3658 MachineMemOperand *MMO = *MI.memoperands_begin(); 3659 const int MemSize = MMO->getSize(); 3660 const LLT S32 = LLT::scalar(32); 3661 3662 Register Dst = MI.getOperand(0).getReg(); 3663 Register RSrc = MI.getOperand(2).getReg(); 3664 3665 // The typed intrinsics add an immediate after the registers. 3666 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3667 3668 // The struct intrinsic variants add one additional operand over raw. 3669 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3670 Register VIndex; 3671 int OpOffset = 0; 3672 if (HasVIndex) { 3673 VIndex = MI.getOperand(3).getReg(); 3674 OpOffset = 1; 3675 } 3676 3677 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3678 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3679 3680 unsigned Format = 0; 3681 if (IsTyped) { 3682 Format = MI.getOperand(5 + OpOffset).getImm(); 3683 ++OpOffset; 3684 } 3685 3686 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3687 unsigned ImmOffset; 3688 unsigned TotalOffset; 3689 3690 LLT Ty = MRI.getType(Dst); 3691 LLT EltTy = Ty.getScalarType(); 3692 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3693 const bool Unpacked = ST.hasUnpackedD16VMem(); 3694 3695 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3696 if (TotalOffset != 0) 3697 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3698 3699 unsigned Opc; 3700 3701 if (IsTyped) { 3702 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3703 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3704 } else if (IsFormat) { 3705 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3706 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3707 } else { 3708 switch (MemSize) { 3709 case 1: 3710 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3711 break; 3712 case 2: 3713 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3714 break; 3715 default: 3716 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3717 break; 3718 } 3719 } 3720 3721 Register LoadDstReg; 3722 3723 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3724 LLT UnpackedTy = Ty.changeElementSize(32); 3725 3726 if (IsExtLoad) 3727 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3728 else if (Unpacked && IsD16 && Ty.isVector()) 3729 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3730 else 3731 LoadDstReg = Dst; 3732 3733 if (!VIndex) 3734 VIndex = B.buildConstant(S32, 0).getReg(0); 3735 3736 auto MIB = B.buildInstr(Opc) 3737 .addDef(LoadDstReg) // vdata 3738 .addUse(RSrc) // rsrc 3739 .addUse(VIndex) // vindex 3740 .addUse(VOffset) // voffset 3741 .addUse(SOffset) // soffset 3742 .addImm(ImmOffset); // offset(imm) 3743 3744 if (IsTyped) 3745 MIB.addImm(Format); 3746 3747 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3748 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3749 .addMemOperand(MMO); 3750 3751 if (LoadDstReg != Dst) { 3752 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3753 3754 // Widen result for extending loads was widened. 
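    // i.e. the load above was performed into a wider register and must be
    // narrowed back here. For example, an s16 result is loaded as s32 and
    // truncated, while on unpacked-d16 subtargets a <4 x s16> result is loaded
    // as <4 x s32> and repacked element by element.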
3755 if (IsExtLoad) 3756 B.buildTrunc(Dst, LoadDstReg); 3757 else { 3758 // Repack to original 16-bit vector result 3759 // FIXME: G_TRUNC should work, but legalization currently fails 3760 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3761 SmallVector<Register, 4> Repack; 3762 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3763 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3764 B.buildMerge(Dst, Repack); 3765 } 3766 } 3767 3768 MI.eraseFromParent(); 3769 return true; 3770 } 3771 3772 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3773 MachineIRBuilder &B, 3774 bool IsInc) const { 3775 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3776 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3777 B.buildInstr(Opc) 3778 .addDef(MI.getOperand(0).getReg()) 3779 .addUse(MI.getOperand(2).getReg()) 3780 .addUse(MI.getOperand(3).getReg()) 3781 .cloneMemRefs(MI); 3782 MI.eraseFromParent(); 3783 return true; 3784 } 3785 3786 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3787 switch (IntrID) { 3788 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3789 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3790 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3791 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3792 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3793 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3794 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3795 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3796 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3797 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3798 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3799 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3800 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3801 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3802 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3803 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3804 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3805 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3806 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3807 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3808 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3809 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3810 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3811 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3812 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3813 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3814 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3815 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3816 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3817 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3818 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3819 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3820 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3821 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3822 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3823 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3824 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3825 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3826 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3827 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3828 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3829 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3830 default: 3831 llvm_unreachable("unhandled atomic opcode"); 3832 } 3833 } 3834 3835 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3836 MachineIRBuilder &B, 3837 Intrinsic::ID IID) const { 3838 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3839 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3840 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3841 3842 Register Dst; 3843 3844 int OpOffset = 0; 3845 if (HasReturn) { 3846 // A few FP atomics do not support return values. 3847 Dst = MI.getOperand(0).getReg(); 3848 } else { 3849 OpOffset = -1; 3850 } 3851 3852 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3853 Register CmpVal; 3854 3855 if (IsCmpSwap) { 3856 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3857 ++OpOffset; 3858 } 3859 3860 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3861 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3862 3863 // The struct intrinsic variants add one additional operand over raw. 3864 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3865 Register VIndex; 3866 if (HasVIndex) { 3867 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3868 ++OpOffset; 3869 } 3870 3871 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3872 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3873 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3874 3875 MachineMemOperand *MMO = *MI.memoperands_begin(); 3876 3877 unsigned ImmOffset; 3878 unsigned TotalOffset; 3879 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3880 if (TotalOffset != 0) 3881 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3882 3883 if (!VIndex) 3884 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3885 3886 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3887 3888 if (HasReturn) 3889 MIB.addDef(Dst); 3890 3891 MIB.addUse(VData); // vdata 3892 3893 if (IsCmpSwap) 3894 MIB.addReg(CmpVal); 3895 3896 MIB.addUse(RSrc) // rsrc 3897 .addUse(VIndex) // vindex 3898 .addUse(VOffset) // voffset 3899 .addUse(SOffset) // soffset 3900 .addImm(ImmOffset) // offset(imm) 3901 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3902 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3903 .addMemOperand(MMO); 3904 3905 MI.eraseFromParent(); 3906 return true; 3907 } 3908 3909 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3910 /// vector with s16 typed elements. 3911 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3912 SmallVectorImpl<Register> &PackedAddrs, 3913 int AddrIdx, int DimIdx, int EndIdx, 3914 int NumGradients) { 3915 const LLT S16 = LLT::scalar(16); 3916 const LLT V2S16 = LLT::vector(2, 16); 3917 3918 for (int I = AddrIdx; I < EndIdx; ++I) { 3919 MachineOperand &SrcOp = MI.getOperand(I); 3920 if (!SrcOp.isReg()) 3921 continue; // _L to _LZ may have eliminated this. 3922 3923 Register AddrReg = SrcOp.getReg(); 3924 3925 if (I < DimIdx) { 3926 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3927 PackedAddrs.push_back(AddrReg); 3928 } else { 3929 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3930 // derivatives dx/dh and dx/dv are packed with undef. 
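      // For example, a 2D sample's (s, t) coordinates become one <2 x s16>,
      // while the lone coordinate of a 1D access is paired with an undef high
      // half.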
3931 if (((I + 1) >= EndIdx) || 3932 ((NumGradients / 2) % 2 == 1 && 3933 (I == DimIdx + (NumGradients / 2) - 1 || 3934 I == DimIdx + NumGradients - 1)) || 3935 // Check for _L to _LZ optimization 3936 !MI.getOperand(I + 1).isReg()) { 3937 PackedAddrs.push_back( 3938 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3939 .getReg(0)); 3940 } else { 3941 PackedAddrs.push_back( 3942 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3943 .getReg(0)); 3944 ++I; 3945 } 3946 } 3947 } 3948 } 3949 3950 /// Convert from separate vaddr components to a single vector address register, 3951 /// and replace the remaining operands with $noreg. 3952 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3953 int DimIdx, int NumVAddrs) { 3954 const LLT S32 = LLT::scalar(32); 3955 3956 SmallVector<Register, 8> AddrRegs; 3957 for (int I = 0; I != NumVAddrs; ++I) { 3958 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3959 if (SrcOp.isReg()) { 3960 AddrRegs.push_back(SrcOp.getReg()); 3961 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3962 } 3963 } 3964 3965 int NumAddrRegs = AddrRegs.size(); 3966 if (NumAddrRegs != 1) { 3967 // Round up to 8 elements for v5-v7 3968 // FIXME: Missing intermediate sized register classes and instructions. 3969 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3970 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3971 auto Undef = B.buildUndef(S32); 3972 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3973 NumAddrRegs = RoundedNumRegs; 3974 } 3975 3976 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3977 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3978 } 3979 3980 for (int I = 1; I != NumVAddrs; ++I) { 3981 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3982 if (SrcOp.isReg()) 3983 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3984 } 3985 } 3986 3987 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3988 /// 3989 /// Depending on the subtarget, load/store with 16-bit element data need to be 3990 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3991 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3992 /// registers. 3993 /// 3994 /// We don't want to directly select image instructions just yet, but also want 3995 /// to expose all register repacking to the legalizer/combiners. We also don't 3996 /// want a selected instruction entering RegBankSelect. In order to avoid 3997 /// defining a multitude of intermediate image instructions, directly hack on 3998 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 3999 /// now unnecessary arguments with $noreg. 4000 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4001 MachineInstr &MI, MachineIRBuilder &B, 4002 GISelChangeObserver &Observer, 4003 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 4004 4005 const int NumDefs = MI.getNumExplicitDefs(); 4006 bool IsTFE = NumDefs == 2; 4007 // We are only processing the operands of d16 image operations on subtargets 4008 // that use the unpacked register layout, or need to repack the TFE result. 4009 4010 // TODO: Do we need to guard against already legalized intrinsics?
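  // Illustrative end result for a d16 load on an unpacked-d16 subtarget with a
  // two-lane dmask: the intrinsic becomes a G_AMDGPU_INTRIN_IMAGE_LOAD
  // producing <2 x s32>, and the original <2 x s16> value is rebuilt afterwards
  // from per-element truncates.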
4011 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4012 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 4013 4014 MachineRegisterInfo *MRI = B.getMRI(); 4015 const LLT S32 = LLT::scalar(32); 4016 const LLT S16 = LLT::scalar(16); 4017 const LLT V2S16 = LLT::vector(2, 16); 4018 4019 // Index of first address argument 4020 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 4021 4022 int NumVAddrs, NumGradients; 4023 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 4024 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 4025 getDMaskIdx(BaseOpcode, NumDefs); 4026 unsigned DMask = 0; 4027 4028 // Check for 16 bit addresses and pack if true. 4029 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 4030 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 4031 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 4032 const bool IsG16 = GradTy == S16; 4033 const bool IsA16 = AddrTy == S16; 4034 4035 int DMaskLanes = 0; 4036 if (!BaseOpcode->Atomic) { 4037 DMask = MI.getOperand(DMaskIdx).getImm(); 4038 if (BaseOpcode->Gather4) { 4039 DMaskLanes = 4; 4040 } else if (DMask != 0) { 4041 DMaskLanes = countPopulation(DMask); 4042 } else if (!IsTFE && !BaseOpcode->Store) { 4043 // If dmask is 0, this is a no-op load. This can be eliminated. 4044 B.buildUndef(MI.getOperand(0)); 4045 MI.eraseFromParent(); 4046 return true; 4047 } 4048 } 4049 4050 Observer.changingInstr(MI); 4051 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4052 4053 unsigned NewOpcode = NumDefs == 0 ? 4054 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4055 4056 // Track that we legalized this 4057 MI.setDesc(B.getTII().get(NewOpcode)); 4058 4059 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4060 // dmask to be at least 1 otherwise the instruction will fail 4061 if (IsTFE && DMask == 0) { 4062 DMask = 0x1; 4063 DMaskLanes = 1; 4064 MI.getOperand(DMaskIdx).setImm(DMask); 4065 } 4066 4067 if (BaseOpcode->Atomic) { 4068 Register VData0 = MI.getOperand(2).getReg(); 4069 LLT Ty = MRI->getType(VData0); 4070 4071 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4072 if (Ty.isVector()) 4073 return false; 4074 4075 if (BaseOpcode->AtomicX2) { 4076 Register VData1 = MI.getOperand(3).getReg(); 4077 // The two values are packed in one register. 4078 LLT PackedTy = LLT::vector(2, Ty); 4079 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4080 MI.getOperand(2).setReg(Concat.getReg(0)); 4081 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4082 } 4083 } 4084 4085 int CorrectedNumVAddrs = NumVAddrs; 4086 4087 // Optimize _L to _LZ when _L is zero 4088 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4089 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 4090 const ConstantFP *ConstantLod; 4091 const int LodIdx = AddrIdx + NumVAddrs - 1; 4092 4093 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 4094 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4095 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4096 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 4097 LZMappingInfo->LZ, ImageDimIntr->Dim); 4098 4099 // The starting indexes should remain in the same place. 
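        // For example, image.sample.l with a constant zero (or negative) lod
        // becomes image.sample.lz, and the now-unused lod operand is removed
        // below.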
4100 --NumVAddrs; 4101 --CorrectedNumVAddrs; 4102 4103 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 4104 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 4105 MI.RemoveOperand(LodIdx); 4106 } 4107 } 4108 } 4109 4110 // Optimize _mip away, when 'lod' is zero 4111 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 4112 int64_t ConstantLod; 4113 const int LodIdx = AddrIdx + NumVAddrs - 1; 4114 4115 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 4116 if (ConstantLod == 0) { 4117 // TODO: Change intrinsic opcode and remove operand instead or replacing 4118 // it with 0, as the _L to _LZ handling is done above. 4119 MI.getOperand(LodIdx).ChangeToImmediate(0); 4120 --CorrectedNumVAddrs; 4121 } 4122 } 4123 } 4124 4125 // Rewrite the addressing register layout before doing anything else. 4126 if (IsA16 || IsG16) { 4127 if (IsA16) { 4128 // Target must support the feature and gradients need to be 16 bit too 4129 if (!ST.hasA16() || !IsG16) 4130 return false; 4131 } else if (!ST.hasG16()) 4132 return false; 4133 4134 if (NumVAddrs > 1) { 4135 SmallVector<Register, 4> PackedRegs; 4136 // Don't compress addresses for G16 4137 const int PackEndIdx = 4138 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4139 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4140 PackEndIdx, NumGradients); 4141 4142 if (!IsA16) { 4143 // Add uncompressed address 4144 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4145 int AddrReg = MI.getOperand(I).getReg(); 4146 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4147 PackedRegs.push_back(AddrReg); 4148 } 4149 } 4150 4151 // See also below in the non-a16 branch 4152 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4153 4154 if (!UseNSA && PackedRegs.size() > 1) { 4155 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4156 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4157 PackedRegs[0] = Concat.getReg(0); 4158 PackedRegs.resize(1); 4159 } 4160 4161 const int NumPacked = PackedRegs.size(); 4162 for (int I = 0; I != NumVAddrs; ++I) { 4163 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4164 if (!SrcOp.isReg()) { 4165 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4166 continue; 4167 } 4168 4169 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4170 4171 if (I < NumPacked) 4172 SrcOp.setReg(PackedRegs[I]); 4173 else 4174 SrcOp.setReg(AMDGPU::NoRegister); 4175 } 4176 } 4177 } else { 4178 // If the register allocator cannot place the address registers contiguously 4179 // without introducing moves, then using the non-sequential address encoding 4180 // is always preferable, since it saves VALU instructions and is usually a 4181 // wash in terms of code size or even better. 4182 // 4183 // However, we currently have no way of hinting to the register allocator 4184 // that MIMG addresses should be placed contiguously when it is possible to 4185 // do so, so force non-NSA for the common 2-address case as a heuristic. 4186 // 4187 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4188 // allocation when possible. 4189 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4190 4191 if (!UseNSA && NumVAddrs > 1) 4192 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4193 } 4194 4195 int Flags = 0; 4196 if (IsA16) 4197 Flags |= 1; 4198 if (IsG16) 4199 Flags |= 2; 4200 MI.addOperand(MachineOperand::CreateImm(Flags)); 4201 4202 if (BaseOpcode->Store) { // No TFE for stores? 
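    // Stores only need the data source repacked; for example, on subtargets
    // with unpacked d16 memory a <4 x s16> source is widened to <4 x s32> by
    // handleD16VData() below.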
4203 // TODO: Handle dmask trim 4204 Register VData = MI.getOperand(1).getReg(); 4205 LLT Ty = MRI->getType(VData); 4206 if (!Ty.isVector() || Ty.getElementType() != S16) 4207 return true; 4208 4209 Register RepackedReg = handleD16VData(B, *MRI, VData); 4210 if (RepackedReg != VData) { 4211 MI.getOperand(1).setReg(RepackedReg); 4212 } 4213 4214 return true; 4215 } 4216 4217 Register DstReg = MI.getOperand(0).getReg(); 4218 LLT Ty = MRI->getType(DstReg); 4219 const LLT EltTy = Ty.getScalarType(); 4220 const bool IsD16 = Ty.getScalarType() == S16; 4221 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4222 4223 // Confirm that the return type is large enough for the dmask specified 4224 if (NumElts < DMaskLanes) 4225 return false; 4226 4227 if (NumElts > 4 || DMaskLanes > 4) 4228 return false; 4229 4230 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4231 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4232 4233 // The raw dword aligned data component of the load. The only legal cases 4234 // where this matters should be when using the packed D16 format, for 4235 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4236 LLT RoundedTy; 4237 4238 // S32 vector to cover all data, plus TFE result element. 4239 LLT TFETy; 4240 4241 // Register type to use for each loaded component. Will be S32 or V2S16. 4242 LLT RegTy; 4243 4244 if (IsD16 && ST.hasUnpackedD16VMem()) { 4245 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4246 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4247 RegTy = S32; 4248 } else { 4249 unsigned EltSize = EltTy.getSizeInBits(); 4250 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4251 unsigned RoundedSize = 32 * RoundedElts; 4252 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4253 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4254 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4255 } 4256 4257 // The return type does not need adjustment. 4258 // TODO: Should we change s16 case to s32 or <2 x s16>? 4259 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4260 return true; 4261 4262 Register Dst1Reg; 4263 4264 // Insert after the instruction. 4265 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4266 4267 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4268 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4269 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4270 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4271 4272 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4273 4274 MI.getOperand(0).setReg(NewResultReg); 4275 4276 // In the IR, TFE is supposed to be used with a 2 element struct return 4277 // type. The instruction really returns these two values in one contiguous 4278 // register, with one additional dword beyond the loaded data. Rewrite the 4279 // return type to use a single register result. 4280 4281 if (IsTFE) { 4282 Dst1Reg = MI.getOperand(1).getReg(); 4283 if (MRI->getType(Dst1Reg) != S32) 4284 return false; 4285 4286 // TODO: Make sure the TFE operand bit is set. 4287 MI.RemoveOperand(1); 4288 4289 // Handle the easy case that requires no repack instructions. 4290 if (Ty == S32) { 4291 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4292 return true; 4293 } 4294 } 4295 4296 // Now figure out how to copy the new result register back into the old 4297 // result. 4298 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4299 4300 const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4301 4302 if (ResultNumRegs == 1) { 4303 assert(!IsTFE); 4304 ResultRegs[0] = NewResultReg; 4305 } else { 4306 // We have to repack into a new vector of some kind. 4307 for (int I = 0; I != NumDataRegs; ++I) 4308 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4309 B.buildUnmerge(ResultRegs, NewResultReg); 4310 4311 // Drop the final TFE element to get the data part. The TFE result is 4312 // directly written to the right place already. 4313 if (IsTFE) 4314 ResultRegs.resize(NumDataRegs); 4315 } 4316 4317 // For an s16 scalar result, we form an s32 result with a truncate regardless 4318 // of packed vs. unpacked. 4319 if (IsD16 && !Ty.isVector()) { 4320 B.buildTrunc(DstReg, ResultRegs[0]); 4321 return true; 4322 } 4323 4324 // Avoid a build/concat_vector of 1 entry. 4325 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4326 B.buildBitcast(DstReg, ResultRegs[0]); 4327 return true; 4328 } 4329 4330 assert(Ty.isVector()); 4331 4332 if (IsD16) { 4333 // For packed D16 results with TFE enabled, all the data components are 4334 // S32. Cast back to the expected type. 4335 // 4336 // TODO: We don't really need to use load s32 elements. We would only need one 4337 // cast for the TFE result if a multiple of v2s16 was used. 4338 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4339 for (Register &Reg : ResultRegs) 4340 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4341 } else if (ST.hasUnpackedD16VMem()) { 4342 for (Register &Reg : ResultRegs) 4343 Reg = B.buildTrunc(S16, Reg).getReg(0); 4344 } 4345 } 4346 4347 auto padWithUndef = [&](LLT Ty, int NumElts) { 4348 if (NumElts == 0) 4349 return; 4350 Register Undef = B.buildUndef(Ty).getReg(0); 4351 for (int I = 0; I != NumElts; ++I) 4352 ResultRegs.push_back(Undef); 4353 }; 4354 4355 // Pad out any elements eliminated due to the dmask. 4356 LLT ResTy = MRI->getType(ResultRegs[0]); 4357 if (!ResTy.isVector()) { 4358 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4359 B.buildBuildVector(DstReg, ResultRegs); 4360 return true; 4361 } 4362 4363 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4364 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4365 4366 // Deal with the one annoying legal case. 4367 const LLT V3S16 = LLT::vector(3, 16); 4368 if (Ty == V3S16) { 4369 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4370 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4371 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4372 return true; 4373 } 4374 4375 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4376 B.buildConcatVectors(DstReg, ResultRegs); 4377 return true; 4378 } 4379 4380 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4381 LegalizerHelper &Helper, MachineInstr &MI) const { 4382 MachineIRBuilder &B = Helper.MIRBuilder; 4383 GISelChangeObserver &Observer = Helper.Observer; 4384 4385 Register Dst = MI.getOperand(0).getReg(); 4386 LLT Ty = B.getMRI()->getType(Dst); 4387 unsigned Size = Ty.getSizeInBits(); 4388 MachineFunction &MF = B.getMF(); 4389 4390 Observer.changingInstr(MI); 4391 4392 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4393 Ty = getBitcastRegisterType(Ty); 4394 Helper.bitcastDst(MI, Ty, 0); 4395 Dst = MI.getOperand(0).getReg(); 4396 B.setInsertPt(B.getMBB(), MI); 4397 } 4398 4399 // FIXME: We don't really need this intermediate instruction. The intrinsic 4400 // should be fixed to have a memory operand. Since it's readnone, we're not 4401 // allowed to add one. 
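  // For example, an s96 result gets a 12-byte MMO ((96 + 7) / 8) and, since 96
  // is not a power of two, is widened to s128 further down; vector results are
  // instead widened to the next power-of-two element count.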
4402 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4403 MI.RemoveOperand(1); // Remove intrinsic ID 4404 4405 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4406 // TODO: Should this use datalayout alignment? 4407 const unsigned MemSize = (Size + 7) / 8; 4408 const Align MemAlign(4); 4409 MachineMemOperand *MMO = MF.getMachineMemOperand( 4410 MachinePointerInfo(), 4411 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4412 MachineMemOperand::MOInvariant, 4413 MemSize, MemAlign); 4414 MI.addMemOperand(MF, MMO); 4415 4416 // There are no 96-bit result scalar loads, but widening to 128-bit should 4417 // always be legal. We may need to restore this to a 96-bit result if it turns 4418 // out this needs to be converted to a vector load during RegBankSelect. 4419 if (!isPowerOf2_32(Size)) { 4420 if (Ty.isVector()) 4421 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4422 else 4423 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4424 } 4425 4426 Observer.changedInstr(MI); 4427 return true; 4428 } 4429 4430 // TODO: Move to selection 4431 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4432 MachineRegisterInfo &MRI, 4433 MachineIRBuilder &B) const { 4434 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4435 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4436 !ST.isTrapHandlerEnabled()) { 4437 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4438 } else { 4439 // Pass queue pointer to trap handler as input, and insert trap instruction 4440 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4441 MachineRegisterInfo &MRI = *B.getMRI(); 4442 4443 Register LiveIn = 4444 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4445 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4446 return false; 4447 4448 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4449 B.buildCopy(SGPR01, LiveIn); 4450 B.buildInstr(AMDGPU::S_TRAP) 4451 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4452 .addReg(SGPR01, RegState::Implicit); 4453 } 4454 4455 MI.eraseFromParent(); 4456 return true; 4457 } 4458 4459 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4460 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4461 // Is non-HSA path or trap-handler disabled? then, report a warning 4462 // accordingly 4463 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4464 !ST.isTrapHandlerEnabled()) { 4465 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4466 "debugtrap handler not supported", 4467 MI.getDebugLoc(), DS_Warning); 4468 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4469 Ctx.diagnose(NoTrap); 4470 } else { 4471 // Insert debug-trap instruction 4472 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4473 } 4474 4475 MI.eraseFromParent(); 4476 return true; 4477 } 4478 4479 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4480 MachineInstr &MI) const { 4481 MachineIRBuilder &B = Helper.MIRBuilder; 4482 MachineRegisterInfo &MRI = *B.getMRI(); 4483 4484 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
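  // Rough sketch of the amdgcn.if case (MIR-like, block/value names are
  // illustrative):
  //   %cond_out, %mask = llvm.amdgcn.if %cond
  //   G_BRCOND %cond_out, %bb.then
  // is rewritten to
  //   %mask = SI_IF %cond, %bb.flow   ; %bb.flow = the unconditional/fallthrough successor
  //   G_BR %bb.then
  // amdgcn.else and amdgcn.loop are handled analogously with SI_ELSE/SI_LOOP.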
4485 auto IntrID = MI.getIntrinsicID(); 4486 switch (IntrID) { 4487 case Intrinsic::amdgcn_if: 4488 case Intrinsic::amdgcn_else: { 4489 MachineInstr *Br = nullptr; 4490 MachineBasicBlock *UncondBrTarget = nullptr; 4491 bool Negated = false; 4492 if (MachineInstr *BrCond = 4493 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4494 const SIRegisterInfo *TRI 4495 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4496 4497 Register Def = MI.getOperand(1).getReg(); 4498 Register Use = MI.getOperand(3).getReg(); 4499 4500 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4501 4502 if (Negated) 4503 std::swap(CondBrTarget, UncondBrTarget); 4504 4505 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4506 if (IntrID == Intrinsic::amdgcn_if) { 4507 B.buildInstr(AMDGPU::SI_IF) 4508 .addDef(Def) 4509 .addUse(Use) 4510 .addMBB(UncondBrTarget); 4511 } else { 4512 B.buildInstr(AMDGPU::SI_ELSE) 4513 .addDef(Def) 4514 .addUse(Use) 4515 .addMBB(UncondBrTarget) 4516 .addImm(0); 4517 } 4518 4519 if (Br) { 4520 Br->getOperand(0).setMBB(CondBrTarget); 4521 } else { 4522 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4523 // since we're swapping branch targets it needs to be reinserted. 4524 // FIXME: IRTranslator should probably not do this 4525 B.buildBr(*CondBrTarget); 4526 } 4527 4528 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4529 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4530 MI.eraseFromParent(); 4531 BrCond->eraseFromParent(); 4532 return true; 4533 } 4534 4535 return false; 4536 } 4537 case Intrinsic::amdgcn_loop: { 4538 MachineInstr *Br = nullptr; 4539 MachineBasicBlock *UncondBrTarget = nullptr; 4540 bool Negated = false; 4541 if (MachineInstr *BrCond = 4542 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4543 const SIRegisterInfo *TRI 4544 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4545 4546 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4547 Register Reg = MI.getOperand(2).getReg(); 4548 4549 if (Negated) 4550 std::swap(CondBrTarget, UncondBrTarget); 4551 4552 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4553 B.buildInstr(AMDGPU::SI_LOOP) 4554 .addUse(Reg) 4555 .addMBB(UncondBrTarget); 4556 4557 if (Br) 4558 Br->getOperand(0).setMBB(CondBrTarget); 4559 else 4560 B.buildBr(*CondBrTarget); 4561 4562 MI.eraseFromParent(); 4563 BrCond->eraseFromParent(); 4564 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4565 return true; 4566 } 4567 4568 return false; 4569 } 4570 case Intrinsic::amdgcn_kernarg_segment_ptr: 4571 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4572 // This only makes sense to call in a kernel, so just lower to null. 
4573 B.buildConstant(MI.getOperand(0).getReg(), 0); 4574 MI.eraseFromParent(); 4575 return true; 4576 } 4577 4578 return legalizePreloadedArgIntrin( 4579 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4580 case Intrinsic::amdgcn_implicitarg_ptr: 4581 return legalizeImplicitArgPtr(MI, MRI, B); 4582 case Intrinsic::amdgcn_workitem_id_x: 4583 return legalizePreloadedArgIntrin(MI, MRI, B, 4584 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4585 case Intrinsic::amdgcn_workitem_id_y: 4586 return legalizePreloadedArgIntrin(MI, MRI, B, 4587 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4588 case Intrinsic::amdgcn_workitem_id_z: 4589 return legalizePreloadedArgIntrin(MI, MRI, B, 4590 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4591 case Intrinsic::amdgcn_workgroup_id_x: 4592 return legalizePreloadedArgIntrin(MI, MRI, B, 4593 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4594 case Intrinsic::amdgcn_workgroup_id_y: 4595 return legalizePreloadedArgIntrin(MI, MRI, B, 4596 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4597 case Intrinsic::amdgcn_workgroup_id_z: 4598 return legalizePreloadedArgIntrin(MI, MRI, B, 4599 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4600 case Intrinsic::amdgcn_dispatch_ptr: 4601 return legalizePreloadedArgIntrin(MI, MRI, B, 4602 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4603 case Intrinsic::amdgcn_queue_ptr: 4604 return legalizePreloadedArgIntrin(MI, MRI, B, 4605 AMDGPUFunctionArgInfo::QUEUE_PTR); 4606 case Intrinsic::amdgcn_implicit_buffer_ptr: 4607 return legalizePreloadedArgIntrin( 4608 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4609 case Intrinsic::amdgcn_dispatch_id: 4610 return legalizePreloadedArgIntrin(MI, MRI, B, 4611 AMDGPUFunctionArgInfo::DISPATCH_ID); 4612 case Intrinsic::amdgcn_fdiv_fast: 4613 return legalizeFDIVFastIntrin(MI, MRI, B); 4614 case Intrinsic::amdgcn_is_shared: 4615 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4616 case Intrinsic::amdgcn_is_private: 4617 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4618 case Intrinsic::amdgcn_wavefrontsize: { 4619 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4620 MI.eraseFromParent(); 4621 return true; 4622 } 4623 case Intrinsic::amdgcn_s_buffer_load: 4624 return legalizeSBufferLoad(Helper, MI); 4625 case Intrinsic::amdgcn_raw_buffer_store: 4626 case Intrinsic::amdgcn_struct_buffer_store: 4627 return legalizeBufferStore(MI, MRI, B, false, false); 4628 case Intrinsic::amdgcn_raw_buffer_store_format: 4629 case Intrinsic::amdgcn_struct_buffer_store_format: 4630 return legalizeBufferStore(MI, MRI, B, false, true); 4631 case Intrinsic::amdgcn_raw_tbuffer_store: 4632 case Intrinsic::amdgcn_struct_tbuffer_store: 4633 return legalizeBufferStore(MI, MRI, B, true, true); 4634 case Intrinsic::amdgcn_raw_buffer_load: 4635 case Intrinsic::amdgcn_struct_buffer_load: 4636 return legalizeBufferLoad(MI, MRI, B, false, false); 4637 case Intrinsic::amdgcn_raw_buffer_load_format: 4638 case Intrinsic::amdgcn_struct_buffer_load_format: 4639 return legalizeBufferLoad(MI, MRI, B, true, false); 4640 case Intrinsic::amdgcn_raw_tbuffer_load: 4641 case Intrinsic::amdgcn_struct_tbuffer_load: 4642 return legalizeBufferLoad(MI, MRI, B, true, true); 4643 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4644 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4645 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4646 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4647 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4648 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4649 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4650 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4651 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4652 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4653 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4654 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4655 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4656 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4657 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4658 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4659 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4660 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4661 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4662 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4663 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4664 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4665 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4666 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4667 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4668 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4669 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4670 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4671 return legalizeBufferAtomic(MI, B, IntrID); 4672 case Intrinsic::amdgcn_atomic_inc: 4673 return legalizeAtomicIncDec(MI, B, true); 4674 case Intrinsic::amdgcn_atomic_dec: 4675 return legalizeAtomicIncDec(MI, B, false); 4676 case Intrinsic::trap: 4677 return legalizeTrapIntrinsic(MI, MRI, B); 4678 case Intrinsic::debugtrap: 4679 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4680 case Intrinsic::amdgcn_rsq_clamp: 4681 return legalizeRsqClampIntrinsic(MI, MRI, B); 4682 case Intrinsic::amdgcn_ds_fadd: 4683 case Intrinsic::amdgcn_ds_fmin: 4684 case Intrinsic::amdgcn_ds_fmax: 4685 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4686 default: { 4687 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4688 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4689 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4690 return true; 4691 } 4692 } 4693 4694 return true; 4695 } 4696