//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
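/// For example, <5 x s16> (80 bits) also matches and is padded to <6 x s16> by
/// oneMoreElement(), whereas <3 x s32> does not match since its element size is
/// already 32 bits.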
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
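/// For example, on a subtarget without 96-bit (dwordx3) memory operations, a
/// 96-bit global load whose known alignment is at least 128 bits may be
/// widened to a single 128-bit load, since the bytes up to the alignment are
/// known dereferenceable.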
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 375 unsigned AlignInBits, unsigned AddrSpace, 376 unsigned Opcode) { 377 // We don't want to widen cases that are naturally legal. 378 if (isPowerOf2_32(SizeInBits)) 379 return false; 380 381 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 382 // end up widening these for a scalar load during RegBankSelect, since there 383 // aren't 96-bit scalar loads. 384 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 385 return false; 386 387 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 388 return false; 389 390 // A load is known dereferenceable up to the alignment, so it's legal to widen 391 // to it. 392 // 393 // TODO: Could check dereferenceable for less aligned cases. 394 unsigned RoundedSize = NextPowerOf2(SizeInBits); 395 if (AlignInBits < RoundedSize) 396 return false; 397 398 // Do not widen if it would introduce a slow unaligned load. 399 const SITargetLowering *TLI = ST.getTargetLowering(); 400 bool Fast = false; 401 return TLI->allowsMisalignedMemoryAccessesImpl( 402 RoundedSize, AddrSpace, Align(AlignInBits / 8), 403 MachineMemOperand::MOLoad, &Fast) && 404 Fast; 405 } 406 407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 408 unsigned Opcode) { 409 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 410 return false; 411 412 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 413 Query.MMODescrs[0].AlignInBits, 414 Query.Types[1].getAddressSpace(), Opcode); 415 } 416 417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 418 const GCNTargetMachine &TM) 419 : ST(ST_) { 420 using namespace TargetOpcode; 421 422 auto GetAddrSpacePtr = [&TM](unsigned AS) { 423 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 424 }; 425 426 const LLT S1 = LLT::scalar(1); 427 const LLT S8 = LLT::scalar(8); 428 const LLT S16 = LLT::scalar(16); 429 const LLT S32 = LLT::scalar(32); 430 const LLT S64 = LLT::scalar(64); 431 const LLT S128 = LLT::scalar(128); 432 const LLT S256 = LLT::scalar(256); 433 const LLT S512 = LLT::scalar(512); 434 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 435 436 const LLT V2S8 = LLT::vector(2, 8); 437 const LLT V2S16 = LLT::vector(2, 16); 438 const LLT V4S16 = LLT::vector(4, 16); 439 440 const LLT V2S32 = LLT::vector(2, 32); 441 const LLT V3S32 = LLT::vector(3, 32); 442 const LLT V4S32 = LLT::vector(4, 32); 443 const LLT V5S32 = LLT::vector(5, 32); 444 const LLT V6S32 = LLT::vector(6, 32); 445 const LLT V7S32 = LLT::vector(7, 32); 446 const LLT V8S32 = LLT::vector(8, 32); 447 const LLT V9S32 = LLT::vector(9, 32); 448 const LLT V10S32 = LLT::vector(10, 32); 449 const LLT V11S32 = LLT::vector(11, 32); 450 const LLT V12S32 = LLT::vector(12, 32); 451 const LLT V13S32 = LLT::vector(13, 32); 452 const LLT V14S32 = LLT::vector(14, 32); 453 const LLT V15S32 = LLT::vector(15, 32); 454 const LLT V16S32 = LLT::vector(16, 32); 455 const LLT V32S32 = LLT::vector(32, 32); 456 457 const LLT V2S64 = LLT::vector(2, 64); 458 const LLT V3S64 = LLT::vector(3, 64); 459 const LLT V4S64 = LLT::vector(4, 64); 460 const LLT V5S64 = LLT::vector(5, 64); 461 const LLT V6S64 = LLT::vector(6, 64); 462 const LLT V7S64 = LLT::vector(7, 64); 463 const LLT V8S64 = LLT::vector(8, 64); 464 const LLT V16S64 = LLT::vector(16, 64); 465 466 std::initializer_list<LLT> AllS32Vectors = 467 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 468 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 469 
std::initializer_list<LLT> AllS64Vectors = 470 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 471 472 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 473 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 474 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 475 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 476 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 477 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 478 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 479 480 const LLT CodePtr = FlatPtr; 481 482 const std::initializer_list<LLT> AddrSpaces64 = { 483 GlobalPtr, ConstantPtr, FlatPtr 484 }; 485 486 const std::initializer_list<LLT> AddrSpaces32 = { 487 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 488 }; 489 490 const std::initializer_list<LLT> FPTypesBase = { 491 S32, S64 492 }; 493 494 const std::initializer_list<LLT> FPTypes16 = { 495 S32, S64, S16 496 }; 497 498 const std::initializer_list<LLT> FPTypesPK16 = { 499 S32, S64, S16, V2S16 500 }; 501 502 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 503 504 // s1 for VCC branches, s32 for SCC branches. 505 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 506 507 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 508 // elements for v3s16 509 getActionDefinitionsBuilder(G_PHI) 510 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 511 .legalFor(AllS32Vectors) 512 .legalFor(AllS64Vectors) 513 .legalFor(AddrSpaces64) 514 .legalFor(AddrSpaces32) 515 .legalIf(isPointer(0)) 516 .clampScalar(0, S16, S256) 517 .widenScalarToNextPow2(0, 32) 518 .clampMaxNumElements(0, S32, 16) 519 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 520 .scalarize(0); 521 522 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 523 // Full set of gfx9 features. 524 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 525 .legalFor({S32, S16, V2S16}) 526 .clampScalar(0, S16, S32) 527 .clampMaxNumElements(0, S16, 2) 528 .scalarize(0) 529 .widenScalarToNextPow2(0, 32); 530 531 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 532 .legalFor({S32, S16, V2S16}) // Clamp modifier 533 .minScalarOrElt(0, S16) 534 .clampMaxNumElements(0, S16, 2) 535 .scalarize(0) 536 .widenScalarToNextPow2(0, 32) 537 .lower(); 538 } else if (ST.has16BitInsts()) { 539 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 540 .legalFor({S32, S16}) 541 .clampScalar(0, S16, S32) 542 .scalarize(0) 543 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 544 545 // Technically the saturating operations require clamp bit support, but this 546 // was introduced at the same time as 16-bit operations. 547 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 548 .legalFor({S32, S16}) // Clamp modifier 549 .minScalar(0, S16) 550 .scalarize(0) 551 .widenScalarToNextPow2(0, 16) 552 .lower(); 553 554 // We're just lowering this, but it helps get a better result to try to 555 // coerce to the desired type first. 556 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 557 .minScalar(0, S16) 558 .scalarize(0) 559 .lower(); 560 } else { 561 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 562 .legalFor({S32}) 563 .clampScalar(0, S32, S32) 564 .scalarize(0); 565 566 if (ST.hasIntClamp()) { 567 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 568 .legalFor({S32}) // Clamp modifier. 
569 .scalarize(0) 570 .minScalarOrElt(0, S32) 571 .lower(); 572 } else { 573 // Clamp bit support was added in VI, along with 16-bit operations. 574 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 575 .minScalar(0, S32) 576 .scalarize(0) 577 .lower(); 578 } 579 580 // FIXME: DAG expansion gets better results. The widening uses the smaller 581 // range values and goes for the min/max lowering directly. 582 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 583 .minScalar(0, S32) 584 .scalarize(0) 585 .lower(); 586 } 587 588 getActionDefinitionsBuilder( 589 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 590 .customFor({S32, S64}) 591 .clampScalar(0, S32, S64) 592 .widenScalarToNextPow2(0, 32) 593 .scalarize(0); 594 595 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 596 .legalFor({S32}) 597 .maxScalarOrElt(0, S32); 598 599 if (ST.hasVOP3PInsts()) { 600 Mulh 601 .clampMaxNumElements(0, S8, 2) 602 .lowerFor({V2S8}); 603 } 604 605 Mulh 606 .scalarize(0) 607 .lower(); 608 609 // Report legal for any types we can handle anywhere. For the cases only legal 610 // on the SALU, RegBankSelect will be able to re-legalize. 611 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 612 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 613 .clampScalar(0, S32, S64) 614 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 615 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 616 .widenScalarToNextPow2(0) 617 .scalarize(0); 618 619 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 620 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 621 .legalFor({{S32, S1}, {S32, S32}}) 622 .minScalar(0, S32) 623 // TODO: .scalarize(0) 624 .lower(); 625 626 getActionDefinitionsBuilder(G_BITCAST) 627 // Don't worry about the size constraint. 628 .legalIf(all(isRegisterType(0), isRegisterType(1))) 629 .lower(); 630 631 632 getActionDefinitionsBuilder(G_CONSTANT) 633 .legalFor({S1, S32, S64, S16, GlobalPtr, 634 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 635 .legalIf(isPointer(0)) 636 .clampScalar(0, S32, S64) 637 .widenScalarToNextPow2(0); 638 639 getActionDefinitionsBuilder(G_FCONSTANT) 640 .legalFor({S32, S64, S16}) 641 .clampScalar(0, S16, S64); 642 643 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 644 .legalIf(isRegisterType(0)) 645 // s1 and s16 are special cases because they have legal operations on 646 // them, but don't really occupy registers in the normal way. 647 .legalFor({S1, S16}) 648 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 649 .clampScalarOrElt(0, S32, MaxScalar) 650 .widenScalarToNextPow2(0, 32) 651 .clampMaxNumElements(0, S32, 16); 652 653 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 654 655 // If the amount is divergent, we have to do a wave reduction to get the 656 // maximum value, so this is expanded during RegBankSelect. 
657 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 658 .legalFor({{PrivatePtr, S32}}); 659 660 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 661 .customIf(typeIsNot(0, PrivatePtr)); 662 663 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 664 665 auto &FPOpActions = getActionDefinitionsBuilder( 666 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 667 .legalFor({S32, S64}); 668 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 669 .customFor({S32, S64}); 670 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 671 .customFor({S32, S64}); 672 673 if (ST.has16BitInsts()) { 674 if (ST.hasVOP3PInsts()) 675 FPOpActions.legalFor({S16, V2S16}); 676 else 677 FPOpActions.legalFor({S16}); 678 679 TrigActions.customFor({S16}); 680 FDIVActions.customFor({S16}); 681 } 682 683 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 684 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 685 686 if (ST.hasVOP3PInsts()) { 687 MinNumMaxNum.customFor(FPTypesPK16) 688 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 689 .clampMaxNumElements(0, S16, 2) 690 .clampScalar(0, S16, S64) 691 .scalarize(0); 692 } else if (ST.has16BitInsts()) { 693 MinNumMaxNum.customFor(FPTypes16) 694 .clampScalar(0, S16, S64) 695 .scalarize(0); 696 } else { 697 MinNumMaxNum.customFor(FPTypesBase) 698 .clampScalar(0, S32, S64) 699 .scalarize(0); 700 } 701 702 if (ST.hasVOP3PInsts()) 703 FPOpActions.clampMaxNumElements(0, S16, 2); 704 705 FPOpActions 706 .scalarize(0) 707 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 708 709 TrigActions 710 .scalarize(0) 711 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 712 713 FDIVActions 714 .scalarize(0) 715 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 716 717 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 718 .legalFor(FPTypesPK16) 719 .clampMaxNumElements(0, S16, 2) 720 .scalarize(0) 721 .clampScalar(0, S16, S64); 722 723 if (ST.has16BitInsts()) { 724 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 725 .legalFor({S32, S64, S16}) 726 .scalarize(0) 727 .clampScalar(0, S16, S64); 728 } else { 729 getActionDefinitionsBuilder(G_FSQRT) 730 .legalFor({S32, S64}) 731 .scalarize(0) 732 .clampScalar(0, S32, S64); 733 734 if (ST.hasFractBug()) { 735 getActionDefinitionsBuilder(G_FFLOOR) 736 .customFor({S64}) 737 .legalFor({S32, S64}) 738 .scalarize(0) 739 .clampScalar(0, S32, S64); 740 } else { 741 getActionDefinitionsBuilder(G_FFLOOR) 742 .legalFor({S32, S64}) 743 .scalarize(0) 744 .clampScalar(0, S32, S64); 745 } 746 } 747 748 getActionDefinitionsBuilder(G_FPTRUNC) 749 .legalFor({{S32, S64}, {S16, S32}}) 750 .scalarize(0) 751 .lower(); 752 753 getActionDefinitionsBuilder(G_FPEXT) 754 .legalFor({{S64, S32}, {S32, S16}}) 755 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 756 .scalarize(0); 757 758 getActionDefinitionsBuilder(G_FSUB) 759 // Use actual fsub instruction 760 .legalFor({S32}) 761 // Must use fadd + fneg 762 .lowerFor({S64, S16, V2S16}) 763 .scalarize(0) 764 .clampScalar(0, S32, S64); 765 766 // Whether this is legal depends on the floating point mode for the function. 
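// G_FMAD is only reported as custom for the widths the subtarget has mad/mac
// instructions for; legalizeFMad() then consults the function's FP (denormal)
// mode to decide whether the operation can be kept or must be expanded.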
767 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 768 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 769 FMad.customFor({S32, S16}); 770 else if (ST.hasMadMacF32Insts()) 771 FMad.customFor({S32}); 772 else if (ST.hasMadF16()) 773 FMad.customFor({S16}); 774 FMad.scalarize(0) 775 .lower(); 776 777 auto &FRem = getActionDefinitionsBuilder(G_FREM); 778 if (ST.has16BitInsts()) { 779 FRem.customFor({S16, S32, S64}); 780 } else { 781 FRem.minScalar(0, S32) 782 .customFor({S32, S64}); 783 } 784 FRem.scalarize(0); 785 786 // TODO: Do we need to clamp maximum bitwidth? 787 getActionDefinitionsBuilder(G_TRUNC) 788 .legalIf(isScalar(0)) 789 .legalFor({{V2S16, V2S32}}) 790 .clampMaxNumElements(0, S16, 2) 791 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 792 // situations (like an invalid implicit use), we don't want to infinite loop 793 // in the legalizer. 794 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 795 .alwaysLegal(); 796 797 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 798 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 799 {S32, S1}, {S64, S1}, {S16, S1}}) 800 .scalarize(0) 801 .clampScalar(0, S32, S64) 802 .widenScalarToNextPow2(1, 32); 803 804 // TODO: Split s1->s64 during regbankselect for VALU. 805 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 806 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 807 .lowerFor({{S32, S64}}) 808 .lowerIf(typeIs(1, S1)) 809 .customFor({{S64, S64}}); 810 if (ST.has16BitInsts()) 811 IToFP.legalFor({{S16, S16}}); 812 IToFP.clampScalar(1, S32, S64) 813 .minScalar(0, S32) 814 .scalarize(0) 815 .widenScalarToNextPow2(1); 816 817 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 818 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 819 .customFor({{S64, S32}, {S64, S64}}) 820 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 821 if (ST.has16BitInsts()) 822 FPToI.legalFor({{S16, S16}}); 823 else 824 FPToI.minScalar(1, S32); 825 826 FPToI.minScalar(0, S32) 827 .widenScalarToNextPow2(0, 32) 828 .scalarize(0) 829 .lower(); 830 831 // Lower roundeven into G_FRINT 832 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 833 .scalarize(0) 834 .lower(); 835 836 if (ST.has16BitInsts()) { 837 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 838 .legalFor({S16, S32, S64}) 839 .clampScalar(0, S16, S64) 840 .scalarize(0); 841 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 842 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 843 .legalFor({S32, S64}) 844 .clampScalar(0, S32, S64) 845 .scalarize(0); 846 } else { 847 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 848 .legalFor({S32}) 849 .customFor({S64}) 850 .clampScalar(0, S32, S64) 851 .scalarize(0); 852 } 853 854 getActionDefinitionsBuilder(G_PTR_ADD) 855 .legalIf(all(isPointer(0), sameSize(0, 1))) 856 .scalarize(0) 857 .scalarSameSizeAs(1, 0); 858 859 getActionDefinitionsBuilder(G_PTRMASK) 860 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 861 .scalarSameSizeAs(1, 0) 862 .scalarize(0); 863 864 auto &CmpBuilder = 865 getActionDefinitionsBuilder(G_ICMP) 866 // The compare output type differs based on the register bank of the output, 867 // so make both s1 and s32 legal. 868 // 869 // Scalar compares producing output in scc will be promoted to s32, as that 870 // is the allocatable register type that will be needed for the copy from 871 // scc. 
This will be promoted during RegBankSelect, and we assume something 872 // before that won't try to use s32 result types. 873 // 874 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 875 // bank. 876 .legalForCartesianProduct( 877 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 878 .legalForCartesianProduct( 879 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 880 if (ST.has16BitInsts()) { 881 CmpBuilder.legalFor({{S1, S16}}); 882 } 883 884 CmpBuilder 885 .widenScalarToNextPow2(1) 886 .clampScalar(1, S32, S64) 887 .scalarize(0) 888 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 889 890 getActionDefinitionsBuilder(G_FCMP) 891 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 892 .widenScalarToNextPow2(1) 893 .clampScalar(1, S32, S64) 894 .scalarize(0); 895 896 // FIXME: fpow has a selection pattern that should move to custom lowering. 897 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 898 if (ST.has16BitInsts()) 899 Exp2Ops.legalFor({S32, S16}); 900 else 901 Exp2Ops.legalFor({S32}); 902 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 903 Exp2Ops.scalarize(0); 904 905 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 906 if (ST.has16BitInsts()) 907 ExpOps.customFor({{S32}, {S16}}); 908 else 909 ExpOps.customFor({S32}); 910 ExpOps.clampScalar(0, MinScalarFPTy, S32) 911 .scalarize(0); 912 913 getActionDefinitionsBuilder(G_FPOWI) 914 .clampScalar(0, MinScalarFPTy, S32) 915 .lower(); 916 917 // The 64-bit versions produce 32-bit results, but only on the SALU. 918 getActionDefinitionsBuilder(G_CTPOP) 919 .legalFor({{S32, S32}, {S32, S64}}) 920 .clampScalar(0, S32, S32) 921 .clampScalar(1, S32, S64) 922 .scalarize(0) 923 .widenScalarToNextPow2(0, 32) 924 .widenScalarToNextPow2(1, 32); 925 926 // The hardware instructions return a different result on 0 than the generic 927 // instructions expect. The hardware produces -1, but these produce the 928 // bitwidth. 929 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 930 .scalarize(0) 931 .clampScalar(0, S32, S32) 932 .clampScalar(1, S32, S64) 933 .widenScalarToNextPow2(0, 32) 934 .widenScalarToNextPow2(1, 32) 935 .lower(); 936 937 // The 64-bit versions produce 32-bit results, but only on the SALU. 938 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 939 .legalFor({{S32, S32}, {S32, S64}}) 940 .clampScalar(0, S32, S32) 941 .clampScalar(1, S32, S64) 942 .scalarize(0) 943 .widenScalarToNextPow2(0, 32) 944 .widenScalarToNextPow2(1, 32); 945 946 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 947 // RegBankSelect. 948 getActionDefinitionsBuilder(G_BITREVERSE) 949 .legalFor({S32, S64}) 950 .clampScalar(0, S32, S64) 951 .scalarize(0) 952 .widenScalarToNextPow2(0); 953 954 if (ST.has16BitInsts()) { 955 getActionDefinitionsBuilder(G_BSWAP) 956 .legalFor({S16, S32, V2S16}) 957 .clampMaxNumElements(0, S16, 2) 958 // FIXME: Fixing non-power-of-2 before clamp is workaround for 959 // narrowScalar limitation. 
960 .widenScalarToNextPow2(0) 961 .clampScalar(0, S16, S32) 962 .scalarize(0); 963 964 if (ST.hasVOP3PInsts()) { 965 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 966 .legalFor({S32, S16, V2S16}) 967 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 968 .clampMaxNumElements(0, S16, 2) 969 .minScalar(0, S16) 970 .widenScalarToNextPow2(0) 971 .scalarize(0) 972 .lower(); 973 } else { 974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 975 .legalFor({S32, S16}) 976 .widenScalarToNextPow2(0) 977 .minScalar(0, S16) 978 .scalarize(0) 979 .lower(); 980 } 981 } else { 982 // TODO: Should have same legality without v_perm_b32 983 getActionDefinitionsBuilder(G_BSWAP) 984 .legalFor({S32}) 985 .lowerIf(scalarNarrowerThan(0, 32)) 986 // FIXME: Fixing non-power-of-2 before clamp is workaround for 987 // narrowScalar limitation. 988 .widenScalarToNextPow2(0) 989 .maxScalar(0, S32) 990 .scalarize(0) 991 .lower(); 992 993 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 994 .legalFor({S32}) 995 .minScalar(0, S32) 996 .widenScalarToNextPow2(0) 997 .scalarize(0) 998 .lower(); 999 } 1000 1001 getActionDefinitionsBuilder(G_INTTOPTR) 1002 // List the common cases 1003 .legalForCartesianProduct(AddrSpaces64, {S64}) 1004 .legalForCartesianProduct(AddrSpaces32, {S32}) 1005 .scalarize(0) 1006 // Accept any address space as long as the size matches 1007 .legalIf(sameSize(0, 1)) 1008 .widenScalarIf(smallerThan(1, 0), 1009 [](const LegalityQuery &Query) { 1010 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1011 }) 1012 .narrowScalarIf(largerThan(1, 0), 1013 [](const LegalityQuery &Query) { 1014 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1015 }); 1016 1017 getActionDefinitionsBuilder(G_PTRTOINT) 1018 // List the common cases 1019 .legalForCartesianProduct(AddrSpaces64, {S64}) 1020 .legalForCartesianProduct(AddrSpaces32, {S32}) 1021 .scalarize(0) 1022 // Accept any address space as long as the size matches 1023 .legalIf(sameSize(0, 1)) 1024 .widenScalarIf(smallerThan(0, 1), 1025 [](const LegalityQuery &Query) { 1026 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1027 }) 1028 .narrowScalarIf( 1029 largerThan(0, 1), 1030 [](const LegalityQuery &Query) { 1031 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1032 }); 1033 1034 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1035 .scalarize(0) 1036 .custom(); 1037 1038 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1039 bool IsLoad) -> bool { 1040 const LLT DstTy = Query.Types[0]; 1041 1042 // Split vector extloads. 1043 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1044 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1045 1046 if (MemSize < DstTy.getSizeInBits()) 1047 MemSize = std::max(MemSize, AlignBits); 1048 1049 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1050 return true; 1051 1052 const LLT PtrTy = Query.Types[1]; 1053 unsigned AS = PtrTy.getAddressSpace(); 1054 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1055 return true; 1056 1057 // Catch weird sized loads that don't evenly divide into the access sizes 1058 // TODO: May be able to widen depending on alignment etc. 1059 unsigned NumRegs = (MemSize + 31) / 32; 1060 if (NumRegs == 3) { 1061 if (!ST.hasDwordx3LoadStores()) 1062 return true; 1063 } else { 1064 // If the alignment allows, these should have been widened. 
1065 if (!isPowerOf2_32(NumRegs)) 1066 return true; 1067 } 1068 1069 if (AlignBits < MemSize) { 1070 const SITargetLowering *TLI = ST.getTargetLowering(); 1071 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1072 Align(AlignBits / 8)); 1073 } 1074 1075 return false; 1076 }; 1077 1078 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1079 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1080 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1081 1082 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1083 // LDS 1084 // TODO: Unsupported flat for SI. 1085 1086 for (unsigned Op : {G_LOAD, G_STORE}) { 1087 const bool IsStore = Op == G_STORE; 1088 1089 auto &Actions = getActionDefinitionsBuilder(Op); 1090 // Explicitly list some common cases. 1091 // TODO: Does this help compile time at all? 1092 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1093 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1094 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1095 {S64, GlobalPtr, 64, GlobalAlign32}, 1096 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1097 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1098 {S32, GlobalPtr, 8, GlobalAlign8}, 1099 {S32, GlobalPtr, 16, GlobalAlign16}, 1100 1101 {S32, LocalPtr, 32, 32}, 1102 {S64, LocalPtr, 64, 32}, 1103 {V2S32, LocalPtr, 64, 32}, 1104 {S32, LocalPtr, 8, 8}, 1105 {S32, LocalPtr, 16, 16}, 1106 {V2S16, LocalPtr, 32, 32}, 1107 1108 {S32, PrivatePtr, 32, 32}, 1109 {S32, PrivatePtr, 8, 8}, 1110 {S32, PrivatePtr, 16, 16}, 1111 {V2S16, PrivatePtr, 32, 32}, 1112 1113 {S32, ConstantPtr, 32, GlobalAlign32}, 1114 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1115 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1116 {S64, ConstantPtr, 64, GlobalAlign32}, 1117 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1118 Actions.legalIf( 1119 [=](const LegalityQuery &Query) -> bool { 1120 return isLoadStoreLegal(ST, Query); 1121 }); 1122 1123 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1124 // 64-bits. 1125 // 1126 // TODO: Should generalize bitcast action into coerce, which will also cover 1127 // inserting addrspacecasts. 1128 Actions.customIf(typeIs(1, Constant32Ptr)); 1129 1130 // Turn any illegal element vectors into something easier to deal 1131 // with. These will ultimately produce 32-bit scalar shifts to extract the 1132 // parts anyway. 1133 // 1134 // For odd 16-bit element vectors, prefer to split those into pieces with 1135 // 16-bit vector parts. 1136 Actions.bitcastIf( 1137 [=](const LegalityQuery &Query) -> bool { 1138 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1139 Query.MMODescrs[0].SizeInBits); 1140 }, bitcastToRegisterType(0)); 1141 1142 if (!IsStore) { 1143 // Widen suitably aligned loads by loading extra bytes. The standard 1144 // legalization actions can't properly express widening memory operands. 
1145 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1146 return shouldWidenLoad(ST, Query, G_LOAD); 1147 }); 1148 } 1149 1150 // FIXME: load/store narrowing should be moved to lower action 1151 Actions 1152 .narrowScalarIf( 1153 [=](const LegalityQuery &Query) -> bool { 1154 return !Query.Types[0].isVector() && 1155 needToSplitMemOp(Query, Op == G_LOAD); 1156 }, 1157 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1158 const LLT DstTy = Query.Types[0]; 1159 const LLT PtrTy = Query.Types[1]; 1160 1161 const unsigned DstSize = DstTy.getSizeInBits(); 1162 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1163 1164 // Split extloads. 1165 if (DstSize > MemSize) 1166 return std::make_pair(0, LLT::scalar(MemSize)); 1167 1168 if (!isPowerOf2_32(DstSize)) { 1169 // We're probably decomposing an odd sized store. Try to split 1170 // to the widest type. TODO: Account for alignment. As-is it 1171 // should be OK, since the new parts will be further legalized. 1172 unsigned FloorSize = PowerOf2Floor(DstSize); 1173 return std::make_pair(0, LLT::scalar(FloorSize)); 1174 } 1175 1176 if (DstSize > 32 && (DstSize % 32 != 0)) { 1177 // FIXME: Need a way to specify non-extload of larger size if 1178 // suitably aligned. 1179 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1180 } 1181 1182 unsigned MaxSize = maxSizeForAddrSpace(ST, 1183 PtrTy.getAddressSpace(), 1184 Op == G_LOAD); 1185 if (MemSize > MaxSize) 1186 return std::make_pair(0, LLT::scalar(MaxSize)); 1187 1188 unsigned Align = Query.MMODescrs[0].AlignInBits; 1189 return std::make_pair(0, LLT::scalar(Align)); 1190 }) 1191 .fewerElementsIf( 1192 [=](const LegalityQuery &Query) -> bool { 1193 return Query.Types[0].isVector() && 1194 needToSplitMemOp(Query, Op == G_LOAD); 1195 }, 1196 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1197 const LLT DstTy = Query.Types[0]; 1198 const LLT PtrTy = Query.Types[1]; 1199 1200 LLT EltTy = DstTy.getElementType(); 1201 unsigned MaxSize = maxSizeForAddrSpace(ST, 1202 PtrTy.getAddressSpace(), 1203 Op == G_LOAD); 1204 1205 // FIXME: Handle widened to power of 2 results better. This ends 1206 // up scalarizing. 1207 // FIXME: 3 element stores scalarized on SI 1208 1209 // Split if it's too large for the address space. 1210 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1211 unsigned NumElts = DstTy.getNumElements(); 1212 unsigned EltSize = EltTy.getSizeInBits(); 1213 1214 if (MaxSize % EltSize == 0) { 1215 return std::make_pair( 1216 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1217 } 1218 1219 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1220 1221 // FIXME: Refine when odd breakdowns handled 1222 // The scalars will need to be re-legalized. 1223 if (NumPieces == 1 || NumPieces >= NumElts || 1224 NumElts % NumPieces != 0) 1225 return std::make_pair(0, EltTy); 1226 1227 return std::make_pair(0, 1228 LLT::vector(NumElts / NumPieces, EltTy)); 1229 } 1230 1231 // FIXME: We could probably handle weird extending loads better. 1232 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1233 if (DstTy.getSizeInBits() > MemSize) 1234 return std::make_pair(0, EltTy); 1235 1236 unsigned EltSize = EltTy.getSizeInBits(); 1237 unsigned DstSize = DstTy.getSizeInBits(); 1238 if (!isPowerOf2_32(DstSize)) { 1239 // We're probably decomposing an odd sized store. Try to split 1240 // to the widest type. TODO: Account for alignment. As-is it 1241 // should be OK, since the new parts will be further legalized. 
1242 unsigned FloorSize = PowerOf2Floor(DstSize); 1243 return std::make_pair( 1244 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1245 } 1246 1247 // Need to split because of alignment. 1248 unsigned Align = Query.MMODescrs[0].AlignInBits; 1249 if (EltSize > Align && 1250 (EltSize / Align < DstTy.getNumElements())) { 1251 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1252 } 1253 1254 // May need relegalization for the scalars. 1255 return std::make_pair(0, EltTy); 1256 }) 1257 .lowerIfMemSizeNotPow2() 1258 .minScalar(0, S32) 1259 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1260 .widenScalarToNextPow2(0) 1261 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1262 .lower(); 1263 } 1264 1265 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1266 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1267 {S32, GlobalPtr, 16, 2 * 8}, 1268 {S32, LocalPtr, 8, 8}, 1269 {S32, LocalPtr, 16, 16}, 1270 {S32, PrivatePtr, 8, 8}, 1271 {S32, PrivatePtr, 16, 16}, 1272 {S32, ConstantPtr, 8, 8}, 1273 {S32, ConstantPtr, 16, 2 * 8}}) 1274 .legalIf( 1275 [=](const LegalityQuery &Query) -> bool { 1276 return isLoadStoreLegal(ST, Query); 1277 }); 1278 1279 if (ST.hasFlatAddressSpace()) { 1280 ExtLoads.legalForTypesWithMemDesc( 1281 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1282 } 1283 1284 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1285 // 64-bits. 1286 // 1287 // TODO: Should generalize bitcast action into coerce, which will also cover 1288 // inserting addrspacecasts. 1289 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1290 1291 ExtLoads.clampScalar(0, S32, S32) 1292 .widenScalarToNextPow2(0) 1293 .unsupportedIfMemSizeNotPow2() 1294 .lower(); 1295 1296 auto &Atomics = getActionDefinitionsBuilder( 1297 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1298 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1299 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1300 G_ATOMICRMW_UMIN}) 1301 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1302 {S64, GlobalPtr}, {S64, LocalPtr}, 1303 {S32, RegionPtr}, {S64, RegionPtr}}); 1304 if (ST.hasFlatAddressSpace()) { 1305 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1306 } 1307 1308 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1309 if (ST.hasLDSFPAtomics()) { 1310 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1311 if (ST.hasGFX90AInsts()) 1312 Atomic.legalFor({{S64, LocalPtr}}); 1313 } 1314 if (ST.hasAtomicFaddInsts()) 1315 Atomic.legalFor({{S32, GlobalPtr}}); 1316 1317 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1318 // demarshalling 1319 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1320 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1321 {S32, FlatPtr}, {S64, FlatPtr}}) 1322 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1323 {S32, RegionPtr}, {S64, RegionPtr}}); 1324 // TODO: Pointer types, any 32-bit or 64-bit vector 1325 1326 // Condition should be s32 for scalar, s1 for vector. 
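// Illustrative example of the rules below: a G_SELECT producing <3 x s16> is
// first padded to <4 x s16> by oneMoreElement(0), which is already in the
// legal value-type set for either condition type.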
1327 getActionDefinitionsBuilder(G_SELECT) 1328 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1329 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1330 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1331 .clampScalar(0, S16, S64) 1332 .scalarize(1) 1333 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1334 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1335 .clampMaxNumElements(0, S32, 2) 1336 .clampMaxNumElements(0, LocalPtr, 2) 1337 .clampMaxNumElements(0, PrivatePtr, 2) 1338 .scalarize(0) 1339 .widenScalarToNextPow2(0) 1340 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1341 1342 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1343 // be more flexible with the shift amount type. 1344 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1345 .legalFor({{S32, S32}, {S64, S32}}); 1346 if (ST.has16BitInsts()) { 1347 if (ST.hasVOP3PInsts()) { 1348 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1349 .clampMaxNumElements(0, S16, 2); 1350 } else 1351 Shifts.legalFor({{S16, S16}}); 1352 1353 // TODO: Support 16-bit shift amounts for all types 1354 Shifts.widenScalarIf( 1355 [=](const LegalityQuery &Query) { 1356 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1357 // 32-bit amount. 1358 const LLT ValTy = Query.Types[0]; 1359 const LLT AmountTy = Query.Types[1]; 1360 return ValTy.getSizeInBits() <= 16 && 1361 AmountTy.getSizeInBits() < 16; 1362 }, changeTo(1, S16)); 1363 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1364 Shifts.clampScalar(1, S32, S32); 1365 Shifts.clampScalar(0, S16, S64); 1366 Shifts.widenScalarToNextPow2(0, 16); 1367 1368 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1369 .minScalar(0, S16) 1370 .scalarize(0) 1371 .lower(); 1372 } else { 1373 // Make sure we legalize the shift amount type first, as the general 1374 // expansion for the shifted type will produce much worse code if it hasn't 1375 // been truncated already. 1376 Shifts.clampScalar(1, S32, S32); 1377 Shifts.clampScalar(0, S32, S64); 1378 Shifts.widenScalarToNextPow2(0, 32); 1379 1380 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1381 .minScalar(0, S32) 1382 .scalarize(0) 1383 .lower(); 1384 } 1385 Shifts.scalarize(0); 1386 1387 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1388 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1389 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1390 unsigned IdxTypeIdx = 2; 1391 1392 getActionDefinitionsBuilder(Op) 1393 .customIf([=](const LegalityQuery &Query) { 1394 const LLT EltTy = Query.Types[EltTypeIdx]; 1395 const LLT VecTy = Query.Types[VecTypeIdx]; 1396 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1397 const unsigned EltSize = EltTy.getSizeInBits(); 1398 return (EltSize == 32 || EltSize == 64) && 1399 VecTy.getSizeInBits() % 32 == 0 && 1400 VecTy.getSizeInBits() <= MaxRegisterSize && 1401 IdxTy.getSizeInBits() == 32; 1402 }) 1403 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1404 bitcastToVectorElement32(VecTypeIdx)) 1405 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1406 .bitcastIf( 1407 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1408 [=](const LegalityQuery &Query) { 1409 // For > 64-bit element types, try to turn this into a 64-bit 1410 // element vector since we may be able to do better indexing 1411 // if this is scalar. If not, fall back to 32. 
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
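  // As the rules below show, only a <2 x s16> shuffle of <2 x s16> sources gets
  // a custom expansion, and only when packed VOP3P instructions are available;
  // every other G_SHUFFLE_VECTOR is handled by the generic lower() action.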
1506 if (ST.hasVOP3PInsts()) { 1507 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1508 .customFor({V2S16, V2S16}) 1509 .lower(); 1510 } else 1511 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1512 1513 // Merge/Unmerge 1514 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1515 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1516 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1517 1518 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1519 const LLT Ty = Query.Types[TypeIdx]; 1520 if (Ty.isVector()) { 1521 const LLT &EltTy = Ty.getElementType(); 1522 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1523 return true; 1524 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1525 return true; 1526 } 1527 return false; 1528 }; 1529 1530 auto &Builder = getActionDefinitionsBuilder(Op) 1531 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1532 .lowerFor({{S16, V2S16}}) 1533 .lowerIf([=](const LegalityQuery &Query) { 1534 const LLT BigTy = Query.Types[BigTyIdx]; 1535 return BigTy.getSizeInBits() == 32; 1536 }) 1537 // Try to widen to s16 first for small types. 1538 // TODO: Only do this on targets with legal s16 shifts 1539 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1540 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1541 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1542 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1543 elementTypeIs(1, S16)), 1544 changeTo(1, V2S16)) 1545 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1546 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1547 // valid. 1548 .clampScalar(LitTyIdx, S32, S512) 1549 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1550 // Break up vectors with weird elements into scalars 1551 .fewerElementsIf( 1552 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1553 scalarize(0)) 1554 .fewerElementsIf( 1555 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1556 scalarize(1)) 1557 .clampScalar(BigTyIdx, S32, MaxScalar); 1558 1559 if (Op == G_MERGE_VALUES) { 1560 Builder.widenScalarIf( 1561 // TODO: Use 16-bit shifts if legal for 8-bit values? 1562 [=](const LegalityQuery &Query) { 1563 const LLT Ty = Query.Types[LitTyIdx]; 1564 return Ty.getSizeInBits() < 32; 1565 }, 1566 changeTo(LitTyIdx, S32)); 1567 } 1568 1569 Builder.widenScalarIf( 1570 [=](const LegalityQuery &Query) { 1571 const LLT Ty = Query.Types[BigTyIdx]; 1572 return !isPowerOf2_32(Ty.getSizeInBits()) && 1573 Ty.getSizeInBits() % 16 != 0; 1574 }, 1575 [=](const LegalityQuery &Query) { 1576 // Pick the next power of 2, or a multiple of 64 over 128. 1577 // Whichever is smaller. 1578 const LLT &Ty = Query.Types[BigTyIdx]; 1579 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1580 if (NewSizeInBits >= 256) { 1581 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1582 if (RoundedTo < NewSizeInBits) 1583 NewSizeInBits = RoundedTo; 1584 } 1585 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1586 }) 1587 // Any vectors left are the wrong size. Scalarize them. 1588 .scalarize(0) 1589 .scalarize(1); 1590 } 1591 1592 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1593 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
return legalizeInsertVectorElt(MI, MRI, B); 1704 case TargetOpcode::G_SHUFFLE_VECTOR: 1705 return legalizeShuffleVector(MI, MRI, B); 1706 case TargetOpcode::G_FSIN: 1707 case TargetOpcode::G_FCOS: 1708 return legalizeSinCos(MI, MRI, B); 1709 case TargetOpcode::G_GLOBAL_VALUE: 1710 return legalizeGlobalValue(MI, MRI, B); 1711 case TargetOpcode::G_LOAD: 1712 case TargetOpcode::G_SEXTLOAD: 1713 case TargetOpcode::G_ZEXTLOAD: 1714 return legalizeLoad(Helper, MI); 1715 case TargetOpcode::G_FMAD: 1716 return legalizeFMad(MI, MRI, B); 1717 case TargetOpcode::G_FDIV: 1718 return legalizeFDIV(MI, MRI, B); 1719 case TargetOpcode::G_UDIV: 1720 case TargetOpcode::G_UREM: 1721 case TargetOpcode::G_UDIVREM: 1722 return legalizeUnsignedDIV_REM(MI, MRI, B); 1723 case TargetOpcode::G_SDIV: 1724 case TargetOpcode::G_SREM: 1725 case TargetOpcode::G_SDIVREM: 1726 return legalizeSignedDIV_REM(MI, MRI, B); 1727 case TargetOpcode::G_ATOMIC_CMPXCHG: 1728 return legalizeAtomicCmpXChg(MI, MRI, B); 1729 case TargetOpcode::G_FLOG: 1730 return legalizeFlog(MI, B, numbers::ln2f); 1731 case TargetOpcode::G_FLOG10: 1732 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1733 case TargetOpcode::G_FEXP: 1734 return legalizeFExp(MI, B); 1735 case TargetOpcode::G_FPOW: 1736 return legalizeFPow(MI, B); 1737 case TargetOpcode::G_FFLOOR: 1738 return legalizeFFloor(MI, MRI, B); 1739 case TargetOpcode::G_BUILD_VECTOR: 1740 return legalizeBuildVector(MI, MRI, B); 1741 default: 1742 return false; 1743 } 1744 1745 llvm_unreachable("expected switch to return"); 1746 } 1747 1748 Register AMDGPULegalizerInfo::getSegmentAperture( 1749 unsigned AS, 1750 MachineRegisterInfo &MRI, 1751 MachineIRBuilder &B) const { 1752 MachineFunction &MF = B.getMF(); 1753 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1754 const LLT S32 = LLT::scalar(32); 1755 1756 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1757 1758 if (ST.hasApertureRegs()) { 1759 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1760 // getreg. 1761 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1762 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1763 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1764 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1765 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1766 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1767 unsigned Encoding = 1768 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1769 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1770 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1771 1772 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1773 1774 B.buildInstr(AMDGPU::S_GETREG_B32) 1775 .addDef(GetReg) 1776 .addImm(Encoding); 1777 MRI.setType(GetReg, S32); 1778 1779 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1780 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1781 } 1782 1783 Register QueuePtr = MRI.createGenericVirtualRegister( 1784 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1785 1786 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1787 return Register(); 1788 1789 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1790 // private_segment_aperture_base_hi. 1791 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1792 1793 // TODO: can we be smarter about machine pointer info? 
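  // The access built below is roughly:
  //   %addr = G_PTR_ADD %queue_ptr, StructOffset
  //   %aperture_hi = G_LOAD %addr :: (dereferenceable invariant load 4)
  // i.e. a single 32-bit load of the aperture base high half out of the queue
  // descriptor.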
1794 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1795 MachineMemOperand *MMO = MF.getMachineMemOperand( 1796 PtrInfo, 1797 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1798 MachineMemOperand::MOInvariant, 1799 4, commonAlignment(Align(64), StructOffset)); 1800 1801 Register LoadAddr; 1802 1803 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1804 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1805 } 1806 1807 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1808 MachineInstr &MI, MachineRegisterInfo &MRI, 1809 MachineIRBuilder &B) const { 1810 MachineFunction &MF = B.getMF(); 1811 1812 const LLT S32 = LLT::scalar(32); 1813 Register Dst = MI.getOperand(0).getReg(); 1814 Register Src = MI.getOperand(1).getReg(); 1815 1816 LLT DstTy = MRI.getType(Dst); 1817 LLT SrcTy = MRI.getType(Src); 1818 unsigned DestAS = DstTy.getAddressSpace(); 1819 unsigned SrcAS = SrcTy.getAddressSpace(); 1820 1821 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1822 // vector element. 1823 assert(!DstTy.isVector()); 1824 1825 const AMDGPUTargetMachine &TM 1826 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1827 1828 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1829 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1830 return true; 1831 } 1832 1833 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1834 // Truncate. 1835 B.buildExtract(Dst, Src, 0); 1836 MI.eraseFromParent(); 1837 return true; 1838 } 1839 1840 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1841 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1842 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1843 1844 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1845 // another. Merge operands are required to be the same type, but creating an 1846 // extra ptrtoint would be kind of pointless. 1847 auto HighAddr = B.buildConstant( 1848 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1849 B.buildMerge(Dst, {Src, HighAddr}); 1850 MI.eraseFromParent(); 1851 return true; 1852 } 1853 1854 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1855 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1856 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1857 unsigned NullVal = TM.getNullPointerValue(DestAS); 1858 1859 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1860 auto FlatNull = B.buildConstant(SrcTy, 0); 1861 1862 // Extract low 32-bits of the pointer. 1863 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1864 1865 auto CmpRes = 1866 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1867 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1868 1869 MI.eraseFromParent(); 1870 return true; 1871 } 1872 1873 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1874 return false; 1875 1876 if (!ST.hasFlatAddressSpace()) 1877 return false; 1878 1879 auto SegmentNull = 1880 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1881 auto FlatNull = 1882 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1883 1884 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1885 if (!ApertureReg.isValid()) 1886 return false; 1887 1888 auto CmpRes = 1889 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1890 1891 // Coerce the type of the low half of the result so we can use merge_values. 
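  // For example, a local/private -> flat cast ends up roughly as:
  //   %lo  = G_PTRTOINT %src                ; 32-bit segment offset
  //   %ptr = G_MERGE_VALUES %lo, %aperture  ; 64-bit flat pointer
  //   %dst = G_SELECT (%src != segment-null), %ptr, flat-null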
1892 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1893 1894 // TODO: Should we allow mismatched types but matching sizes in merges to 1895 // avoid the ptrtoint? 1896 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1897 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1898 1899 MI.eraseFromParent(); 1900 return true; 1901 } 1902 1903 bool AMDGPULegalizerInfo::legalizeFrint( 1904 MachineInstr &MI, MachineRegisterInfo &MRI, 1905 MachineIRBuilder &B) const { 1906 Register Src = MI.getOperand(1).getReg(); 1907 LLT Ty = MRI.getType(Src); 1908 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1909 1910 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1911 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1912 1913 auto C1 = B.buildFConstant(Ty, C1Val); 1914 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1915 1916 // TODO: Should this propagate fast-math-flags? 1917 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1918 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1919 1920 auto C2 = B.buildFConstant(Ty, C2Val); 1921 auto Fabs = B.buildFAbs(Ty, Src); 1922 1923 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1924 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1925 MI.eraseFromParent(); 1926 return true; 1927 } 1928 1929 bool AMDGPULegalizerInfo::legalizeFceil( 1930 MachineInstr &MI, MachineRegisterInfo &MRI, 1931 MachineIRBuilder &B) const { 1932 1933 const LLT S1 = LLT::scalar(1); 1934 const LLT S64 = LLT::scalar(64); 1935 1936 Register Src = MI.getOperand(1).getReg(); 1937 assert(MRI.getType(Src) == S64); 1938 1939 // result = trunc(src) 1940 // if (src > 0.0 && src != result) 1941 // result += 1.0 1942 1943 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1944 1945 const auto Zero = B.buildFConstant(S64, 0.0); 1946 const auto One = B.buildFConstant(S64, 1.0); 1947 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1948 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1949 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1950 auto Add = B.buildSelect(S64, And, One, Zero); 1951 1952 // TODO: Should this propagate fast-math-flags? 
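  // Worked example: for src = 2.3, trunc = 2.0; src > 0.0 and src != trunc, so
  // 1.0 is selected and the result is 3.0. For src = -2.3 the compare against
  // 0.0 fails, 0.0 is selected, and the result stays -2.0.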
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2051 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2052 B.buildUITOFP(S64, Unmerge.getReg(1)); 2053 2054 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2055 2056 auto ThirtyTwo = B.buildConstant(S32, 32); 2057 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2058 .addUse(CvtHi.getReg(0)) 2059 .addUse(ThirtyTwo.getReg(0)); 2060 2061 // TODO: Should this propagate fast-math-flags? 2062 B.buildFAdd(Dst, LdExp, CvtLo); 2063 MI.eraseFromParent(); 2064 return true; 2065 } 2066 2067 // TODO: Copied from DAG implementation. Verify logic and document how this 2068 // actually works. 2069 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2070 MachineRegisterInfo &MRI, 2071 MachineIRBuilder &B, 2072 bool Signed) const { 2073 2074 Register Dst = MI.getOperand(0).getReg(); 2075 Register Src = MI.getOperand(1).getReg(); 2076 2077 const LLT S64 = LLT::scalar(64); 2078 const LLT S32 = LLT::scalar(32); 2079 2080 const LLT SrcLT = MRI.getType(Src); 2081 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2082 2083 unsigned Flags = MI.getFlags(); 2084 2085 // The basic idea of converting a floating point number into a pair of 32-bit 2086 // integers is illustrated as follows: 2087 // 2088 // tf := trunc(val); 2089 // hif := floor(tf * 2^-32); 2090 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2091 // hi := fptoi(hif); 2092 // lo := fptoi(lof); 2093 // 2094 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2095 MachineInstrBuilder Sign; 2096 if (Signed && SrcLT == S32) { 2097 // However, a 32-bit floating point number has only 23 bits mantissa and 2098 // it's not enough to hold all the significant bits of `lof` if val is 2099 // negative. To avoid the loss of precision, We need to take the absolute 2100 // value after truncating and flip the result back based on the original 2101 // signedness. 2102 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2103 Trunc = B.buildFAbs(S32, Trunc, Flags); 2104 } 2105 MachineInstrBuilder K0, K1; 2106 if (SrcLT == S64) { 2107 K0 = B.buildFConstant(S64, 2108 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2109 K1 = B.buildFConstant(S64, 2110 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2111 } else { 2112 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2113 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2114 } 2115 2116 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2117 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2118 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2119 2120 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2121 : B.buildFPTOUI(S32, FloorMul); 2122 auto Lo = B.buildFPTOUI(S32, Fma); 2123 2124 if (Signed && SrcLT == S32) { 2125 // Flip the result based on the signedness, which is either all 0s or 1s. 
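    // This is the usual conditional-negate identity r = (x ^ sign) - sign,
    // applied to the {lo, hi} pair; sign is 0 for non-negative inputs and -1
    // (all ones) for negative ones.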
2126 Sign = B.buildMerge(S64, {Sign, Sign}); 2127 // r := xor({lo, hi}, sign) - sign; 2128 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2129 } else 2130 B.buildMerge(Dst, {Lo, Hi}); 2131 MI.eraseFromParent(); 2132 2133 return true; 2134 } 2135 2136 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2137 MachineInstr &MI) const { 2138 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2139 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2140 2141 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2142 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2143 2144 // With ieee_mode disabled, the instructions have the correct behavior 2145 // already for G_FMINNUM/G_FMAXNUM 2146 if (!MFI->getMode().IEEE) 2147 return !IsIEEEOp; 2148 2149 if (IsIEEEOp) 2150 return true; 2151 2152 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2153 } 2154 2155 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2156 MachineInstr &MI, MachineRegisterInfo &MRI, 2157 MachineIRBuilder &B) const { 2158 // TODO: Should move some of this into LegalizerHelper. 2159 2160 // TODO: Promote dynamic indexing of s16 to s32 2161 2162 // FIXME: Artifact combiner probably should have replaced the truncated 2163 // constant before this, so we shouldn't need 2164 // getConstantVRegValWithLookThrough. 2165 Optional<ValueAndVReg> MaybeIdxVal = 2166 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2167 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2168 return true; 2169 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2170 2171 Register Dst = MI.getOperand(0).getReg(); 2172 Register Vec = MI.getOperand(1).getReg(); 2173 2174 LLT VecTy = MRI.getType(Vec); 2175 LLT EltTy = VecTy.getElementType(); 2176 assert(EltTy == MRI.getType(Dst)); 2177 2178 if (IdxVal < VecTy.getNumElements()) 2179 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2180 else 2181 B.buildUndef(Dst); 2182 2183 MI.eraseFromParent(); 2184 return true; 2185 } 2186 2187 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2188 MachineInstr &MI, MachineRegisterInfo &MRI, 2189 MachineIRBuilder &B) const { 2190 // TODO: Should move some of this into LegalizerHelper. 2191 2192 // TODO: Promote dynamic indexing of s16 to s32 2193 2194 // FIXME: Artifact combiner probably should have replaced the truncated 2195 // constant before this, so we shouldn't need 2196 // getConstantVRegValWithLookThrough. 2197 Optional<ValueAndVReg> MaybeIdxVal = 2198 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2199 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2200 return true; 2201 2202 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2203 Register Dst = MI.getOperand(0).getReg(); 2204 Register Vec = MI.getOperand(1).getReg(); 2205 Register Ins = MI.getOperand(2).getReg(); 2206 2207 LLT VecTy = MRI.getType(Vec); 2208 LLT EltTy = VecTy.getElementType(); 2209 assert(EltTy == MRI.getType(Ins)); 2210 2211 if (IdxVal < VecTy.getNumElements()) 2212 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2213 else 2214 B.buildUndef(Dst); 2215 2216 MI.eraseFromParent(); 2217 return true; 2218 } 2219 2220 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2221 MachineInstr &MI, MachineRegisterInfo &MRI, 2222 MachineIRBuilder &B) const { 2223 const LLT V2S16 = LLT::vector(2, 16); 2224 2225 Register Dst = MI.getOperand(0).getReg(); 2226 Register Src0 = MI.getOperand(1).getReg(); 2227 LLT DstTy = MRI.getType(Dst); 2228 LLT SrcTy = MRI.getType(Src0); 2229 2230 if (SrcTy == V2S16 && DstTy == V2S16 && 2231 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2232 return true; 2233 2234 MachineIRBuilder HelperBuilder(MI); 2235 GISelObserverWrapper DummyObserver; 2236 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2237 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2238 } 2239 2240 bool AMDGPULegalizerInfo::legalizeSinCos( 2241 MachineInstr &MI, MachineRegisterInfo &MRI, 2242 MachineIRBuilder &B) const { 2243 2244 Register DstReg = MI.getOperand(0).getReg(); 2245 Register SrcReg = MI.getOperand(1).getReg(); 2246 LLT Ty = MRI.getType(DstReg); 2247 unsigned Flags = MI.getFlags(); 2248 2249 Register TrigVal; 2250 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2251 if (ST.hasTrigReducedRange()) { 2252 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2253 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2254 .addUse(MulVal.getReg(0)) 2255 .setMIFlags(Flags).getReg(0); 2256 } else 2257 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2258 2259 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2260 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2261 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2262 .addUse(TrigVal) 2263 .setMIFlags(Flags); 2264 MI.eraseFromParent(); 2265 return true; 2266 } 2267 2268 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2269 MachineIRBuilder &B, 2270 const GlobalValue *GV, 2271 int64_t Offset, 2272 unsigned GAFlags) const { 2273 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2274 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2275 // to the following code sequence: 2276 // 2277 // For constant address space: 2278 // s_getpc_b64 s[0:1] 2279 // s_add_u32 s0, s0, $symbol 2280 // s_addc_u32 s1, s1, 0 2281 // 2282 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2283 // a fixup or relocation is emitted to replace $symbol with a literal 2284 // constant, which is a pc-relative offset from the encoding of the $symbol 2285 // operand to the global variable. 
2286 // 2287 // For global address space: 2288 // s_getpc_b64 s[0:1] 2289 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2290 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2291 // 2292 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2293 // fixups or relocations are emitted to replace $symbol@*@lo and 2294 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2295 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2296 // operand to the global variable. 2297 // 2298 // What we want here is an offset from the value returned by s_getpc 2299 // (which is the address of the s_add_u32 instruction) to the global 2300 // variable, but since the encoding of $symbol starts 4 bytes after the start 2301 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2302 // small. This requires us to add 4 to the global variable offset in order to 2303 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2304 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2305 // instruction. 2306 2307 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2308 2309 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2310 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2311 2312 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2313 .addDef(PCReg); 2314 2315 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2316 if (GAFlags == SIInstrInfo::MO_NONE) 2317 MIB.addImm(0); 2318 else 2319 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2320 2321 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2322 2323 if (PtrTy.getSizeInBits() == 32) 2324 B.buildExtract(DstReg, PCReg, 0); 2325 return true; 2326 } 2327 2328 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2329 MachineInstr &MI, MachineRegisterInfo &MRI, 2330 MachineIRBuilder &B) const { 2331 Register DstReg = MI.getOperand(0).getReg(); 2332 LLT Ty = MRI.getType(DstReg); 2333 unsigned AS = Ty.getAddressSpace(); 2334 2335 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2336 MachineFunction &MF = B.getMF(); 2337 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2338 2339 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2340 if (!MFI->isModuleEntryFunction() && 2341 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2342 const Function &Fn = MF.getFunction(); 2343 DiagnosticInfoUnsupported BadLDSDecl( 2344 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2345 DS_Warning); 2346 Fn.getContext().diagnose(BadLDSDecl); 2347 2348 // We currently don't have a way to correctly allocate LDS objects that 2349 // aren't directly associated with a kernel. We do force inlining of 2350 // functions that use local objects. However, if these dead functions are 2351 // not eliminated, we don't want a compile time error. Just emit a warning 2352 // and a trap, since there should be no callable path here. 2353 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2354 B.buildUndef(DstReg); 2355 MI.eraseFromParent(); 2356 return true; 2357 } 2358 2359 // TODO: We could emit code to handle the initialization somewhere. 
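    // The cases below either keep the G_GLOBAL_VALUE with an absolute-address
    // relocation, materialize the dynamic LDS offset via
    // llvm.amdgcn.groupstaticsize, or fold the statically assigned LDS offset
    // to a constant.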
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
        // zero-sized type in other languages, to declare dynamic shared memory
        // whose size is not known at compile time. These arrays are allocated
        // by the runtime and placed directly after the statically allocated
        // ones, so they all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
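    // The GOT slot holds the absolute 64-bit address of the global, so the
    // load below resolves the indirection and the extract keeps the low half
    // for the 32-bit pointer type.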
2427 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2428 B.buildExtract(DstReg, Load, 0); 2429 } else 2430 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2431 2432 MI.eraseFromParent(); 2433 return true; 2434 } 2435 2436 static LLT widenToNextPowerOf2(LLT Ty) { 2437 if (Ty.isVector()) 2438 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2439 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2440 } 2441 2442 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2443 MachineInstr &MI) const { 2444 MachineIRBuilder &B = Helper.MIRBuilder; 2445 MachineRegisterInfo &MRI = *B.getMRI(); 2446 GISelChangeObserver &Observer = Helper.Observer; 2447 2448 Register PtrReg = MI.getOperand(1).getReg(); 2449 LLT PtrTy = MRI.getType(PtrReg); 2450 unsigned AddrSpace = PtrTy.getAddressSpace(); 2451 2452 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2453 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2454 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2455 Observer.changingInstr(MI); 2456 MI.getOperand(1).setReg(Cast.getReg(0)); 2457 Observer.changedInstr(MI); 2458 return true; 2459 } 2460 2461 if (MI.getOpcode() != AMDGPU::G_LOAD) 2462 return false; 2463 2464 Register ValReg = MI.getOperand(0).getReg(); 2465 LLT ValTy = MRI.getType(ValReg); 2466 2467 MachineMemOperand *MMO = *MI.memoperands_begin(); 2468 const unsigned ValSize = ValTy.getSizeInBits(); 2469 const unsigned MemSize = 8 * MMO->getSize(); 2470 const Align MemAlign = MMO->getAlign(); 2471 const unsigned AlignInBits = 8 * MemAlign.value(); 2472 2473 // Widen non-power-of-2 loads to the alignment if needed 2474 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2475 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2476 2477 // This was already the correct extending load result type, so just adjust 2478 // the memory type. 2479 if (WideMemSize == ValSize) { 2480 MachineFunction &MF = B.getMF(); 2481 2482 MachineMemOperand *WideMMO = 2483 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2484 Observer.changingInstr(MI); 2485 MI.setMemRefs(MF, {WideMMO}); 2486 Observer.changedInstr(MI); 2487 return true; 2488 } 2489 2490 // Don't bother handling edge case that should probably never be produced. 2491 if (ValSize > WideMemSize) 2492 return false; 2493 2494 LLT WideTy = widenToNextPowerOf2(ValTy); 2495 2496 Register WideLoad; 2497 if (!WideTy.isVector()) { 2498 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2499 B.buildTrunc(ValReg, WideLoad).getReg(0); 2500 } else { 2501 // Extract the subvector. 2502 2503 if (isRegisterType(ValTy)) { 2504 // If this a case where G_EXTRACT is legal, use it. 2505 // (e.g. <3 x s32> -> <4 x s32>) 2506 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2507 B.buildExtract(ValReg, WideLoad, 0); 2508 } else { 2509 // For cases where the widened type isn't a nice register value, unmerge 2510 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2511 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2512 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2513 B.setInsertPt(B.getMBB(), MI.getIterator()); 2514 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2515 } 2516 } 2517 2518 MI.eraseFromParent(); 2519 return true; 2520 } 2521 2522 return false; 2523 } 2524 2525 bool AMDGPULegalizerInfo::legalizeFMad( 2526 MachineInstr &MI, MachineRegisterInfo &MRI, 2527 MachineIRBuilder &B) const { 2528 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2529 assert(Ty.isScalar()); 2530 2531 MachineFunction &MF = B.getMF(); 2532 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2533 2534 // TODO: Always legal with future ftz flag. 2535 // FIXME: Do we need just output? 2536 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2537 return true; 2538 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2539 return true; 2540 2541 MachineIRBuilder HelperBuilder(MI); 2542 GISelObserverWrapper DummyObserver; 2543 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2544 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2545 } 2546 2547 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2548 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2549 Register DstReg = MI.getOperand(0).getReg(); 2550 Register PtrReg = MI.getOperand(1).getReg(); 2551 Register CmpVal = MI.getOperand(2).getReg(); 2552 Register NewVal = MI.getOperand(3).getReg(); 2553 2554 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2555 "this should not have been custom lowered"); 2556 2557 LLT ValTy = MRI.getType(CmpVal); 2558 LLT VecTy = LLT::vector(2, ValTy); 2559 2560 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2561 2562 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2563 .addDef(DstReg) 2564 .addUse(PtrReg) 2565 .addUse(PackedVal) 2566 .setMemRefs(MI.memoperands()); 2567 2568 MI.eraseFromParent(); 2569 return true; 2570 } 2571 2572 bool AMDGPULegalizerInfo::legalizeFlog( 2573 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2574 Register Dst = MI.getOperand(0).getReg(); 2575 Register Src = MI.getOperand(1).getReg(); 2576 LLT Ty = B.getMRI()->getType(Dst); 2577 unsigned Flags = MI.getFlags(); 2578 2579 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2580 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2581 2582 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2583 MI.eraseFromParent(); 2584 return true; 2585 } 2586 2587 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2588 MachineIRBuilder &B) const { 2589 Register Dst = MI.getOperand(0).getReg(); 2590 Register Src = MI.getOperand(1).getReg(); 2591 unsigned Flags = MI.getFlags(); 2592 LLT Ty = B.getMRI()->getType(Dst); 2593 2594 auto K = B.buildFConstant(Ty, numbers::log2e); 2595 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2596 B.buildFExp2(Dst, Mul, Flags); 2597 MI.eraseFromParent(); 2598 return true; 2599 } 2600 2601 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2602 MachineIRBuilder &B) const { 2603 Register Dst = MI.getOperand(0).getReg(); 2604 Register Src0 = MI.getOperand(1).getReg(); 2605 Register Src1 = MI.getOperand(2).getReg(); 2606 unsigned Flags = MI.getFlags(); 2607 LLT Ty = B.getMRI()->getType(Dst); 2608 const LLT S16 = LLT::scalar(16); 2609 const LLT S32 = LLT::scalar(32); 2610 2611 if (Ty == S32) { 2612 auto Log = B.buildFLog2(S32, Src0, Flags); 2613 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2614 .addUse(Log.getReg(0)) 2615 .addUse(Src1) 2616 .setMIFlags(Flags); 2617 B.buildFExp2(Dst, Mul, Flags); 2618 } else if (Ty == S16) { 2619 // There's no f16 fmul_legacy, so we need to convert for it. 2620 auto Log = B.buildFLog2(S16, Src0, Flags); 2621 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2622 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2623 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2624 .addUse(Ext0.getReg(0)) 2625 .addUse(Ext1.getReg(0)) 2626 .setMIFlags(Flags); 2627 2628 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2629 } else 2630 return false; 2631 2632 MI.eraseFromParent(); 2633 return true; 2634 } 2635 2636 // Find a source register, ignoring any possible source modifiers. 2637 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2638 Register ModSrc = OrigSrc; 2639 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2640 ModSrc = SrcFNeg->getOperand(1).getReg(); 2641 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2642 ModSrc = SrcFAbs->getOperand(1).getReg(); 2643 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2644 ModSrc = SrcFAbs->getOperand(1).getReg(); 2645 return ModSrc; 2646 } 2647 2648 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2649 MachineRegisterInfo &MRI, 2650 MachineIRBuilder &B) const { 2651 2652 const LLT S1 = LLT::scalar(1); 2653 const LLT S64 = LLT::scalar(64); 2654 Register Dst = MI.getOperand(0).getReg(); 2655 Register OrigSrc = MI.getOperand(1).getReg(); 2656 unsigned Flags = MI.getFlags(); 2657 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2658 "this should not have been custom lowered"); 2659 2660 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2661 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2662 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2663 // V_FRACT bug is: 2664 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2665 // 2666 // Convert floor(x) to (x - fract(x)) 2667 2668 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2669 .addUse(OrigSrc) 2670 .setMIFlags(Flags); 2671 2672 // Give source modifier matching some assistance before obscuring a foldable 2673 // pattern. 2674 2675 // TODO: We can avoid the neg on the fract? The input sign to fract 2676 // shouldn't matter? 2677 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2678 2679 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2680 2681 Register Min = MRI.createGenericVirtualRegister(S64); 2682 2683 // We don't need to concern ourselves with the snan handling difference, so 2684 // use the one which will directly select. 
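  // Note that 0x3fefffffffffffff is 0x1.fffffffffffffp-1, the largest double
  // strictly less than 1.0, matching the 0.99999999999999999 clamp mentioned
  // above.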
2685 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2686 if (MFI->getMode().IEEE) 2687 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2688 else 2689 B.buildFMinNum(Min, Fract, Const, Flags); 2690 2691 Register CorrectedFract = Min; 2692 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2693 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2694 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2695 } 2696 2697 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2698 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2699 2700 MI.eraseFromParent(); 2701 return true; 2702 } 2703 2704 // Turn an illegal packed v2s16 build vector into bit operations. 2705 // TODO: This should probably be a bitcast action in LegalizerHelper. 2706 bool AMDGPULegalizerInfo::legalizeBuildVector( 2707 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2708 Register Dst = MI.getOperand(0).getReg(); 2709 const LLT S32 = LLT::scalar(32); 2710 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2711 2712 Register Src0 = MI.getOperand(1).getReg(); 2713 Register Src1 = MI.getOperand(2).getReg(); 2714 assert(MRI.getType(Src0) == LLT::scalar(16)); 2715 2716 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2717 B.buildBitcast(Dst, Merge); 2718 2719 MI.eraseFromParent(); 2720 return true; 2721 } 2722 2723 // Check that this is a G_XOR x, -1 2724 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2725 if (MI.getOpcode() != TargetOpcode::G_XOR) 2726 return false; 2727 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2728 return ConstVal && *ConstVal == -1; 2729 } 2730 2731 // Return the use branch instruction, otherwise null if the usage is invalid. 2732 static MachineInstr * 2733 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2734 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2735 Register CondDef = MI.getOperand(0).getReg(); 2736 if (!MRI.hasOneNonDBGUse(CondDef)) 2737 return nullptr; 2738 2739 MachineBasicBlock *Parent = MI.getParent(); 2740 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2741 2742 if (isNot(MRI, *UseMI)) { 2743 Register NegatedCond = UseMI->getOperand(0).getReg(); 2744 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2745 return nullptr; 2746 2747 // We're deleting the def of this value, so we need to remove it. 2748 UseMI->eraseFromParent(); 2749 2750 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2751 Negated = true; 2752 } 2753 2754 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2755 return nullptr; 2756 2757 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2758 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2759 if (Next == Parent->end()) { 2760 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2761 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2762 return nullptr; 2763 UncondBrTarget = &*NextMBB; 2764 } else { 2765 if (Next->getOpcode() != AMDGPU::G_BR) 2766 return nullptr; 2767 Br = &*Next; 2768 UncondBrTarget = Br->getOperand(0).getMBB(); 2769 } 2770 2771 return UseMI; 2772 } 2773 2774 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2775 const ArgDescriptor *Arg, 2776 const TargetRegisterClass *ArgRC, 2777 LLT ArgTy) const { 2778 MCRegister SrcReg = Arg->getRegister(); 2779 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2780 assert(DstReg.isVirtual() && "Virtual register expected"); 2781 2782 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2783 ArgTy); 2784 if (Arg->isMasked()) { 2785 // TODO: Should we try to emit this once in the entry block? 2786 const LLT S32 = LLT::scalar(32); 2787 const unsigned Mask = Arg->getMask(); 2788 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2789 2790 Register AndMaskSrc = LiveIn; 2791 2792 if (Shift != 0) { 2793 auto ShiftAmt = B.buildConstant(S32, Shift); 2794 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2795 } 2796 2797 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2798 } else { 2799 B.buildCopy(DstReg, LiveIn); 2800 } 2801 2802 return true; 2803 } 2804 2805 bool AMDGPULegalizerInfo::loadInputValue( 2806 Register DstReg, MachineIRBuilder &B, 2807 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2808 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2809 const ArgDescriptor *Arg; 2810 const TargetRegisterClass *ArgRC; 2811 LLT ArgTy; 2812 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2813 2814 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2815 return false; // TODO: Handle these 2816 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2817 } 2818 2819 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2820 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2821 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2822 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2823 return false; 2824 2825 MI.eraseFromParent(); 2826 return true; 2827 } 2828 2829 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2830 MachineRegisterInfo &MRI, 2831 MachineIRBuilder &B) const { 2832 Register Dst = MI.getOperand(0).getReg(); 2833 LLT DstTy = MRI.getType(Dst); 2834 LLT S16 = LLT::scalar(16); 2835 LLT S32 = LLT::scalar(32); 2836 LLT S64 = LLT::scalar(64); 2837 2838 if (DstTy == S16) 2839 return legalizeFDIV16(MI, MRI, B); 2840 if (DstTy == S32) 2841 return legalizeFDIV32(MI, MRI, B); 2842 if (DstTy == S64) 2843 return legalizeFDIV64(MI, MRI, B); 2844 2845 return false; 2846 } 2847 2848 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2849 Register DstDivReg, 2850 Register DstRemReg, 2851 Register X, 2852 Register Y) const { 2853 const LLT S1 = LLT::scalar(1); 2854 const LLT S32 = LLT::scalar(32); 2855 2856 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2857 // algorithm used here. 2858 2859 // Initial estimate of inv(y). 2860 auto FloatY = B.buildUITOFP(S32, Y); 2861 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2862 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2863 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2864 auto Z = B.buildFPTOUI(S32, ScaledY); 2865 2866 // One round of UNR. 
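  // i.e. one Newton-Raphson refinement of z ~= 2^32/y (1/y in 0.32 fixed
  // point):
  //   z = z + umulh(z, -y * z)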
2867 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2868 auto NegYZ = B.buildMul(S32, NegY, Z); 2869 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2870 2871 // Quotient/remainder estimate. 2872 auto Q = B.buildUMulH(S32, X, Z); 2873 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2874 2875 // First quotient/remainder refinement. 2876 auto One = B.buildConstant(S32, 1); 2877 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2878 if (DstDivReg) 2879 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2880 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2881 2882 // Second quotient/remainder refinement. 2883 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2884 if (DstDivReg) 2885 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2886 2887 if (DstRemReg) 2888 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2889 } 2890 2891 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2892 // 2893 // Return lo, hi of result 2894 // 2895 // %cvt.lo = G_UITOFP Val.lo 2896 // %cvt.hi = G_UITOFP Val.hi 2897 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2898 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2899 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2900 // %mul2 = G_FMUL %mul1, 2**(-32) 2901 // %trunc = G_INTRINSIC_TRUNC %mul2 2902 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2903 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2904 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2905 Register Val) { 2906 const LLT S32 = LLT::scalar(32); 2907 auto Unmerge = B.buildUnmerge(S32, Val); 2908 2909 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2910 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2911 2912 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2913 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2914 2915 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2916 auto Mul1 = 2917 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2918 2919 // 2**(-32) 2920 auto Mul2 = 2921 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2922 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2923 2924 // -(2**32) 2925 auto Mad2 = B.buildFMAD(S32, Trunc, 2926 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2927 2928 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2929 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2930 2931 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2932 } 2933 2934 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2935 Register DstDivReg, 2936 Register DstRemReg, 2937 Register Numer, 2938 Register Denom) const { 2939 const LLT S32 = LLT::scalar(32); 2940 const LLT S64 = LLT::scalar(64); 2941 const LLT S1 = LLT::scalar(1); 2942 Register RcpLo, RcpHi; 2943 2944 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2945 2946 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2947 2948 auto Zero64 = B.buildConstant(S64, 0); 2949 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2950 2951 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2952 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2953 2954 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2955 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2956 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2957 2958 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2959 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2960 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2961 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2962 2963 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2964 
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2965 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2966 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2967 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2968 2969 auto Zero32 = B.buildConstant(S32, 0); 2970 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2971 auto Add2_HiC = 2972 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2973 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2974 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2975 2976 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2977 Register NumerLo = UnmergeNumer.getReg(0); 2978 Register NumerHi = UnmergeNumer.getReg(1); 2979 2980 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2981 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2982 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2983 Register Mul3_Lo = UnmergeMul3.getReg(0); 2984 Register Mul3_Hi = UnmergeMul3.getReg(1); 2985 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2986 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2987 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2988 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2989 2990 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2991 Register DenomLo = UnmergeDenom.getReg(0); 2992 Register DenomHi = UnmergeDenom.getReg(1); 2993 2994 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2995 auto C1 = B.buildSExt(S32, CmpHi); 2996 2997 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2998 auto C2 = B.buildSExt(S32, CmpLo); 2999 3000 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3001 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3002 3003 // TODO: Here and below portions of the code can be enclosed into if/endif. 3004 // Currently control flow is unconditional and we have 4 selects after 3005 // potential endif to substitute PHIs. 3006 3007 // if C3 != 0 ... 
3008 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3009 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3010 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3011 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3012 3013 auto One64 = B.buildConstant(S64, 1); 3014 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3015 3016 auto C4 = 3017 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3018 auto C5 = 3019 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3020 auto C6 = B.buildSelect( 3021 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3022 3023 // if (C6 != 0) 3024 auto Add4 = B.buildAdd(S64, Add3, One64); 3025 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3026 3027 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3028 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3029 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3030 3031 // endif C6 3032 // endif C3 3033 3034 if (DstDivReg) { 3035 auto Sel1 = B.buildSelect( 3036 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3037 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3038 Sel1, MulHi3); 3039 } 3040 3041 if (DstRemReg) { 3042 auto Sel2 = B.buildSelect( 3043 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3044 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3045 Sel2, Sub1); 3046 } 3047 } 3048 3049 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3050 MachineRegisterInfo &MRI, 3051 MachineIRBuilder &B) const { 3052 Register DstDivReg, DstRemReg; 3053 switch (MI.getOpcode()) { 3054 default: 3055 llvm_unreachable("Unexpected opcode!"); 3056 case AMDGPU::G_UDIV: { 3057 DstDivReg = MI.getOperand(0).getReg(); 3058 break; 3059 } 3060 case AMDGPU::G_UREM: { 3061 DstRemReg = MI.getOperand(0).getReg(); 3062 break; 3063 } 3064 case AMDGPU::G_UDIVREM: { 3065 DstDivReg = MI.getOperand(0).getReg(); 3066 DstRemReg = MI.getOperand(1).getReg(); 3067 break; 3068 } 3069 } 3070 3071 const LLT S64 = LLT::scalar(64); 3072 const LLT S32 = LLT::scalar(32); 3073 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3074 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3075 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3076 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3077 3078 if (Ty == S32) 3079 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3080 else if (Ty == S64) 3081 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3082 else 3083 return false; 3084 3085 MI.eraseFromParent(); 3086 return true; 3087 } 3088 3089 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3090 MachineRegisterInfo &MRI, 3091 MachineIRBuilder &B) const { 3092 const LLT S64 = LLT::scalar(64); 3093 const LLT S32 = LLT::scalar(32); 3094 3095 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3096 if (Ty != S32 && Ty != S64) 3097 return false; 3098 3099 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3100 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3101 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3102 3103 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3104 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3105 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3106 3107 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3108 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3109 3110 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3111 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3112 3113 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3114 switch (MI.getOpcode()) { 3115 default: 3116 llvm_unreachable("Unexpected opcode!"); 3117 case AMDGPU::G_SDIV: { 3118 DstDivReg = MI.getOperand(0).getReg(); 3119 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3120 break; 3121 } 3122 case AMDGPU::G_SREM: { 3123 DstRemReg = MI.getOperand(0).getReg(); 3124 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3125 break; 3126 } 3127 case AMDGPU::G_SDIVREM: { 3128 DstDivReg = MI.getOperand(0).getReg(); 3129 DstRemReg = MI.getOperand(1).getReg(); 3130 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3131 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3132 break; 3133 } 3134 } 3135 3136 if (Ty == S32) 3137 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3138 else 3139 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3140 3141 if (DstDivReg) { 3142 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3143 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3144 B.buildSub(DstDivReg, SignXor, Sign); 3145 } 3146 3147 if (DstRemReg) { 3148 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3149 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3150 B.buildSub(DstRemReg, SignXor, Sign); 3151 } 3152 3153 MI.eraseFromParent(); 3154 return true; 3155 } 3156 3157 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3158 MachineRegisterInfo &MRI, 3159 MachineIRBuilder &B) const { 3160 Register Res = MI.getOperand(0).getReg(); 3161 Register LHS = MI.getOperand(1).getReg(); 3162 Register RHS = MI.getOperand(2).getReg(); 3163 uint16_t Flags = MI.getFlags(); 3164 LLT ResTy = MRI.getType(Res); 3165 3166 const MachineFunction &MF = B.getMF(); 3167 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3168 MI.getFlag(MachineInstr::FmAfn); 3169 3170 if (!AllowInaccurateRcp) 3171 return false; 3172 3173 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3174 // 1 / x -> RCP(x) 3175 if (CLHS->isExactlyValue(1.0)) { 3176 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3177 .addUse(RHS) 3178 .setMIFlags(Flags); 3179 3180 MI.eraseFromParent(); 3181 return true; 3182 } 3183 3184 // -1 / x -> RCP( FNEG(x) ) 3185 if (CLHS->isExactlyValue(-1.0)) { 3186 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3187 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3188 .addUse(FNeg.getReg(0)) 3189 .setMIFlags(Flags); 3190 3191 MI.eraseFromParent(); 3192 return true; 3193 } 3194 } 3195 3196 // x / y -> x * (1.0 / y) 3197 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3198 .addUse(RHS) 3199 .setMIFlags(Flags); 3200 B.buildFMul(Res, LHS, RCP, Flags); 3201 3202 MI.eraseFromParent(); 3203 return true; 3204 } 3205 3206 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3207 MachineRegisterInfo &MRI, 3208 MachineIRBuilder &B) const { 3209 Register Res = MI.getOperand(0).getReg(); 3210 Register X = MI.getOperand(1).getReg(); 3211 Register Y = MI.getOperand(2).getReg(); 3212 uint16_t Flags = MI.getFlags(); 3213 LLT ResTy = MRI.getType(Res); 3214 3215 const MachineFunction &MF = B.getMF(); 3216 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3217 MI.getFlag(MachineInstr::FmAfn); 3218 3219 if (!AllowInaccurateRcp) 3220 return false; 3221 3222 auto NegY = B.buildFNeg(ResTy, Y); 3223 auto One = B.buildFConstant(ResTy, 1.0); 3224 3225 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3226 .addUse(Y) 3227 .setMIFlags(Flags); 3228 3229 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3230 R = B.buildFMA(ResTy, Tmp0, R, R); 3231 3232 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3233 R = B.buildFMA(ResTy, Tmp1, R, R); 3234 3235 auto Ret = B.buildFMul(ResTy, X, R); 3236 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3237 3238 B.buildFMA(Res, Tmp2, R, Ret); 3239 MI.eraseFromParent(); 3240 return true; 3241 } 3242 3243 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3244 MachineRegisterInfo &MRI, 3245 MachineIRBuilder &B) const { 3246 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3247 return true; 3248 3249 Register Res = MI.getOperand(0).getReg(); 3250 Register LHS = MI.getOperand(1).getReg(); 3251 Register RHS = MI.getOperand(2).getReg(); 3252 3253 uint16_t Flags = MI.getFlags(); 3254 3255 LLT S16 = LLT::scalar(16); 3256 LLT S32 = LLT::scalar(32); 3257 3258 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3259 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3260 3261 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3262 .addUse(RHSExt.getReg(0)) 3263 .setMIFlags(Flags); 3264 3265 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3266 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3267 3268 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3269 .addUse(RDst.getReg(0)) 3270 .addUse(RHS) 3271 .addUse(LHS) 3272 .setMIFlags(Flags); 3273 3274 MI.eraseFromParent(); 3275 return true; 3276 } 3277 3278 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3279 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3280 static void toggleSPDenormMode(bool Enable, 3281 MachineIRBuilder &B, 3282 const GCNSubtarget &ST, 3283 AMDGPU::SIModeRegisterDefaults Mode) { 3284 // Set SP denorm mode to this value. 3285 unsigned SPDenormMode = 3286 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3287 3288 if (ST.hasDenormModeInst()) { 3289 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3290 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3291 3292 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3293 B.buildInstr(AMDGPU::S_DENORM_MODE) 3294 .addImm(NewDenormModeValue); 3295 3296 } else { 3297 // Select FP32 bit field in mode register. 
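    // This encodes hwreg(HW_REG_MODE, 4, 2), i.e. the two FP32 denorm control
    // bits (offset 4, width 2) of the MODE register.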
3298 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3299 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3300 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3301 3302 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3303 .addImm(SPDenormMode) 3304 .addImm(SPDenormModeBitField); 3305 } 3306 } 3307 3308 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3309 MachineRegisterInfo &MRI, 3310 MachineIRBuilder &B) const { 3311 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3312 return true; 3313 3314 Register Res = MI.getOperand(0).getReg(); 3315 Register LHS = MI.getOperand(1).getReg(); 3316 Register RHS = MI.getOperand(2).getReg(); 3317 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3318 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3319 3320 uint16_t Flags = MI.getFlags(); 3321 3322 LLT S32 = LLT::scalar(32); 3323 LLT S1 = LLT::scalar(1); 3324 3325 auto One = B.buildFConstant(S32, 1.0f); 3326 3327 auto DenominatorScaled = 3328 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3329 .addUse(LHS) 3330 .addUse(RHS) 3331 .addImm(0) 3332 .setMIFlags(Flags); 3333 auto NumeratorScaled = 3334 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3335 .addUse(LHS) 3336 .addUse(RHS) 3337 .addImm(1) 3338 .setMIFlags(Flags); 3339 3340 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3341 .addUse(DenominatorScaled.getReg(0)) 3342 .setMIFlags(Flags); 3343 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3344 3345 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3346 // aren't modeled as reading it. 3347 if (!Mode.allFP32Denormals()) 3348 toggleSPDenormMode(true, B, ST, Mode); 3349 3350 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3351 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3352 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3353 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3354 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3355 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3356 3357 if (!Mode.allFP32Denormals()) 3358 toggleSPDenormMode(false, B, ST, Mode); 3359 3360 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3361 .addUse(Fma4.getReg(0)) 3362 .addUse(Fma1.getReg(0)) 3363 .addUse(Fma3.getReg(0)) 3364 .addUse(NumeratorScaled.getReg(1)) 3365 .setMIFlags(Flags); 3366 3367 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3368 .addUse(Fmas.getReg(0)) 3369 .addUse(RHS) 3370 .addUse(LHS) 3371 .setMIFlags(Flags); 3372 3373 MI.eraseFromParent(); 3374 return true; 3375 } 3376 3377 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3378 MachineRegisterInfo &MRI, 3379 MachineIRBuilder &B) const { 3380 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3381 return true; 3382 3383 Register Res = MI.getOperand(0).getReg(); 3384 Register LHS = MI.getOperand(1).getReg(); 3385 Register RHS = MI.getOperand(2).getReg(); 3386 3387 uint16_t Flags = MI.getFlags(); 3388 3389 LLT S64 = LLT::scalar(64); 3390 LLT S1 = LLT::scalar(1); 3391 3392 auto One = B.buildFConstant(S64, 1.0); 3393 3394 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3395 .addUse(LHS) 3396 .addUse(RHS) 3397 .addImm(0) 3398 .setMIFlags(Flags); 3399 3400 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3401 3402 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3403 .addUse(DivScale0.getReg(0)) 3404 .setMIFlags(Flags); 3405 3406 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3407 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3408 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3409 3410 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3411 .addUse(LHS) 3412 .addUse(RHS) 3413 .addImm(1) 3414 .setMIFlags(Flags); 3415 3416 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3417 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3418 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3419 3420 Register Scale; 3421 if (!ST.hasUsableDivScaleConditionOutput()) { 3422 // Workaround a hardware bug on SI where the condition output from div_scale 3423 // is not usable. 3424 3425 LLT S32 = LLT::scalar(32); 3426 3427 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3428 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3429 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3430 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3431 3432 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3433 Scale1Unmerge.getReg(1)); 3434 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3435 Scale0Unmerge.getReg(1)); 3436 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3437 } else { 3438 Scale = DivScale1.getReg(1); 3439 } 3440 3441 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3442 .addUse(Fma4.getReg(0)) 3443 .addUse(Fma3.getReg(0)) 3444 .addUse(Mul.getReg(0)) 3445 .addUse(Scale) 3446 .setMIFlags(Flags); 3447 3448 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3449 .addUse(Fmas.getReg(0)) 3450 .addUse(RHS) 3451 .addUse(LHS) 3452 .setMIFlags(Flags); 3453 3454 MI.eraseFromParent(); 3455 return true; 3456 } 3457 3458 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3459 MachineRegisterInfo &MRI, 3460 MachineIRBuilder &B) const { 3461 Register Res = MI.getOperand(0).getReg(); 3462 Register LHS = MI.getOperand(2).getReg(); 3463 Register RHS = MI.getOperand(3).getReg(); 3464 uint16_t Flags = MI.getFlags(); 3465 3466 LLT S32 = LLT::scalar(32); 3467 LLT S1 = LLT::scalar(1); 3468 3469 auto Abs = B.buildFAbs(S32, RHS, Flags); 3470 const APFloat C0Val(1.0f); 3471 3472 auto C0 = B.buildConstant(S32, 0x6f800000); 3473 auto C1 = B.buildConstant(S32, 0x2f800000); 3474 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3475 3476 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3477 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3478 3479 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3480 3481 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3482 .addUse(Mul0.getReg(0)) 3483 .setMIFlags(Flags); 3484 3485 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3486 3487 B.buildFMul(Res, Sel, Mul1, Flags); 3488 3489 MI.eraseFromParent(); 3490 return true; 3491 } 3492 3493 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3494 // FIXME: Why do we handle this one but not other removed instructions? 3495 // 3496 // Reciprocal square root. The clamp prevents infinite results, clamping 3497 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3498 // +-max_float. 
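//
// The expansion below emits a plain llvm.amdgcn.rsq and then clamps the result
// to +-max_float with fmin/fmax (or their IEEE variants, depending on the
// function's FP mode), matching the saturating behavior described above.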
3499 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3500 MachineRegisterInfo &MRI, 3501 MachineIRBuilder &B) const { 3502 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3503 return true; 3504 3505 Register Dst = MI.getOperand(0).getReg(); 3506 Register Src = MI.getOperand(2).getReg(); 3507 auto Flags = MI.getFlags(); 3508 3509 LLT Ty = MRI.getType(Dst); 3510 3511 const fltSemantics *FltSemantics; 3512 if (Ty == LLT::scalar(32)) 3513 FltSemantics = &APFloat::IEEEsingle(); 3514 else if (Ty == LLT::scalar(64)) 3515 FltSemantics = &APFloat::IEEEdouble(); 3516 else 3517 return false; 3518 3519 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3520 .addUse(Src) 3521 .setMIFlags(Flags); 3522 3523 // We don't need to concern ourselves with the snan handling difference, since 3524 // the rsq quieted (or not) so use the one which will directly select. 3525 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3526 const bool UseIEEE = MFI->getMode().IEEE; 3527 3528 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3529 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3530 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3531 3532 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3533 3534 if (UseIEEE) 3535 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3536 else 3537 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3538 MI.eraseFromParent(); 3539 return true; 3540 } 3541 3542 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3543 switch (IID) { 3544 case Intrinsic::amdgcn_ds_fadd: 3545 return AMDGPU::G_ATOMICRMW_FADD; 3546 case Intrinsic::amdgcn_ds_fmin: 3547 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3548 case Intrinsic::amdgcn_ds_fmax: 3549 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3550 default: 3551 llvm_unreachable("not a DS FP intrinsic"); 3552 } 3553 } 3554 3555 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3556 MachineInstr &MI, 3557 Intrinsic::ID IID) const { 3558 GISelChangeObserver &Observer = Helper.Observer; 3559 Observer.changingInstr(MI); 3560 3561 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3562 3563 // The remaining operands were used to set fields in the MemOperand on 3564 // construction. 3565 for (int I = 6; I > 3; --I) 3566 MI.RemoveOperand(I); 3567 3568 MI.RemoveOperand(1); // Remove the intrinsic ID. 
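// With the intrinsic ID and the trailing MMO-field operands removed, the
// remaining (dst, ptr, value) operands match the generic atomic opcode set
// above.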
3569 Observer.changedInstr(MI); 3570 return true; 3571 } 3572 3573 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3574 MachineRegisterInfo &MRI, 3575 MachineIRBuilder &B) const { 3576 uint64_t Offset = 3577 ST.getTargetLowering()->getImplicitParameterOffset( 3578 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3579 LLT DstTy = MRI.getType(DstReg); 3580 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3581 3582 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3583 if (!loadInputValue(KernargPtrReg, B, 3584 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3585 return false; 3586 3587 // FIXME: This should be nuw 3588 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3589 return true; 3590 } 3591 3592 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3593 MachineRegisterInfo &MRI, 3594 MachineIRBuilder &B) const { 3595 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3596 if (!MFI->isEntryFunction()) { 3597 return legalizePreloadedArgIntrin(MI, MRI, B, 3598 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3599 } 3600 3601 Register DstReg = MI.getOperand(0).getReg(); 3602 if (!getImplicitArgPtr(DstReg, MRI, B)) 3603 return false; 3604 3605 MI.eraseFromParent(); 3606 return true; 3607 } 3608 3609 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3610 MachineRegisterInfo &MRI, 3611 MachineIRBuilder &B, 3612 unsigned AddrSpace) const { 3613 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3614 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3615 Register Hi32 = Unmerge.getReg(1); 3616 3617 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3618 MI.eraseFromParent(); 3619 return true; 3620 } 3621 3622 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3623 // offset (the offset that is included in bounds checking and swizzling, to be 3624 // split between the instruction's voffset and immoffset fields) and soffset 3625 // (the offset that is excluded from bounds checking and swizzling, to go in 3626 // the instruction's soffset field). This function takes the first kind of 3627 // offset and figures out how to split it between voffset and immoffset. 3628 std::tuple<Register, unsigned, unsigned> 3629 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3630 Register OrigOffset) const { 3631 const unsigned MaxImm = 4095; 3632 Register BaseReg; 3633 unsigned TotalConstOffset; 3634 const LLT S32 = LLT::scalar(32); 3635 MachineRegisterInfo &MRI = *B.getMRI(); 3636 3637 std::tie(BaseReg, TotalConstOffset) = 3638 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3639 3640 unsigned ImmOffset = TotalConstOffset; 3641 3642 // If BaseReg is a pointer, convert it to int. 3643 if (MRI.getType(BaseReg).isPointer()) 3644 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3645 3646 // If the immediate value is too big for the immoffset field, put the value 3647 // and -4096 into the immoffset field so that the value that is copied/added 3648 // for the voffset field is a multiple of 4096, and it stands more chance 3649 // of being CSEd with the copy/add for another similar load/store. 3650 // However, do not do that rounding down to a multiple of 4096 if that is a 3651 // negative number, as it appears to be illegal to have a negative offset 3652 // in the vgpr, even if adding the immediate offset makes it positive. 
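// For example, a constant offset of 8208 splits into Overflow = 8208 & ~4095
// = 8192 (added to the voffset register below) and ImmOffset = 16, while a
// negative offset such as -16 makes Overflow negative, so the whole value is
// moved into the voffset register and ImmOffset is reset to 0.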
3653 unsigned Overflow = ImmOffset & ~MaxImm; 3654 ImmOffset -= Overflow; 3655 if ((int32_t)Overflow < 0) { 3656 Overflow += ImmOffset; 3657 ImmOffset = 0; 3658 } 3659 3660 if (Overflow != 0) { 3661 if (!BaseReg) { 3662 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3663 } else { 3664 auto OverflowVal = B.buildConstant(S32, Overflow); 3665 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3666 } 3667 } 3668 3669 if (!BaseReg) 3670 BaseReg = B.buildConstant(S32, 0).getReg(0); 3671 3672 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3673 } 3674 3675 /// Handle register layout difference for f16 images for some subtargets. 3676 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3677 MachineRegisterInfo &MRI, 3678 Register Reg, 3679 bool ImageStore) const { 3680 const LLT S16 = LLT::scalar(16); 3681 const LLT S32 = LLT::scalar(32); 3682 LLT StoreVT = MRI.getType(Reg); 3683 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3684 3685 if (ST.hasUnpackedD16VMem()) { 3686 auto Unmerge = B.buildUnmerge(S16, Reg); 3687 3688 SmallVector<Register, 4> WideRegs; 3689 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3690 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3691 3692 int NumElts = StoreVT.getNumElements(); 3693 3694 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3695 } 3696 3697 if (ImageStore && ST.hasImageStoreD16Bug()) { 3698 if (StoreVT.getNumElements() == 2) { 3699 SmallVector<Register, 4> PackedRegs; 3700 Reg = B.buildBitcast(S32, Reg).getReg(0); 3701 PackedRegs.push_back(Reg); 3702 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3703 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3704 } 3705 3706 if (StoreVT.getNumElements() == 3) { 3707 SmallVector<Register, 4> PackedRegs; 3708 auto Unmerge = B.buildUnmerge(S16, Reg); 3709 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3710 PackedRegs.push_back(Unmerge.getReg(I)); 3711 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3712 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3713 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3714 } 3715 3716 if (StoreVT.getNumElements() == 4) { 3717 SmallVector<Register, 4> PackedRegs; 3718 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3719 auto Unmerge = B.buildUnmerge(S32, Reg); 3720 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3721 PackedRegs.push_back(Unmerge.getReg(I)); 3722 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3723 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3724 } 3725 3726 llvm_unreachable("invalid data type"); 3727 } 3728 3729 return Reg; 3730 } 3731 3732 Register AMDGPULegalizerInfo::fixStoreSourceType( 3733 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3734 MachineRegisterInfo *MRI = B.getMRI(); 3735 LLT Ty = MRI->getType(VData); 3736 3737 const LLT S16 = LLT::scalar(16); 3738 3739 // Fixup illegal register types for i8 stores. 
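// i8 and i16 sources are any-extended to s32; the byte/short store variants
// only use the low bits of the 32-bit register, so the extended bits are
// don't-care.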
3740 if (Ty == LLT::scalar(8) || Ty == S16) { 3741 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3742 return AnyExt; 3743 } 3744 3745 if (Ty.isVector()) { 3746 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3747 if (IsFormat) 3748 return handleD16VData(B, *MRI, VData); 3749 } 3750 } 3751 3752 return VData; 3753 } 3754 3755 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3756 MachineRegisterInfo &MRI, 3757 MachineIRBuilder &B, 3758 bool IsTyped, 3759 bool IsFormat) const { 3760 Register VData = MI.getOperand(1).getReg(); 3761 LLT Ty = MRI.getType(VData); 3762 LLT EltTy = Ty.getScalarType(); 3763 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3764 const LLT S32 = LLT::scalar(32); 3765 3766 VData = fixStoreSourceType(B, VData, IsFormat); 3767 Register RSrc = MI.getOperand(2).getReg(); 3768 3769 MachineMemOperand *MMO = *MI.memoperands_begin(); 3770 const int MemSize = MMO->getSize(); 3771 3772 unsigned ImmOffset; 3773 unsigned TotalOffset; 3774 3775 // The typed intrinsics add an immediate after the registers. 3776 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3777 3778 // The struct intrinsic variants add one additional operand over raw. 3779 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3780 Register VIndex; 3781 int OpOffset = 0; 3782 if (HasVIndex) { 3783 VIndex = MI.getOperand(3).getReg(); 3784 OpOffset = 1; 3785 } 3786 3787 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3788 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3789 3790 unsigned Format = 0; 3791 if (IsTyped) { 3792 Format = MI.getOperand(5 + OpOffset).getImm(); 3793 ++OpOffset; 3794 } 3795 3796 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3797 3798 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3799 if (TotalOffset != 0) 3800 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3801 3802 unsigned Opc; 3803 if (IsTyped) { 3804 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3805 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3806 } else if (IsFormat) { 3807 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3808 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3809 } else { 3810 switch (MemSize) { 3811 case 1: 3812 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3813 break; 3814 case 2: 3815 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3816 break; 3817 default: 3818 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3819 break; 3820 } 3821 } 3822 3823 if (!VIndex) 3824 VIndex = B.buildConstant(S32, 0).getReg(0); 3825 3826 auto MIB = B.buildInstr(Opc) 3827 .addUse(VData) // vdata 3828 .addUse(RSrc) // rsrc 3829 .addUse(VIndex) // vindex 3830 .addUse(VOffset) // voffset 3831 .addUse(SOffset) // soffset 3832 .addImm(ImmOffset); // offset(imm) 3833 3834 if (IsTyped) 3835 MIB.addImm(Format); 3836 3837 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3838 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3839 .addMemOperand(MMO); 3840 3841 MI.eraseFromParent(); 3842 return true; 3843 } 3844 3845 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3846 MachineRegisterInfo &MRI, 3847 MachineIRBuilder &B, 3848 bool IsFormat, 3849 bool IsTyped) const { 3850 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
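// Operand layout decoded below: (dst, intrinsic ID, rsrc, [vindex], voffset,
// soffset, [format], aux) -- the struct variants add the vindex operand and
// the typed variants add the format immediate.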
3851 MachineMemOperand *MMO = *MI.memoperands_begin(); 3852 const int MemSize = MMO->getSize(); 3853 const LLT S32 = LLT::scalar(32); 3854 3855 Register Dst = MI.getOperand(0).getReg(); 3856 Register RSrc = MI.getOperand(2).getReg(); 3857 3858 // The typed intrinsics add an immediate after the registers. 3859 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3860 3861 // The struct intrinsic variants add one additional operand over raw. 3862 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3863 Register VIndex; 3864 int OpOffset = 0; 3865 if (HasVIndex) { 3866 VIndex = MI.getOperand(3).getReg(); 3867 OpOffset = 1; 3868 } 3869 3870 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3871 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3872 3873 unsigned Format = 0; 3874 if (IsTyped) { 3875 Format = MI.getOperand(5 + OpOffset).getImm(); 3876 ++OpOffset; 3877 } 3878 3879 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3880 unsigned ImmOffset; 3881 unsigned TotalOffset; 3882 3883 LLT Ty = MRI.getType(Dst); 3884 LLT EltTy = Ty.getScalarType(); 3885 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3886 const bool Unpacked = ST.hasUnpackedD16VMem(); 3887 3888 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3889 if (TotalOffset != 0) 3890 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3891 3892 unsigned Opc; 3893 3894 if (IsTyped) { 3895 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3896 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3897 } else if (IsFormat) { 3898 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3899 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3900 } else { 3901 switch (MemSize) { 3902 case 1: 3903 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3904 break; 3905 case 2: 3906 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3907 break; 3908 default: 3909 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3910 break; 3911 } 3912 } 3913 3914 Register LoadDstReg; 3915 3916 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3917 LLT UnpackedTy = Ty.changeElementSize(32); 3918 3919 if (IsExtLoad) 3920 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3921 else if (Unpacked && IsD16 && Ty.isVector()) 3922 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3923 else 3924 LoadDstReg = Dst; 3925 3926 if (!VIndex) 3927 VIndex = B.buildConstant(S32, 0).getReg(0); 3928 3929 auto MIB = B.buildInstr(Opc) 3930 .addDef(LoadDstReg) // vdata 3931 .addUse(RSrc) // rsrc 3932 .addUse(VIndex) // vindex 3933 .addUse(VOffset) // voffset 3934 .addUse(SOffset) // soffset 3935 .addImm(ImmOffset); // offset(imm) 3936 3937 if (IsTyped) 3938 MIB.addImm(Format); 3939 3940 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3941 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3942 .addMemOperand(MMO); 3943 3944 if (LoadDstReg != Dst) { 3945 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3946 3947 // Widen result for extending loads was widened. 
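// LoadDstReg was created above with a widened s32 (or unpacked) type, so
// truncate or repack it back into the original result type here.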
3948 if (IsExtLoad) 3949 B.buildTrunc(Dst, LoadDstReg); 3950 else { 3951 // Repack to original 16-bit vector result 3952 // FIXME: G_TRUNC should work, but legalization currently fails 3953 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3954 SmallVector<Register, 4> Repack; 3955 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3956 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3957 B.buildMerge(Dst, Repack); 3958 } 3959 } 3960 3961 MI.eraseFromParent(); 3962 return true; 3963 } 3964 3965 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3966 MachineIRBuilder &B, 3967 bool IsInc) const { 3968 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3969 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3970 B.buildInstr(Opc) 3971 .addDef(MI.getOperand(0).getReg()) 3972 .addUse(MI.getOperand(2).getReg()) 3973 .addUse(MI.getOperand(3).getReg()) 3974 .cloneMemRefs(MI); 3975 MI.eraseFromParent(); 3976 return true; 3977 } 3978 3979 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3980 switch (IntrID) { 3981 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3982 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3984 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3985 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3986 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3987 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3988 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3990 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3991 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3992 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3993 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3994 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3995 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3996 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3997 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3998 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3999 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4000 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4001 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4002 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4003 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4004 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4005 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4006 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4007 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4008 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4009 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4010 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4011 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4012 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4013 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4014 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4015 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4016 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4017 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4018 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4019 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4020 case Intrinsic::amdgcn_buffer_atomic_fadd: 4021 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4022 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4023 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4024 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4025 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4026 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4027 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4028 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4029 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4030 default: 4031 llvm_unreachable("unhandled atomic opcode"); 4032 } 4033 } 4034 4035 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4036 MachineIRBuilder &B, 4037 Intrinsic::ID IID) const { 4038 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4039 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4040 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4041 4042 Register Dst; 4043 4044 int OpOffset = 0; 4045 if (HasReturn) { 4046 // A few FP atomics do not support return values. 4047 Dst = MI.getOperand(0).getReg(); 4048 } else { 4049 OpOffset = -1; 4050 } 4051 4052 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4053 Register CmpVal; 4054 4055 if (IsCmpSwap) { 4056 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4057 ++OpOffset; 4058 } 4059 4060 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4061 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4062 4063 // The struct intrinsic variants add one additional operand over raw. 4064 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4065 Register VIndex; 4066 if (HasVIndex) { 4067 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4068 ++OpOffset; 4069 } 4070 4071 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4072 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4073 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4074 4075 MachineMemOperand *MMO = *MI.memoperands_begin(); 4076 4077 unsigned ImmOffset; 4078 unsigned TotalOffset; 4079 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4080 if (TotalOffset != 0) 4081 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4082 4083 if (!VIndex) 4084 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4085 4086 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4087 4088 if (HasReturn) 4089 MIB.addDef(Dst); 4090 4091 MIB.addUse(VData); // vdata 4092 4093 if (IsCmpSwap) 4094 MIB.addReg(CmpVal); 4095 4096 MIB.addUse(RSrc) // rsrc 4097 .addUse(VIndex) // vindex 4098 .addUse(VOffset) // voffset 4099 .addUse(SOffset) // soffset 4100 .addImm(ImmOffset) // offset(imm) 4101 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4102 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4103 .addMemOperand(MMO); 4104 4105 MI.eraseFromParent(); 4106 return true; 4107 } 4108 4109 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4110 /// vector with s16 typed elements. 4111 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4112 SmallVectorImpl<Register> &PackedAddrs, 4113 unsigned ArgOffset, 4114 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4115 bool IsA16, bool IsG16) { 4116 const LLT S16 = LLT::scalar(16); 4117 const LLT V2S16 = LLT::vector(2, 16); 4118 auto EndIdx = Intr->VAddrEnd; 4119 4120 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4121 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4122 if (!SrcOp.isReg()) 4123 continue; // _L to _LZ may have eliminated this. 
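// Operands in front of the gradients keep their 32-bit values and are only
// reinterpreted as v2s16; gradient and coordinate operands are packed two per
// dword below when G16/A16 applies.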
4124 4125 Register AddrReg = SrcOp.getReg(); 4126 4127 if (I < Intr->GradientStart) { 4128 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4129 PackedAddrs.push_back(AddrReg); 4130 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4131 (I >= Intr->CoordStart && !IsA16)) { 4132 // Handle any gradient or coordinate operands that should not be packed 4133 PackedAddrs.push_back(AddrReg); 4134 } else { 4135 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4136 // derivatives dx/dh and dx/dv are packed with undef. 4137 if (((I + 1) >= EndIdx) || 4138 ((Intr->NumGradients / 2) % 2 == 1 && 4139 (I == static_cast<unsigned>(Intr->GradientStart + 4140 (Intr->NumGradients / 2) - 1) || 4141 I == static_cast<unsigned>(Intr->GradientStart + 4142 Intr->NumGradients - 1))) || 4143 // Check for _L to _LZ optimization 4144 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4145 PackedAddrs.push_back( 4146 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4147 .getReg(0)); 4148 } else { 4149 PackedAddrs.push_back( 4150 B.buildBuildVector( 4151 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4152 .getReg(0)); 4153 ++I; 4154 } 4155 } 4156 } 4157 } 4158 4159 /// Convert from separate vaddr components to a single vector address register, 4160 /// and replace the remaining operands with $noreg. 4161 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4162 int DimIdx, int NumVAddrs) { 4163 const LLT S32 = LLT::scalar(32); 4164 4165 SmallVector<Register, 8> AddrRegs; 4166 for (int I = 0; I != NumVAddrs; ++I) { 4167 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4168 if (SrcOp.isReg()) { 4169 AddrRegs.push_back(SrcOp.getReg()); 4170 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4171 } 4172 } 4173 4174 int NumAddrRegs = AddrRegs.size(); 4175 if (NumAddrRegs != 1) { 4176 // Round up to 8 elements for v5-v7 4177 // FIXME: Missing intermediate sized register classes and instructions. 4178 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4179 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4180 auto Undef = B.buildUndef(S32); 4181 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4182 NumAddrRegs = RoundedNumRegs; 4183 } 4184 4185 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4186 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4187 } 4188 4189 for (int I = 1; I != NumVAddrs; ++I) { 4190 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4191 if (SrcOp.isReg()) 4192 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4193 } 4194 } 4195 4196 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4197 /// 4198 /// Depending on the subtarget, load/store with 16-bit element data need to be 4199 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4200 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4201 /// registers. 4202 /// 4203 /// We don't want to directly select image instructions just yet, but also want 4204 /// to exposes all register repacking to the legalizer/combiners. We also don't 4205 /// want a selected instrution entering RegBankSelect. In order to avoid 4206 /// defining a multitude of intermediate image instructions, directly hack on 4207 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 4208 /// now unnecessary arguments with $noreg. 
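///
/// The main steps below are: pack 16-bit gradients and addresses into dwords
/// when G16/A16 is used, fold constant-zero lod/mip operands, switch to a
/// single packed vector address when the NSA encoding is not used, and repack
/// D16 and TFE results into the register layout the rest of the pipeline
/// expects.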
4209 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4210 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4211 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4212 4213 const unsigned NumDefs = MI.getNumExplicitDefs(); 4214 const unsigned ArgOffset = NumDefs + 1; 4215 bool IsTFE = NumDefs == 2; 4216 // We are only processing the operands of d16 image operations on subtargets 4217 // that use the unpacked register layout, or need to repack the TFE result. 4218 4219 // TODO: Do we need to guard against already legalized intrinsics? 4220 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4221 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4222 4223 MachineRegisterInfo *MRI = B.getMRI(); 4224 const LLT S32 = LLT::scalar(32); 4225 const LLT S16 = LLT::scalar(16); 4226 const LLT V2S16 = LLT::vector(2, 16); 4227 4228 unsigned DMask = 0; 4229 4230 // Check for 16 bit addresses and pack if true. 4231 LLT GradTy = 4232 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4233 LLT AddrTy = 4234 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4235 const bool IsG16 = GradTy == S16; 4236 const bool IsA16 = AddrTy == S16; 4237 4238 int DMaskLanes = 0; 4239 if (!BaseOpcode->Atomic) { 4240 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4241 if (BaseOpcode->Gather4) { 4242 DMaskLanes = 4; 4243 } else if (DMask != 0) { 4244 DMaskLanes = countPopulation(DMask); 4245 } else if (!IsTFE && !BaseOpcode->Store) { 4246 // If dmask is 0, this is a no-op load. This can be eliminated. 4247 B.buildUndef(MI.getOperand(0)); 4248 MI.eraseFromParent(); 4249 return true; 4250 } 4251 } 4252 4253 Observer.changingInstr(MI); 4254 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4255 4256 unsigned NewOpcode = NumDefs == 0 ? 4257 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4258 4259 // Track that we legalized this 4260 MI.setDesc(B.getTII().get(NewOpcode)); 4261 4262 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4263 // dmask to be at least 1 otherwise the instruction will fail 4264 if (IsTFE && DMask == 0) { 4265 DMask = 0x1; 4266 DMaskLanes = 1; 4267 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4268 } 4269 4270 if (BaseOpcode->Atomic) { 4271 Register VData0 = MI.getOperand(2).getReg(); 4272 LLT Ty = MRI->getType(VData0); 4273 4274 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4275 if (Ty.isVector()) 4276 return false; 4277 4278 if (BaseOpcode->AtomicX2) { 4279 Register VData1 = MI.getOperand(3).getReg(); 4280 // The two values are packed in one register. 4281 LLT PackedTy = LLT::vector(2, Ty); 4282 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4283 MI.getOperand(2).setReg(Concat.getReg(0)); 4284 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4285 } 4286 } 4287 4288 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4289 4290 // Optimize _L to _LZ when _L is zero 4291 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4292 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4293 const ConstantFP *ConstantLod; 4294 4295 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4296 m_GFCst(ConstantLod))) { 4297 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4298 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
4299 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4300 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4301 Intr->Dim); 4302 4303 // The starting indexes should remain in the same place. 4304 --CorrectedNumVAddrs; 4305 4306 MI.getOperand(MI.getNumExplicitDefs()) 4307 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4308 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4309 Intr = NewImageDimIntr; 4310 } 4311 } 4312 } 4313 4314 // Optimize _mip away, when 'lod' is zero 4315 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4316 int64_t ConstantLod; 4317 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4318 m_ICst(ConstantLod))) { 4319 if (ConstantLod == 0) { 4320 // TODO: Change intrinsic opcode and remove operand instead or replacing 4321 // it with 0, as the _L to _LZ handling is done above. 4322 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4323 --CorrectedNumVAddrs; 4324 } 4325 } 4326 } 4327 4328 // Rewrite the addressing register layout before doing anything else. 4329 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4330 // 16 bit gradients are supported, but are tied to the A16 control 4331 // so both gradients and addresses must be 16 bit 4332 return false; 4333 } 4334 4335 if (IsA16 && !ST.hasA16()) { 4336 // A16 not supported 4337 return false; 4338 } 4339 4340 if (IsA16 || IsG16) { 4341 if (Intr->NumVAddrs > 1) { 4342 SmallVector<Register, 4> PackedRegs; 4343 4344 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4345 IsG16); 4346 4347 // See also below in the non-a16 branch 4348 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4349 4350 if (!UseNSA && PackedRegs.size() > 1) { 4351 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4352 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4353 PackedRegs[0] = Concat.getReg(0); 4354 PackedRegs.resize(1); 4355 } 4356 4357 const unsigned NumPacked = PackedRegs.size(); 4358 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4359 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4360 if (!SrcOp.isReg()) { 4361 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4362 continue; 4363 } 4364 4365 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4366 4367 if (I - Intr->VAddrStart < NumPacked) 4368 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4369 else 4370 SrcOp.setReg(AMDGPU::NoRegister); 4371 } 4372 } 4373 } else { 4374 // If the register allocator cannot place the address registers contiguously 4375 // without introducing moves, then using the non-sequential address encoding 4376 // is always preferable, since it saves VALU instructions and is usually a 4377 // wash in terms of code size or even better. 4378 // 4379 // However, we currently have no way of hinting to the register allocator 4380 // that MIMG addresses should be placed contiguously when it is possible to 4381 // do so, so force non-NSA for the common 2-address case as a heuristic. 4382 // 4383 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4384 // allocation when possible. 4385 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4386 4387 if (!UseNSA && Intr->NumVAddrs > 1) 4388 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4389 Intr->NumVAddrs); 4390 } 4391 4392 int Flags = 0; 4393 if (IsA16) 4394 Flags |= 1; 4395 if (IsG16) 4396 Flags |= 2; 4397 MI.addOperand(MachineOperand::CreateImm(Flags)); 4398 4399 if (BaseOpcode->Store) { // No TFE for stores? 
4400 // TODO: Handle dmask trim 4401 Register VData = MI.getOperand(1).getReg(); 4402 LLT Ty = MRI->getType(VData); 4403 if (!Ty.isVector() || Ty.getElementType() != S16) 4404 return true; 4405 4406 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4407 if (RepackedReg != VData) { 4408 MI.getOperand(1).setReg(RepackedReg); 4409 } 4410 4411 return true; 4412 } 4413 4414 Register DstReg = MI.getOperand(0).getReg(); 4415 LLT Ty = MRI->getType(DstReg); 4416 const LLT EltTy = Ty.getScalarType(); 4417 const bool IsD16 = Ty.getScalarType() == S16; 4418 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4419 4420 // Confirm that the return type is large enough for the dmask specified 4421 if (NumElts < DMaskLanes) 4422 return false; 4423 4424 if (NumElts > 4 || DMaskLanes > 4) 4425 return false; 4426 4427 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4428 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4429 4430 // The raw dword aligned data component of the load. The only legal cases 4431 // where this matters should be when using the packed D16 format, for 4432 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4433 LLT RoundedTy; 4434 4435 // S32 vector to to cover all data, plus TFE result element. 4436 LLT TFETy; 4437 4438 // Register type to use for each loaded component. Will be S32 or V2S16. 4439 LLT RegTy; 4440 4441 if (IsD16 && ST.hasUnpackedD16VMem()) { 4442 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4443 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4444 RegTy = S32; 4445 } else { 4446 unsigned EltSize = EltTy.getSizeInBits(); 4447 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4448 unsigned RoundedSize = 32 * RoundedElts; 4449 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4450 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4451 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4452 } 4453 4454 // The return type does not need adjustment. 4455 // TODO: Should we change s16 case to s32 or <2 x s16>? 4456 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4457 return true; 4458 4459 Register Dst1Reg; 4460 4461 // Insert after the instruction. 4462 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4463 4464 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4465 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4466 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4467 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4468 4469 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4470 4471 MI.getOperand(0).setReg(NewResultReg); 4472 4473 // In the IR, TFE is supposed to be used with a 2 element struct return 4474 // type. The intruction really returns these two values in one contiguous 4475 // register, with one additional dword beyond the loaded data. Rewrite the 4476 // return type to use a single register result. 4477 4478 if (IsTFE) { 4479 Dst1Reg = MI.getOperand(1).getReg(); 4480 if (MRI->getType(Dst1Reg) != S32) 4481 return false; 4482 4483 // TODO: Make sure the TFE operand bit is set. 4484 MI.RemoveOperand(1); 4485 4486 // Handle the easy case that requires no repack instructions. 4487 if (Ty == S32) { 4488 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4489 return true; 4490 } 4491 } 4492 4493 // Now figure out how to copy the new result register back into the old 4494 // result. 4495 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4496 4497 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4498 4499 if (ResultNumRegs == 1) { 4500 assert(!IsTFE); 4501 ResultRegs[0] = NewResultReg; 4502 } else { 4503 // We have to repack into a new vector of some kind. 4504 for (int I = 0; I != NumDataRegs; ++I) 4505 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4506 B.buildUnmerge(ResultRegs, NewResultReg); 4507 4508 // Drop the final TFE element to get the data part. The TFE result is 4509 // directly written to the right place already. 4510 if (IsTFE) 4511 ResultRegs.resize(NumDataRegs); 4512 } 4513 4514 // For an s16 scalar result, we form an s32 result with a truncate regardless 4515 // of packed vs. unpacked. 4516 if (IsD16 && !Ty.isVector()) { 4517 B.buildTrunc(DstReg, ResultRegs[0]); 4518 return true; 4519 } 4520 4521 // Avoid a build/concat_vector of 1 entry. 4522 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4523 B.buildBitcast(DstReg, ResultRegs[0]); 4524 return true; 4525 } 4526 4527 assert(Ty.isVector()); 4528 4529 if (IsD16) { 4530 // For packed D16 results with TFE enabled, all the data components are 4531 // S32. Cast back to the expected type. 4532 // 4533 // TODO: We don't really need to use load s32 elements. We would only need one 4534 // cast for the TFE result if a multiple of v2s16 was used. 4535 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4536 for (Register &Reg : ResultRegs) 4537 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4538 } else if (ST.hasUnpackedD16VMem()) { 4539 for (Register &Reg : ResultRegs) 4540 Reg = B.buildTrunc(S16, Reg).getReg(0); 4541 } 4542 } 4543 4544 auto padWithUndef = [&](LLT Ty, int NumElts) { 4545 if (NumElts == 0) 4546 return; 4547 Register Undef = B.buildUndef(Ty).getReg(0); 4548 for (int I = 0; I != NumElts; ++I) 4549 ResultRegs.push_back(Undef); 4550 }; 4551 4552 // Pad out any elements eliminated due to the dmask. 4553 LLT ResTy = MRI->getType(ResultRegs[0]); 4554 if (!ResTy.isVector()) { 4555 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4556 B.buildBuildVector(DstReg, ResultRegs); 4557 return true; 4558 } 4559 4560 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4561 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4562 4563 // Deal with the one annoying legal case. 4564 const LLT V3S16 = LLT::vector(3, 16); 4565 if (Ty == V3S16) { 4566 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4567 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4568 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4569 return true; 4570 } 4571 4572 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4573 B.buildConcatVectors(DstReg, ResultRegs); 4574 return true; 4575 } 4576 4577 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4578 LegalizerHelper &Helper, MachineInstr &MI) const { 4579 MachineIRBuilder &B = Helper.MIRBuilder; 4580 GISelChangeObserver &Observer = Helper.Observer; 4581 4582 Register Dst = MI.getOperand(0).getReg(); 4583 LLT Ty = B.getMRI()->getType(Dst); 4584 unsigned Size = Ty.getSizeInBits(); 4585 MachineFunction &MF = B.getMF(); 4586 4587 Observer.changingInstr(MI); 4588 4589 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4590 Ty = getBitcastRegisterType(Ty); 4591 Helper.bitcastDst(MI, Ty, 0); 4592 Dst = MI.getOperand(0).getReg(); 4593 B.setInsertPt(B.getMBB(), MI); 4594 } 4595 4596 // FIXME: We don't really need this intermediate instruction. The intrinsic 4597 // should be fixed to have a memory operand. Since it's readnone, we're not 4598 // allowed to add one. 
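// The MMO built below uses a conservative 4-byte alignment and a byte size
// rounded up from the result width; the load is marked invariant and
// dereferenceable so later passes may still freely reorder it.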
4599 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4600 MI.RemoveOperand(1); // Remove intrinsic ID 4601 4602 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4603 // TODO: Should this use datalayout alignment? 4604 const unsigned MemSize = (Size + 7) / 8; 4605 const Align MemAlign(4); 4606 MachineMemOperand *MMO = MF.getMachineMemOperand( 4607 MachinePointerInfo(), 4608 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4609 MachineMemOperand::MOInvariant, 4610 MemSize, MemAlign); 4611 MI.addMemOperand(MF, MMO); 4612 4613 // There are no 96-bit result scalar loads, but widening to 128-bit should 4614 // always be legal. We may need to restore this to a 96-bit result if it turns 4615 // out this needs to be converted to a vector load during RegBankSelect. 4616 if (!isPowerOf2_32(Size)) { 4617 if (Ty.isVector()) 4618 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4619 else 4620 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4621 } 4622 4623 Observer.changedInstr(MI); 4624 return true; 4625 } 4626 4627 // TODO: Move to selection 4628 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4629 MachineRegisterInfo &MRI, 4630 MachineIRBuilder &B) const { 4631 if (!ST.isTrapHandlerEnabled() || 4632 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4633 return legalizeTrapEndpgm(MI, MRI, B); 4634 4635 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4636 switch (*HsaAbiVer) { 4637 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4638 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4639 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4640 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4641 return ST.supportsGetDoorbellID() ? 4642 legalizeTrapHsa(MI, MRI, B) : 4643 legalizeTrapHsaQueuePtr(MI, MRI, B); 4644 } 4645 } 4646 4647 llvm_unreachable("Unknown trap handler"); 4648 } 4649 4650 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4651 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4652 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4653 MI.eraseFromParent(); 4654 return true; 4655 } 4656 4657 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4658 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4659 // Pass queue pointer to trap handler as input, and insert trap instruction 4660 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4661 Register LiveIn = 4662 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4663 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4664 return false; 4665 4666 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4667 B.buildCopy(SGPR01, LiveIn); 4668 B.buildInstr(AMDGPU::S_TRAP) 4669 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4670 .addReg(SGPR01, RegState::Implicit); 4671 4672 MI.eraseFromParent(); 4673 return true; 4674 } 4675 4676 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4677 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4678 B.buildInstr(AMDGPU::S_TRAP) 4679 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4680 MI.eraseFromParent(); 4681 return true; 4682 } 4683 4684 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4685 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4686 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4687 // accordingly 4688 if (!ST.isTrapHandlerEnabled() || 4689 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4690 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4691 "debugtrap handler not supported", 4692 MI.getDebugLoc(), DS_Warning); 4693 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4694 Ctx.diagnose(NoTrap); 4695 } else { 4696 // Insert debug-trap instruction 4697 B.buildInstr(AMDGPU::S_TRAP) 4698 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4699 } 4700 4701 MI.eraseFromParent(); 4702 return true; 4703 } 4704 4705 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4706 MachineIRBuilder &B) const { 4707 MachineRegisterInfo &MRI = *B.getMRI(); 4708 const LLT S16 = LLT::scalar(16); 4709 const LLT S32 = LLT::scalar(32); 4710 4711 Register DstReg = MI.getOperand(0).getReg(); 4712 Register NodePtr = MI.getOperand(2).getReg(); 4713 Register RayExtent = MI.getOperand(3).getReg(); 4714 Register RayOrigin = MI.getOperand(4).getReg(); 4715 Register RayDir = MI.getOperand(5).getReg(); 4716 Register RayInvDir = MI.getOperand(6).getReg(); 4717 Register TDescr = MI.getOperand(7).getReg(); 4718 4719 if (!ST.hasGFX10_AEncoding()) { 4720 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4721 "intrinsic not supported on subtarget", 4722 MI.getDebugLoc()); 4723 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4724 return false; 4725 } 4726 4727 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4728 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4729 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4730 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4731 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4732 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4733 4734 SmallVector<Register, 12> Ops; 4735 if (Is64) { 4736 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4737 Ops.push_back(Unmerge.getReg(0)); 4738 Ops.push_back(Unmerge.getReg(1)); 4739 } else { 4740 Ops.push_back(NodePtr); 4741 } 4742 Ops.push_back(RayExtent); 4743 4744 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4745 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4746 Ops.push_back(Unmerge.getReg(0)); 4747 Ops.push_back(Unmerge.getReg(1)); 4748 Ops.push_back(Unmerge.getReg(2)); 4749 }; 4750 4751 packLanes(RayOrigin); 4752 if (IsA16) { 4753 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4754 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4755 Register R1 = MRI.createGenericVirtualRegister(S32); 4756 Register R2 = MRI.createGenericVirtualRegister(S32); 4757 Register R3 = MRI.createGenericVirtualRegister(S32); 4758 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4759 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4760 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4761 Ops.push_back(R1); 4762 Ops.push_back(R2); 4763 Ops.push_back(R3); 4764 } else { 4765 packLanes(RayDir); 4766 packLanes(RayInvDir); 4767 } 4768 4769 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4770 .addDef(DstReg) 4771 .addImm(Opcode); 4772 4773 for (Register R : Ops) { 4774 MIB.addUse(R); 4775 } 4776 4777 MIB.addUse(TDescr) 4778 .addImm(IsA16 ? 
1 : 0) 4779 .cloneMemRefs(MI); 4780 4781 MI.eraseFromParent(); 4782 return true; 4783 } 4784 4785 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4786 MachineInstr &MI) const { 4787 MachineIRBuilder &B = Helper.MIRBuilder; 4788 MachineRegisterInfo &MRI = *B.getMRI(); 4789 4790 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4791 auto IntrID = MI.getIntrinsicID(); 4792 switch (IntrID) { 4793 case Intrinsic::amdgcn_if: 4794 case Intrinsic::amdgcn_else: { 4795 MachineInstr *Br = nullptr; 4796 MachineBasicBlock *UncondBrTarget = nullptr; 4797 bool Negated = false; 4798 if (MachineInstr *BrCond = 4799 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4800 const SIRegisterInfo *TRI 4801 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4802 4803 Register Def = MI.getOperand(1).getReg(); 4804 Register Use = MI.getOperand(3).getReg(); 4805 4806 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4807 4808 if (Negated) 4809 std::swap(CondBrTarget, UncondBrTarget); 4810 4811 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4812 if (IntrID == Intrinsic::amdgcn_if) { 4813 B.buildInstr(AMDGPU::SI_IF) 4814 .addDef(Def) 4815 .addUse(Use) 4816 .addMBB(UncondBrTarget); 4817 } else { 4818 B.buildInstr(AMDGPU::SI_ELSE) 4819 .addDef(Def) 4820 .addUse(Use) 4821 .addMBB(UncondBrTarget); 4822 } 4823 4824 if (Br) { 4825 Br->getOperand(0).setMBB(CondBrTarget); 4826 } else { 4827 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4828 // since we're swapping branch targets it needs to be reinserted. 4829 // FIXME: IRTranslator should probably not do this 4830 B.buildBr(*CondBrTarget); 4831 } 4832 4833 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4834 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4835 MI.eraseFromParent(); 4836 BrCond->eraseFromParent(); 4837 return true; 4838 } 4839 4840 return false; 4841 } 4842 case Intrinsic::amdgcn_loop: { 4843 MachineInstr *Br = nullptr; 4844 MachineBasicBlock *UncondBrTarget = nullptr; 4845 bool Negated = false; 4846 if (MachineInstr *BrCond = 4847 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4848 const SIRegisterInfo *TRI 4849 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4850 4851 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4852 Register Reg = MI.getOperand(2).getReg(); 4853 4854 if (Negated) 4855 std::swap(CondBrTarget, UncondBrTarget); 4856 4857 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4858 B.buildInstr(AMDGPU::SI_LOOP) 4859 .addUse(Reg) 4860 .addMBB(UncondBrTarget); 4861 4862 if (Br) 4863 Br->getOperand(0).setMBB(CondBrTarget); 4864 else 4865 B.buildBr(*CondBrTarget); 4866 4867 MI.eraseFromParent(); 4868 BrCond->eraseFromParent(); 4869 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4870 return true; 4871 } 4872 4873 return false; 4874 } 4875 case Intrinsic::amdgcn_kernarg_segment_ptr: 4876 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4877 // This only makes sense to call in a kernel, so just lower to null. 
4878 B.buildConstant(MI.getOperand(0).getReg(), 0); 4879 MI.eraseFromParent(); 4880 return true; 4881 } 4882 4883 return legalizePreloadedArgIntrin( 4884 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4885 case Intrinsic::amdgcn_implicitarg_ptr: 4886 return legalizeImplicitArgPtr(MI, MRI, B); 4887 case Intrinsic::amdgcn_workitem_id_x: 4888 return legalizePreloadedArgIntrin(MI, MRI, B, 4889 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4890 case Intrinsic::amdgcn_workitem_id_y: 4891 return legalizePreloadedArgIntrin(MI, MRI, B, 4892 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4893 case Intrinsic::amdgcn_workitem_id_z: 4894 return legalizePreloadedArgIntrin(MI, MRI, B, 4895 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4896 case Intrinsic::amdgcn_workgroup_id_x: 4897 return legalizePreloadedArgIntrin(MI, MRI, B, 4898 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4899 case Intrinsic::amdgcn_workgroup_id_y: 4900 return legalizePreloadedArgIntrin(MI, MRI, B, 4901 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4902 case Intrinsic::amdgcn_workgroup_id_z: 4903 return legalizePreloadedArgIntrin(MI, MRI, B, 4904 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4905 case Intrinsic::amdgcn_dispatch_ptr: 4906 return legalizePreloadedArgIntrin(MI, MRI, B, 4907 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4908 case Intrinsic::amdgcn_queue_ptr: 4909 return legalizePreloadedArgIntrin(MI, MRI, B, 4910 AMDGPUFunctionArgInfo::QUEUE_PTR); 4911 case Intrinsic::amdgcn_implicit_buffer_ptr: 4912 return legalizePreloadedArgIntrin( 4913 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4914 case Intrinsic::amdgcn_dispatch_id: 4915 return legalizePreloadedArgIntrin(MI, MRI, B, 4916 AMDGPUFunctionArgInfo::DISPATCH_ID); 4917 case Intrinsic::amdgcn_fdiv_fast: 4918 return legalizeFDIVFastIntrin(MI, MRI, B); 4919 case Intrinsic::amdgcn_is_shared: 4920 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4921 case Intrinsic::amdgcn_is_private: 4922 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4923 case Intrinsic::amdgcn_wavefrontsize: { 4924 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4925 MI.eraseFromParent(); 4926 return true; 4927 } 4928 case Intrinsic::amdgcn_s_buffer_load: 4929 return legalizeSBufferLoad(Helper, MI); 4930 case Intrinsic::amdgcn_raw_buffer_store: 4931 case Intrinsic::amdgcn_struct_buffer_store: 4932 return legalizeBufferStore(MI, MRI, B, false, false); 4933 case Intrinsic::amdgcn_raw_buffer_store_format: 4934 case Intrinsic::amdgcn_struct_buffer_store_format: 4935 return legalizeBufferStore(MI, MRI, B, false, true); 4936 case Intrinsic::amdgcn_raw_tbuffer_store: 4937 case Intrinsic::amdgcn_struct_tbuffer_store: 4938 return legalizeBufferStore(MI, MRI, B, true, true); 4939 case Intrinsic::amdgcn_raw_buffer_load: 4940 case Intrinsic::amdgcn_struct_buffer_load: 4941 return legalizeBufferLoad(MI, MRI, B, false, false); 4942 case Intrinsic::amdgcn_raw_buffer_load_format: 4943 case Intrinsic::amdgcn_struct_buffer_load_format: 4944 return legalizeBufferLoad(MI, MRI, B, true, false); 4945 case Intrinsic::amdgcn_raw_tbuffer_load: 4946 case Intrinsic::amdgcn_struct_tbuffer_load: 4947 return legalizeBufferLoad(MI, MRI, B, true, true); 4948 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4949 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4950 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4951 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4952 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4953 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4954 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4955 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4956 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4957 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4958 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4959 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4960 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4961 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4962 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4963 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4964 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4965 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4966 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4967 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4968 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4969 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4970 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4971 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4972 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4973 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4974 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4975 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4976 case Intrinsic::amdgcn_buffer_atomic_fadd: 4977 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4978 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4979 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4980 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4981 return legalizeBufferAtomic(MI, B, IntrID); 4982 case Intrinsic::amdgcn_atomic_inc: 4983 return legalizeAtomicIncDec(MI, B, true); 4984 case Intrinsic::amdgcn_atomic_dec: 4985 return legalizeAtomicIncDec(MI, B, false); 4986 case Intrinsic::trap: 4987 return legalizeTrapIntrinsic(MI, MRI, B); 4988 case Intrinsic::debugtrap: 4989 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4990 case Intrinsic::amdgcn_rsq_clamp: 4991 return legalizeRsqClampIntrinsic(MI, MRI, B); 4992 case Intrinsic::amdgcn_ds_fadd: 4993 case Intrinsic::amdgcn_ds_fmin: 4994 case Intrinsic::amdgcn_ds_fmax: 4995 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4996 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4997 return legalizeBVHIntrinsic(MI, B); 4998 default: { 4999 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 5000 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 5001 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 5002 return true; 5003 } 5004 } 5005 5006 return true; 5007 } 5008