1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 30 #define DEBUG_TYPE "amdgpu-legalinfo" 31 32 using namespace llvm; 33 using namespace LegalizeActions; 34 using namespace LegalizeMutations; 35 using namespace LegalityPredicates; 36 using namespace MIPatternMatch; 37 38 // Hack until load/store selection patterns support any tuple of legal types. 39 static cl::opt<bool> EnableNewLegality( 40 "amdgpu-global-isel-new-legality", 41 cl::desc("Use GlobalISel desired legality, rather than try to use" 42 "rules compatible with selection patterns"), 43 cl::init(false), 44 cl::ReallyHidden); 45 46 static constexpr unsigned MaxRegisterSize = 1024; 47 48 // Round the number of elements to the next power of two elements 49 static LLT getPow2VectorType(LLT Ty) { 50 unsigned NElts = Ty.getNumElements(); 51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 53 } 54 55 // Round the number of bits to the next power of two bits 56 static LLT getPow2ScalarType(LLT Ty) { 57 unsigned Bits = Ty.getSizeInBits(); 58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 59 return LLT::scalar(Pow2Bits); 60 } 61 62 /// \returs true if this is an odd sized vector which should widen by adding an 63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 64 /// excludes s1 vectors, which should always be scalarized. 
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, 99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 100 }; 101 } 102 103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 104 return [=](const LegalityQuery &Query) { 105 const LLT Ty = Query.Types[TypeIdx]; 106 const LLT EltTy = Ty.getElementType(); 107 unsigned Size = Ty.getSizeInBits(); 108 unsigned Pieces = (Size + 63) / 64; 109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 110 return std::make_pair( 111 TypeIdx, 112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 113 }; 114 } 115 116 // Increase the number of vector elements to reach the next multiple of 32-bit 117 // type. 118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 122 const LLT EltTy = Ty.getElementType(); 123 const int Size = Ty.getSizeInBits(); 124 const int EltSize = EltTy.getSizeInBits(); 125 const int NextMul32 = (Size + 31) / 32; 126 127 assert(EltSize < 32); 128 129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 131 }; 132 } 133 134 static LLT getBitcastRegisterType(const LLT Ty) { 135 const unsigned Size = Ty.getSizeInBits(); 136 137 LLT CoercedTy; 138 if (Size <= 32) { 139 // <2 x s8> -> s16 140 // <4 x s8> -> s32 141 return LLT::scalar(Size); 142 } 143 144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145 } 146 147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151 }; 152 } 153 154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT Ty = Query.Types[TypeIdx]; 157 unsigned Size = Ty.getSizeInBits(); 158 assert(Size % 32 == 0); 159 return std::make_pair( 160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 161 }; 162 } 163 164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 165 return [=](const LegalityQuery &Query) { 166 const LLT QueryTy = Query.Types[TypeIdx]; 167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 168 }; 169 } 170 171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 172 return [=](const LegalityQuery &Query) { 173 const LLT 
QueryTy = Query.Types[TypeIdx]; 174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 175 }; 176 } 177 178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 179 return [=](const LegalityQuery &Query) { 180 const LLT QueryTy = Query.Types[TypeIdx]; 181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 182 }; 183 } 184 185 static bool isRegisterSize(unsigned Size) { 186 return Size % 32 == 0 && Size <= MaxRegisterSize; 187 } 188 189 static bool isRegisterVectorElementType(LLT EltTy) { 190 const int EltSize = EltTy.getSizeInBits(); 191 return EltSize == 16 || EltSize % 32 == 0; 192 } 193 194 static bool isRegisterVectorType(LLT Ty) { 195 const int EltSize = Ty.getElementType().getSizeInBits(); 196 return EltSize == 32 || EltSize == 64 || 197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 198 EltSize == 128 || EltSize == 256; 199 } 200 201 static bool isRegisterType(LLT Ty) { 202 if (!isRegisterSize(Ty.getSizeInBits())) 203 return false; 204 205 if (Ty.isVector()) 206 return isRegisterVectorType(Ty); 207 208 return true; 209 } 210 211 // Any combination of 32 or 64-bit elements up the maximum register size, and 212 // multiples of v2s16. 213 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 214 return [=](const LegalityQuery &Query) { 215 return isRegisterType(Query.Types[TypeIdx]); 216 }; 217 } 218 219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 if (!QueryTy.isVector()) 223 return false; 224 const LLT EltTy = QueryTy.getElementType(); 225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 226 }; 227 } 228 229 // If we have a truncating store or an extending load with a data size larger 230 // than 32-bits, we need to reduce to a 32-bit type. 231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 232 return [=](const LegalityQuery &Query) { 233 const LLT Ty = Query.Types[TypeIdx]; 234 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 236 }; 237 } 238 239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 240 // handle some operations by just promoting the register during 241 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 243 bool IsLoad) { 244 switch (AS) { 245 case AMDGPUAS::PRIVATE_ADDRESS: 246 // FIXME: Private element size. 247 return ST.enableFlatScratch() ? 128 : 32; 248 case AMDGPUAS::LOCAL_ADDRESS: 249 return ST.useDS128() ? 128 : 64; 250 case AMDGPUAS::GLOBAL_ADDRESS: 251 case AMDGPUAS::CONSTANT_ADDRESS: 252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 253 // Treat constant and global as identical. SMRD loads are sometimes usable for 254 // global loads (ideally constant address space should be eliminated) 255 // depending on the context. Legality cannot be context dependent, but 256 // RegBankSelect can split the load as necessary depending on the pointer 257 // register bank/uniformity and if the memory is invariant or not written in a 258 // kernel. 259 return IsLoad ? 512 : 128; 260 default: 261 // Flat addresses may contextually need to be split to 32-bit parts if they 262 // may alias scratch depending on the subtarget. 
263 return 128; 264 } 265 } 266 267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268 const LegalityQuery &Query) { 269 const LLT Ty = Query.Types[0]; 270 271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 273 274 unsigned RegSize = Ty.getSizeInBits(); 275 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 277 unsigned AS = Query.Types[1].getAddressSpace(); 278 279 // All of these need to be custom lowered to cast the pointer operand. 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 281 return false; 282 283 // Do not handle extending vector loads. 284 if (Ty.isVector() && MemSize != RegSize) 285 return false; 286 287 // TODO: We should be able to widen loads if the alignment is high enough, but 288 // we also need to modify the memory access size. 289 #if 0 290 // Accept widening loads based on alignment. 291 if (IsLoad && MemSize < Size) 292 MemSize = std::max(MemSize, Align); 293 #endif 294 295 // Only 1-byte and 2-byte to 32-bit extloads are valid. 296 if (MemSize != RegSize && RegSize != 32) 297 return false; 298 299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 300 return false; 301 302 switch (MemSize) { 303 case 8: 304 case 16: 305 case 32: 306 case 64: 307 case 128: 308 break; 309 case 96: 310 if (!ST.hasDwordx3LoadStores()) 311 return false; 312 break; 313 case 256: 314 case 512: 315 // These may contextually need to be broken down. 316 break; 317 default: 318 return false; 319 } 320 321 assert(RegSize >= MemSize); 322 323 if (AlignBits < MemSize) { 324 const SITargetLowering *TLI = ST.getTargetLowering(); 325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326 Align(AlignBits / 8))) 327 return false; 328 } 329 330 return true; 331 } 332 333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 334 // workaround this. Eventually it should ignore the type for loads and only care 335 // about the size. Return true in cases where we will workaround this for now by 336 // bitcasting. 337 static bool loadStoreBitcastWorkaround(const LLT Ty) { 338 if (EnableNewLegality) 339 return false; 340 341 const unsigned Size = Ty.getSizeInBits(); 342 if (Size <= 64) 343 return false; 344 if (!Ty.isVector()) 345 return true; 346 347 LLT EltTy = Ty.getElementType(); 348 if (EltTy.isPointer()) 349 return true; 350 351 unsigned EltSize = EltTy.getSizeInBits(); 352 return EltSize != 32 && EltSize != 64; 353 } 354 355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 356 const LLT Ty = Query.Types[0]; 357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 358 !loadStoreBitcastWorkaround(Ty); 359 } 360 361 /// Return true if a load or store of the type should be lowered with a bitcast 362 /// to a different type. 363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364 const unsigned MemSizeInBits) { 365 const unsigned Size = Ty.getSizeInBits(); 366 if (Size != MemSizeInBits) 367 return Size <= 32 && Ty.isVector(); 368 369 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 370 return true; 371 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 372 !isRegisterVectorElementType(Ty.getElementType()); 373 } 374 375 /// Return true if we should legalize a load by widening an odd sized memory 376 /// access up to the alignment. Note this case when the memory access itself 377 /// changes, not the size of the result register. 
378 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 379 unsigned AlignInBits, unsigned AddrSpace, 380 unsigned Opcode) { 381 // We don't want to widen cases that are naturally legal. 382 if (isPowerOf2_32(SizeInBits)) 383 return false; 384 385 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 386 // end up widening these for a scalar load during RegBankSelect, since there 387 // aren't 96-bit scalar loads. 388 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 389 return false; 390 391 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 392 return false; 393 394 // A load is known dereferenceable up to the alignment, so it's legal to widen 395 // to it. 396 // 397 // TODO: Could check dereferenceable for less aligned cases. 398 unsigned RoundedSize = NextPowerOf2(SizeInBits); 399 if (AlignInBits < RoundedSize) 400 return false; 401 402 // Do not widen if it would introduce a slow unaligned load. 403 const SITargetLowering *TLI = ST.getTargetLowering(); 404 bool Fast = false; 405 return TLI->allowsMisalignedMemoryAccessesImpl( 406 RoundedSize, AddrSpace, Align(AlignInBits / 8), 407 MachineMemOperand::MOLoad, &Fast) && 408 Fast; 409 } 410 411 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 412 unsigned Opcode) { 413 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 414 return false; 415 416 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 417 Query.MMODescrs[0].AlignInBits, 418 Query.Types[1].getAddressSpace(), Opcode); 419 } 420 421 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 422 const GCNTargetMachine &TM) 423 : ST(ST_) { 424 using namespace TargetOpcode; 425 426 auto GetAddrSpacePtr = [&TM](unsigned AS) { 427 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 428 }; 429 430 const LLT S1 = LLT::scalar(1); 431 const LLT S8 = LLT::scalar(8); 432 const LLT S16 = LLT::scalar(16); 433 const LLT S32 = LLT::scalar(32); 434 const LLT S64 = LLT::scalar(64); 435 const LLT S128 = LLT::scalar(128); 436 const LLT S256 = LLT::scalar(256); 437 const LLT S512 = LLT::scalar(512); 438 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 439 440 const LLT V2S8 = LLT::fixed_vector(2, 8); 441 const LLT V2S16 = LLT::fixed_vector(2, 16); 442 const LLT V4S16 = LLT::fixed_vector(4, 16); 443 444 const LLT V2S32 = LLT::fixed_vector(2, 32); 445 const LLT V3S32 = LLT::fixed_vector(3, 32); 446 const LLT V4S32 = LLT::fixed_vector(4, 32); 447 const LLT V5S32 = LLT::fixed_vector(5, 32); 448 const LLT V6S32 = LLT::fixed_vector(6, 32); 449 const LLT V7S32 = LLT::fixed_vector(7, 32); 450 const LLT V8S32 = LLT::fixed_vector(8, 32); 451 const LLT V9S32 = LLT::fixed_vector(9, 32); 452 const LLT V10S32 = LLT::fixed_vector(10, 32); 453 const LLT V11S32 = LLT::fixed_vector(11, 32); 454 const LLT V12S32 = LLT::fixed_vector(12, 32); 455 const LLT V13S32 = LLT::fixed_vector(13, 32); 456 const LLT V14S32 = LLT::fixed_vector(14, 32); 457 const LLT V15S32 = LLT::fixed_vector(15, 32); 458 const LLT V16S32 = LLT::fixed_vector(16, 32); 459 const LLT V32S32 = LLT::fixed_vector(32, 32); 460 461 const LLT V2S64 = LLT::fixed_vector(2, 64); 462 const LLT V3S64 = LLT::fixed_vector(3, 64); 463 const LLT V4S64 = LLT::fixed_vector(4, 64); 464 const LLT V5S64 = LLT::fixed_vector(5, 64); 465 const LLT V6S64 = LLT::fixed_vector(6, 64); 466 const LLT V7S64 = LLT::fixed_vector(7, 64); 467 const LLT V8S64 = LLT::fixed_vector(8, 64); 468 const LLT V16S64 = LLT::fixed_vector(16, 64); 469 470 std::initializer_list<LLT> 
AllS32Vectors = 471 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 472 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 473 std::initializer_list<LLT> AllS64Vectors = 474 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 475 476 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 477 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 478 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 479 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 480 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 481 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 482 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 483 484 const LLT CodePtr = FlatPtr; 485 486 const std::initializer_list<LLT> AddrSpaces64 = { 487 GlobalPtr, ConstantPtr, FlatPtr 488 }; 489 490 const std::initializer_list<LLT> AddrSpaces32 = { 491 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 492 }; 493 494 const std::initializer_list<LLT> FPTypesBase = { 495 S32, S64 496 }; 497 498 const std::initializer_list<LLT> FPTypes16 = { 499 S32, S64, S16 500 }; 501 502 const std::initializer_list<LLT> FPTypesPK16 = { 503 S32, S64, S16, V2S16 504 }; 505 506 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 507 508 // s1 for VCC branches, s32 for SCC branches. 509 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 510 511 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 512 // elements for v3s16 513 getActionDefinitionsBuilder(G_PHI) 514 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 515 .legalFor(AllS32Vectors) 516 .legalFor(AllS64Vectors) 517 .legalFor(AddrSpaces64) 518 .legalFor(AddrSpaces32) 519 .legalIf(isPointer(0)) 520 .clampScalar(0, S16, S256) 521 .widenScalarToNextPow2(0, 32) 522 .clampMaxNumElements(0, S32, 16) 523 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 524 .scalarize(0); 525 526 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 527 // Full set of gfx9 features. 528 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 529 .legalFor({S32, S16, V2S16}) 530 .clampScalar(0, S16, S32) 531 .clampMaxNumElements(0, S16, 2) 532 .scalarize(0) 533 .widenScalarToNextPow2(0, 32); 534 535 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 536 .legalFor({S32, S16, V2S16}) // Clamp modifier 537 .minScalarOrElt(0, S16) 538 .clampMaxNumElements(0, S16, 2) 539 .scalarize(0) 540 .widenScalarToNextPow2(0, 32) 541 .lower(); 542 } else if (ST.has16BitInsts()) { 543 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 544 .legalFor({S32, S16}) 545 .clampScalar(0, S16, S32) 546 .scalarize(0) 547 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 548 549 // Technically the saturating operations require clamp bit support, but this 550 // was introduced at the same time as 16-bit operations. 551 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 552 .legalFor({S32, S16}) // Clamp modifier 553 .minScalar(0, S16) 554 .scalarize(0) 555 .widenScalarToNextPow2(0, 16) 556 .lower(); 557 558 // We're just lowering this, but it helps get a better result to try to 559 // coerce to the desired type first. 
560 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 561 .minScalar(0, S16) 562 .scalarize(0) 563 .lower(); 564 } else { 565 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 566 .legalFor({S32}) 567 .clampScalar(0, S32, S32) 568 .scalarize(0); 569 570 if (ST.hasIntClamp()) { 571 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 572 .legalFor({S32}) // Clamp modifier. 573 .scalarize(0) 574 .minScalarOrElt(0, S32) 575 .lower(); 576 } else { 577 // Clamp bit support was added in VI, along with 16-bit operations. 578 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 579 .minScalar(0, S32) 580 .scalarize(0) 581 .lower(); 582 } 583 584 // FIXME: DAG expansion gets better results. The widening uses the smaller 585 // range values and goes for the min/max lowering directly. 586 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 587 .minScalar(0, S32) 588 .scalarize(0) 589 .lower(); 590 } 591 592 getActionDefinitionsBuilder( 593 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 594 .customFor({S32, S64}) 595 .clampScalar(0, S32, S64) 596 .widenScalarToNextPow2(0, 32) 597 .scalarize(0); 598 599 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 600 .legalFor({S32}) 601 .maxScalarOrElt(0, S32); 602 603 if (ST.hasVOP3PInsts()) { 604 Mulh 605 .clampMaxNumElements(0, S8, 2) 606 .lowerFor({V2S8}); 607 } 608 609 Mulh 610 .scalarize(0) 611 .lower(); 612 613 // Report legal for any types we can handle anywhere. For the cases only legal 614 // on the SALU, RegBankSelect will be able to re-legalize. 615 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 616 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 617 .clampScalar(0, S32, S64) 618 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 619 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 620 .widenScalarToNextPow2(0) 621 .scalarize(0); 622 623 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 624 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 625 .legalFor({{S32, S1}, {S32, S32}}) 626 .minScalar(0, S32) 627 // TODO: .scalarize(0) 628 .lower(); 629 630 getActionDefinitionsBuilder(G_BITCAST) 631 // Don't worry about the size constraint. 632 .legalIf(all(isRegisterType(0), isRegisterType(1))) 633 .lower(); 634 635 636 getActionDefinitionsBuilder(G_CONSTANT) 637 .legalFor({S1, S32, S64, S16, GlobalPtr, 638 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 639 .legalIf(isPointer(0)) 640 .clampScalar(0, S32, S64) 641 .widenScalarToNextPow2(0); 642 643 getActionDefinitionsBuilder(G_FCONSTANT) 644 .legalFor({S32, S64, S16}) 645 .clampScalar(0, S16, S64); 646 647 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 648 .legalIf(isRegisterType(0)) 649 // s1 and s16 are special cases because they have legal operations on 650 // them, but don't really occupy registers in the normal way. 651 .legalFor({S1, S16}) 652 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 653 .clampScalarOrElt(0, S32, MaxScalar) 654 .widenScalarToNextPow2(0, 32) 655 .clampMaxNumElements(0, S32, 16); 656 657 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 658 659 // If the amount is divergent, we have to do a wave reduction to get the 660 // maximum value, so this is expanded during RegBankSelect. 
661 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 662 .legalFor({{PrivatePtr, S32}}); 663 664 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 665 .customIf(typeIsNot(0, PrivatePtr)); 666 667 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 668 669 auto &FPOpActions = getActionDefinitionsBuilder( 670 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 671 .legalFor({S32, S64}); 672 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 673 .customFor({S32, S64}); 674 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 675 .customFor({S32, S64}); 676 677 if (ST.has16BitInsts()) { 678 if (ST.hasVOP3PInsts()) 679 FPOpActions.legalFor({S16, V2S16}); 680 else 681 FPOpActions.legalFor({S16}); 682 683 TrigActions.customFor({S16}); 684 FDIVActions.customFor({S16}); 685 } 686 687 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 688 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 689 690 if (ST.hasVOP3PInsts()) { 691 MinNumMaxNum.customFor(FPTypesPK16) 692 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 693 .clampMaxNumElements(0, S16, 2) 694 .clampScalar(0, S16, S64) 695 .scalarize(0); 696 } else if (ST.has16BitInsts()) { 697 MinNumMaxNum.customFor(FPTypes16) 698 .clampScalar(0, S16, S64) 699 .scalarize(0); 700 } else { 701 MinNumMaxNum.customFor(FPTypesBase) 702 .clampScalar(0, S32, S64) 703 .scalarize(0); 704 } 705 706 if (ST.hasVOP3PInsts()) 707 FPOpActions.clampMaxNumElements(0, S16, 2); 708 709 FPOpActions 710 .scalarize(0) 711 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 712 713 TrigActions 714 .scalarize(0) 715 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 716 717 FDIVActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 722 .legalFor(FPTypesPK16) 723 .clampMaxNumElements(0, S16, 2) 724 .scalarize(0) 725 .clampScalar(0, S16, S64); 726 727 if (ST.has16BitInsts()) { 728 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 729 .legalFor({S32, S64, S16}) 730 .scalarize(0) 731 .clampScalar(0, S16, S64); 732 } else { 733 getActionDefinitionsBuilder(G_FSQRT) 734 .legalFor({S32, S64}) 735 .scalarize(0) 736 .clampScalar(0, S32, S64); 737 738 if (ST.hasFractBug()) { 739 getActionDefinitionsBuilder(G_FFLOOR) 740 .customFor({S64}) 741 .legalFor({S32, S64}) 742 .scalarize(0) 743 .clampScalar(0, S32, S64); 744 } else { 745 getActionDefinitionsBuilder(G_FFLOOR) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } 750 } 751 752 getActionDefinitionsBuilder(G_FPTRUNC) 753 .legalFor({{S32, S64}, {S16, S32}}) 754 .scalarize(0) 755 .lower(); 756 757 getActionDefinitionsBuilder(G_FPEXT) 758 .legalFor({{S64, S32}, {S32, S16}}) 759 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 760 .scalarize(0); 761 762 getActionDefinitionsBuilder(G_FSUB) 763 // Use actual fsub instruction 764 .legalFor({S32}) 765 // Must use fadd + fneg 766 .lowerFor({S64, S16, V2S16}) 767 .scalarize(0) 768 .clampScalar(0, S32, S64); 769 770 // Whether this is legal depends on the floating point mode for the function. 
771 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 772 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 773 FMad.customFor({S32, S16}); 774 else if (ST.hasMadMacF32Insts()) 775 FMad.customFor({S32}); 776 else if (ST.hasMadF16()) 777 FMad.customFor({S16}); 778 FMad.scalarize(0) 779 .lower(); 780 781 auto &FRem = getActionDefinitionsBuilder(G_FREM); 782 if (ST.has16BitInsts()) { 783 FRem.customFor({S16, S32, S64}); 784 } else { 785 FRem.minScalar(0, S32) 786 .customFor({S32, S64}); 787 } 788 FRem.scalarize(0); 789 790 // TODO: Do we need to clamp maximum bitwidth? 791 getActionDefinitionsBuilder(G_TRUNC) 792 .legalIf(isScalar(0)) 793 .legalFor({{V2S16, V2S32}}) 794 .clampMaxNumElements(0, S16, 2) 795 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 796 // situations (like an invalid implicit use), we don't want to infinite loop 797 // in the legalizer. 798 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 799 .alwaysLegal(); 800 801 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 802 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 803 {S32, S1}, {S64, S1}, {S16, S1}}) 804 .scalarize(0) 805 .clampScalar(0, S32, S64) 806 .widenScalarToNextPow2(1, 32); 807 808 // TODO: Split s1->s64 during regbankselect for VALU. 809 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 810 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 811 .lowerFor({{S32, S64}}) 812 .lowerIf(typeIs(1, S1)) 813 .customFor({{S64, S64}}); 814 if (ST.has16BitInsts()) 815 IToFP.legalFor({{S16, S16}}); 816 IToFP.clampScalar(1, S32, S64) 817 .minScalar(0, S32) 818 .scalarize(0) 819 .widenScalarToNextPow2(1); 820 821 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 822 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 823 .customFor({{S64, S32}, {S64, S64}}) 824 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 825 if (ST.has16BitInsts()) 826 FPToI.legalFor({{S16, S16}}); 827 else 828 FPToI.minScalar(1, S32); 829 830 FPToI.minScalar(0, S32) 831 .widenScalarToNextPow2(0, 32) 832 .scalarize(0) 833 .lower(); 834 835 // Lower roundeven into G_FRINT 836 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 837 .scalarize(0) 838 .lower(); 839 840 if (ST.has16BitInsts()) { 841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 842 .legalFor({S16, S32, S64}) 843 .clampScalar(0, S16, S64) 844 .scalarize(0); 845 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 847 .legalFor({S32, S64}) 848 .clampScalar(0, S32, S64) 849 .scalarize(0); 850 } else { 851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 852 .legalFor({S32}) 853 .customFor({S64}) 854 .clampScalar(0, S32, S64) 855 .scalarize(0); 856 } 857 858 getActionDefinitionsBuilder(G_PTR_ADD) 859 .legalIf(all(isPointer(0), sameSize(0, 1))) 860 .scalarize(0) 861 .scalarSameSizeAs(1, 0); 862 863 getActionDefinitionsBuilder(G_PTRMASK) 864 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 865 .scalarSameSizeAs(1, 0) 866 .scalarize(0); 867 868 auto &CmpBuilder = 869 getActionDefinitionsBuilder(G_ICMP) 870 // The compare output type differs based on the register bank of the output, 871 // so make both s1 and s32 legal. 872 // 873 // Scalar compares producing output in scc will be promoted to s32, as that 874 // is the allocatable register type that will be needed for the copy from 875 // scc. 
This will be promoted during RegBankSelect, and we assume something 876 // before that won't try to use s32 result types. 877 // 878 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 879 // bank. 880 .legalForCartesianProduct( 881 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 882 .legalForCartesianProduct( 883 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 884 if (ST.has16BitInsts()) { 885 CmpBuilder.legalFor({{S1, S16}}); 886 } 887 888 CmpBuilder 889 .widenScalarToNextPow2(1) 890 .clampScalar(1, S32, S64) 891 .scalarize(0) 892 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 893 894 getActionDefinitionsBuilder(G_FCMP) 895 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 896 .widenScalarToNextPow2(1) 897 .clampScalar(1, S32, S64) 898 .scalarize(0); 899 900 // FIXME: fpow has a selection pattern that should move to custom lowering. 901 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 902 if (ST.has16BitInsts()) 903 Exp2Ops.legalFor({S32, S16}); 904 else 905 Exp2Ops.legalFor({S32}); 906 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 907 Exp2Ops.scalarize(0); 908 909 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 910 if (ST.has16BitInsts()) 911 ExpOps.customFor({{S32}, {S16}}); 912 else 913 ExpOps.customFor({S32}); 914 ExpOps.clampScalar(0, MinScalarFPTy, S32) 915 .scalarize(0); 916 917 getActionDefinitionsBuilder(G_FPOWI) 918 .clampScalar(0, MinScalarFPTy, S32) 919 .lower(); 920 921 // The 64-bit versions produce 32-bit results, but only on the SALU. 922 getActionDefinitionsBuilder(G_CTPOP) 923 .legalFor({{S32, S32}, {S32, S64}}) 924 .clampScalar(0, S32, S32) 925 .clampScalar(1, S32, S64) 926 .scalarize(0) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32); 929 930 // The hardware instructions return a different result on 0 than the generic 931 // instructions expect. The hardware produces -1, but these produce the 932 // bitwidth. 933 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 934 .scalarize(0) 935 .clampScalar(0, S32, S32) 936 .clampScalar(1, S32, S64) 937 .widenScalarToNextPow2(0, 32) 938 .widenScalarToNextPow2(1, 32) 939 .lower(); 940 941 // The 64-bit versions produce 32-bit results, but only on the SALU. 942 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 943 .legalFor({{S32, S32}, {S32, S64}}) 944 .clampScalar(0, S32, S32) 945 .clampScalar(1, S32, S64) 946 .scalarize(0) 947 .widenScalarToNextPow2(0, 32) 948 .widenScalarToNextPow2(1, 32); 949 950 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 951 // RegBankSelect. 952 getActionDefinitionsBuilder(G_BITREVERSE) 953 .legalFor({S32, S64}) 954 .clampScalar(0, S32, S64) 955 .scalarize(0) 956 .widenScalarToNextPow2(0); 957 958 if (ST.has16BitInsts()) { 959 getActionDefinitionsBuilder(G_BSWAP) 960 .legalFor({S16, S32, V2S16}) 961 .clampMaxNumElements(0, S16, 2) 962 // FIXME: Fixing non-power-of-2 before clamp is workaround for 963 // narrowScalar limitation. 
964 .widenScalarToNextPow2(0) 965 .clampScalar(0, S16, S32) 966 .scalarize(0); 967 968 if (ST.hasVOP3PInsts()) { 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 970 .legalFor({S32, S16, V2S16}) 971 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 972 .clampMaxNumElements(0, S16, 2) 973 .minScalar(0, S16) 974 .widenScalarToNextPow2(0) 975 .scalarize(0) 976 .lower(); 977 } else { 978 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 979 .legalFor({S32, S16}) 980 .widenScalarToNextPow2(0) 981 .minScalar(0, S16) 982 .scalarize(0) 983 .lower(); 984 } 985 } else { 986 // TODO: Should have same legality without v_perm_b32 987 getActionDefinitionsBuilder(G_BSWAP) 988 .legalFor({S32}) 989 .lowerIf(scalarNarrowerThan(0, 32)) 990 // FIXME: Fixing non-power-of-2 before clamp is workaround for 991 // narrowScalar limitation. 992 .widenScalarToNextPow2(0) 993 .maxScalar(0, S32) 994 .scalarize(0) 995 .lower(); 996 997 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 998 .legalFor({S32}) 999 .minScalar(0, S32) 1000 .widenScalarToNextPow2(0) 1001 .scalarize(0) 1002 .lower(); 1003 } 1004 1005 getActionDefinitionsBuilder(G_INTTOPTR) 1006 // List the common cases 1007 .legalForCartesianProduct(AddrSpaces64, {S64}) 1008 .legalForCartesianProduct(AddrSpaces32, {S32}) 1009 .scalarize(0) 1010 // Accept any address space as long as the size matches 1011 .legalIf(sameSize(0, 1)) 1012 .widenScalarIf(smallerThan(1, 0), 1013 [](const LegalityQuery &Query) { 1014 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1015 }) 1016 .narrowScalarIf(largerThan(1, 0), 1017 [](const LegalityQuery &Query) { 1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1019 }); 1020 1021 getActionDefinitionsBuilder(G_PTRTOINT) 1022 // List the common cases 1023 .legalForCartesianProduct(AddrSpaces64, {S64}) 1024 .legalForCartesianProduct(AddrSpaces32, {S32}) 1025 .scalarize(0) 1026 // Accept any address space as long as the size matches 1027 .legalIf(sameSize(0, 1)) 1028 .widenScalarIf(smallerThan(0, 1), 1029 [](const LegalityQuery &Query) { 1030 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1031 }) 1032 .narrowScalarIf( 1033 largerThan(0, 1), 1034 [](const LegalityQuery &Query) { 1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1036 }); 1037 1038 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1039 .scalarize(0) 1040 .custom(); 1041 1042 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1043 bool IsLoad) -> bool { 1044 const LLT DstTy = Query.Types[0]; 1045 1046 // Split vector extloads. 1047 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1048 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1049 1050 if (MemSize < DstTy.getSizeInBits()) 1051 MemSize = std::max(MemSize, AlignBits); 1052 1053 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1054 return true; 1055 1056 const LLT PtrTy = Query.Types[1]; 1057 unsigned AS = PtrTy.getAddressSpace(); 1058 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1059 return true; 1060 1061 // Catch weird sized loads that don't evenly divide into the access sizes 1062 // TODO: May be able to widen depending on alignment etc. 1063 unsigned NumRegs = (MemSize + 31) / 32; 1064 if (NumRegs == 3) { 1065 if (!ST.hasDwordx3LoadStores()) 1066 return true; 1067 } else { 1068 // If the alignment allows, these should have been widened. 
1069 if (!isPowerOf2_32(NumRegs)) 1070 return true; 1071 } 1072 1073 if (AlignBits < MemSize) { 1074 const SITargetLowering *TLI = ST.getTargetLowering(); 1075 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1076 Align(AlignBits / 8)); 1077 } 1078 1079 return false; 1080 }; 1081 1082 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1083 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1084 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1085 1086 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1087 // LDS 1088 // TODO: Unsupported flat for SI. 1089 1090 for (unsigned Op : {G_LOAD, G_STORE}) { 1091 const bool IsStore = Op == G_STORE; 1092 1093 auto &Actions = getActionDefinitionsBuilder(Op); 1094 // Explicitly list some common cases. 1095 // TODO: Does this help compile time at all? 1096 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1097 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1098 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1099 {S64, GlobalPtr, 64, GlobalAlign32}, 1100 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1101 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1102 {S32, GlobalPtr, 8, GlobalAlign8}, 1103 {S32, GlobalPtr, 16, GlobalAlign16}, 1104 1105 {S32, LocalPtr, 32, 32}, 1106 {S64, LocalPtr, 64, 32}, 1107 {V2S32, LocalPtr, 64, 32}, 1108 {S32, LocalPtr, 8, 8}, 1109 {S32, LocalPtr, 16, 16}, 1110 {V2S16, LocalPtr, 32, 32}, 1111 1112 {S32, PrivatePtr, 32, 32}, 1113 {S32, PrivatePtr, 8, 8}, 1114 {S32, PrivatePtr, 16, 16}, 1115 {V2S16, PrivatePtr, 32, 32}, 1116 1117 {S32, ConstantPtr, 32, GlobalAlign32}, 1118 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1119 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1120 {S64, ConstantPtr, 64, GlobalAlign32}, 1121 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1122 Actions.legalIf( 1123 [=](const LegalityQuery &Query) -> bool { 1124 return isLoadStoreLegal(ST, Query); 1125 }); 1126 1127 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1128 // 64-bits. 1129 // 1130 // TODO: Should generalize bitcast action into coerce, which will also cover 1131 // inserting addrspacecasts. 1132 Actions.customIf(typeIs(1, Constant32Ptr)); 1133 1134 // Turn any illegal element vectors into something easier to deal 1135 // with. These will ultimately produce 32-bit scalar shifts to extract the 1136 // parts anyway. 1137 // 1138 // For odd 16-bit element vectors, prefer to split those into pieces with 1139 // 16-bit vector parts. 1140 Actions.bitcastIf( 1141 [=](const LegalityQuery &Query) -> bool { 1142 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1143 Query.MMODescrs[0].SizeInBits); 1144 }, bitcastToRegisterType(0)); 1145 1146 if (!IsStore) { 1147 // Widen suitably aligned loads by loading extra bytes. The standard 1148 // legalization actions can't properly express widening memory operands. 
1149 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1150 return shouldWidenLoad(ST, Query, G_LOAD); 1151 }); 1152 } 1153 1154 // FIXME: load/store narrowing should be moved to lower action 1155 Actions 1156 .narrowScalarIf( 1157 [=](const LegalityQuery &Query) -> bool { 1158 return !Query.Types[0].isVector() && 1159 needToSplitMemOp(Query, Op == G_LOAD); 1160 }, 1161 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1162 const LLT DstTy = Query.Types[0]; 1163 const LLT PtrTy = Query.Types[1]; 1164 1165 const unsigned DstSize = DstTy.getSizeInBits(); 1166 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1167 1168 // Split extloads. 1169 if (DstSize > MemSize) 1170 return std::make_pair(0, LLT::scalar(MemSize)); 1171 1172 if (!isPowerOf2_32(DstSize)) { 1173 // We're probably decomposing an odd sized store. Try to split 1174 // to the widest type. TODO: Account for alignment. As-is it 1175 // should be OK, since the new parts will be further legalized. 1176 unsigned FloorSize = PowerOf2Floor(DstSize); 1177 return std::make_pair(0, LLT::scalar(FloorSize)); 1178 } 1179 1180 if (DstSize > 32 && (DstSize % 32 != 0)) { 1181 // FIXME: Need a way to specify non-extload of larger size if 1182 // suitably aligned. 1183 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1184 } 1185 1186 unsigned MaxSize = maxSizeForAddrSpace(ST, 1187 PtrTy.getAddressSpace(), 1188 Op == G_LOAD); 1189 if (MemSize > MaxSize) 1190 return std::make_pair(0, LLT::scalar(MaxSize)); 1191 1192 unsigned Align = Query.MMODescrs[0].AlignInBits; 1193 return std::make_pair(0, LLT::scalar(Align)); 1194 }) 1195 .fewerElementsIf( 1196 [=](const LegalityQuery &Query) -> bool { 1197 return Query.Types[0].isVector() && 1198 needToSplitMemOp(Query, Op == G_LOAD); 1199 }, 1200 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1201 const LLT DstTy = Query.Types[0]; 1202 const LLT PtrTy = Query.Types[1]; 1203 1204 LLT EltTy = DstTy.getElementType(); 1205 unsigned MaxSize = maxSizeForAddrSpace(ST, 1206 PtrTy.getAddressSpace(), 1207 Op == G_LOAD); 1208 1209 // FIXME: Handle widened to power of 2 results better. This ends 1210 // up scalarizing. 1211 // FIXME: 3 element stores scalarized on SI 1212 1213 // Split if it's too large for the address space. 1214 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1215 unsigned NumElts = DstTy.getNumElements(); 1216 unsigned EltSize = EltTy.getSizeInBits(); 1217 1218 if (MaxSize % EltSize == 0) { 1219 return std::make_pair( 1220 0, LLT::scalarOrVector( 1221 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1222 } 1223 1224 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1225 1226 // FIXME: Refine when odd breakdowns handled 1227 // The scalars will need to be re-legalized. 1228 if (NumPieces == 1 || NumPieces >= NumElts || 1229 NumElts % NumPieces != 0) 1230 return std::make_pair(0, EltTy); 1231 1232 return std::make_pair( 1233 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1234 } 1235 1236 // FIXME: We could probably handle weird extending loads better. 1237 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1238 if (DstTy.getSizeInBits() > MemSize) 1239 return std::make_pair(0, EltTy); 1240 1241 unsigned EltSize = EltTy.getSizeInBits(); 1242 unsigned DstSize = DstTy.getSizeInBits(); 1243 if (!isPowerOf2_32(DstSize)) { 1244 // We're probably decomposing an odd sized store. Try to split 1245 // to the widest type. TODO: Account for alignment. As-is it 1246 // should be OK, since the new parts will be further legalized. 
1247 unsigned FloorSize = PowerOf2Floor(DstSize); 1248 return std::make_pair( 1249 0, LLT::scalarOrVector( 1250 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1251 } 1252 1253 // Need to split because of alignment. 1254 unsigned Align = Query.MMODescrs[0].AlignInBits; 1255 if (EltSize > Align && 1256 (EltSize / Align < DstTy.getNumElements())) { 1257 return std::make_pair( 1258 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1259 } 1260 1261 // May need relegalization for the scalars. 1262 return std::make_pair(0, EltTy); 1263 }) 1264 .lowerIfMemSizeNotPow2() 1265 .minScalar(0, S32) 1266 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1267 .widenScalarToNextPow2(0) 1268 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1269 .lower(); 1270 } 1271 1272 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1273 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1274 {S32, GlobalPtr, 16, 2 * 8}, 1275 {S32, LocalPtr, 8, 8}, 1276 {S32, LocalPtr, 16, 16}, 1277 {S32, PrivatePtr, 8, 8}, 1278 {S32, PrivatePtr, 16, 16}, 1279 {S32, ConstantPtr, 8, 8}, 1280 {S32, ConstantPtr, 16, 2 * 8}}) 1281 .legalIf( 1282 [=](const LegalityQuery &Query) -> bool { 1283 return isLoadStoreLegal(ST, Query); 1284 }); 1285 1286 if (ST.hasFlatAddressSpace()) { 1287 ExtLoads.legalForTypesWithMemDesc( 1288 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1289 } 1290 1291 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1292 // 64-bits. 1293 // 1294 // TODO: Should generalize bitcast action into coerce, which will also cover 1295 // inserting addrspacecasts. 1296 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1297 1298 ExtLoads.clampScalar(0, S32, S32) 1299 .widenScalarToNextPow2(0) 1300 .unsupportedIfMemSizeNotPow2() 1301 .lower(); 1302 1303 auto &Atomics = getActionDefinitionsBuilder( 1304 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1305 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1306 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1307 G_ATOMICRMW_UMIN}) 1308 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1309 {S64, GlobalPtr}, {S64, LocalPtr}, 1310 {S32, RegionPtr}, {S64, RegionPtr}}); 1311 if (ST.hasFlatAddressSpace()) { 1312 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1313 } 1314 1315 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1316 if (ST.hasLDSFPAtomics()) { 1317 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1318 if (ST.hasGFX90AInsts()) 1319 Atomic.legalFor({{S64, LocalPtr}}); 1320 } 1321 if (ST.hasAtomicFaddInsts()) 1322 Atomic.legalFor({{S32, GlobalPtr}}); 1323 1324 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1325 // demarshalling 1326 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1327 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1328 {S32, FlatPtr}, {S64, FlatPtr}}) 1329 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1330 {S32, RegionPtr}, {S64, RegionPtr}}); 1331 // TODO: Pointer types, any 32-bit or 64-bit vector 1332 1333 // Condition should be s32 for scalar, s1 for vector. 
1334 getActionDefinitionsBuilder(G_SELECT) 1335 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1336 LocalPtr, FlatPtr, PrivatePtr, 1337 LLT::fixed_vector(2, LocalPtr), 1338 LLT::fixed_vector(2, PrivatePtr)}, 1339 {S1, S32}) 1340 .clampScalar(0, S16, S64) 1341 .scalarize(1) 1342 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1343 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1344 .clampMaxNumElements(0, S32, 2) 1345 .clampMaxNumElements(0, LocalPtr, 2) 1346 .clampMaxNumElements(0, PrivatePtr, 2) 1347 .scalarize(0) 1348 .widenScalarToNextPow2(0) 1349 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1350 1351 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1352 // be more flexible with the shift amount type. 1353 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1354 .legalFor({{S32, S32}, {S64, S32}}); 1355 if (ST.has16BitInsts()) { 1356 if (ST.hasVOP3PInsts()) { 1357 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1358 .clampMaxNumElements(0, S16, 2); 1359 } else 1360 Shifts.legalFor({{S16, S16}}); 1361 1362 // TODO: Support 16-bit shift amounts for all types 1363 Shifts.widenScalarIf( 1364 [=](const LegalityQuery &Query) { 1365 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1366 // 32-bit amount. 1367 const LLT ValTy = Query.Types[0]; 1368 const LLT AmountTy = Query.Types[1]; 1369 return ValTy.getSizeInBits() <= 16 && 1370 AmountTy.getSizeInBits() < 16; 1371 }, changeTo(1, S16)); 1372 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1373 Shifts.clampScalar(1, S32, S32); 1374 Shifts.clampScalar(0, S16, S64); 1375 Shifts.widenScalarToNextPow2(0, 16); 1376 1377 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1378 .minScalar(0, S16) 1379 .scalarize(0) 1380 .lower(); 1381 } else { 1382 // Make sure we legalize the shift amount type first, as the general 1383 // expansion for the shifted type will produce much worse code if it hasn't 1384 // been truncated already. 1385 Shifts.clampScalar(1, S32, S32); 1386 Shifts.clampScalar(0, S32, S64); 1387 Shifts.widenScalarToNextPow2(0, 32); 1388 1389 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1390 .minScalar(0, S32) 1391 .scalarize(0) 1392 .lower(); 1393 } 1394 Shifts.scalarize(0); 1395 1396 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1397 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1398 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1399 unsigned IdxTypeIdx = 2; 1400 1401 getActionDefinitionsBuilder(Op) 1402 .customIf([=](const LegalityQuery &Query) { 1403 const LLT EltTy = Query.Types[EltTypeIdx]; 1404 const LLT VecTy = Query.Types[VecTypeIdx]; 1405 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1406 const unsigned EltSize = EltTy.getSizeInBits(); 1407 return (EltSize == 32 || EltSize == 64) && 1408 VecTy.getSizeInBits() % 32 == 0 && 1409 VecTy.getSizeInBits() <= MaxRegisterSize && 1410 IdxTy.getSizeInBits() == 32; 1411 }) 1412 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1413 bitcastToVectorElement32(VecTypeIdx)) 1414 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1415 .bitcastIf( 1416 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1417 [=](const LegalityQuery &Query) { 1418 // For > 64-bit element types, try to turn this into a 64-bit 1419 // element vector since we may be able to do better indexing 1420 // if this is scalar. If not, fall back to 32. 
1421 const LLT EltTy = Query.Types[EltTypeIdx]; 1422 const LLT VecTy = Query.Types[VecTypeIdx]; 1423 const unsigned DstEltSize = EltTy.getSizeInBits(); 1424 const unsigned VecSize = VecTy.getSizeInBits(); 1425 1426 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1427 return std::make_pair( 1428 VecTypeIdx, 1429 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1430 }) 1431 .clampScalar(EltTypeIdx, S32, S64) 1432 .clampScalar(VecTypeIdx, S32, S64) 1433 .clampScalar(IdxTypeIdx, S32, S32) 1434 .clampMaxNumElements(VecTypeIdx, S32, 32) 1435 // TODO: Clamp elements for 64-bit vectors? 1436 // It should only be necessary with variable indexes. 1437 // As a last resort, lower to the stack 1438 .lower(); 1439 } 1440 1441 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1442 .unsupportedIf([=](const LegalityQuery &Query) { 1443 const LLT &EltTy = Query.Types[1].getElementType(); 1444 return Query.Types[0] != EltTy; 1445 }); 1446 1447 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1448 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1449 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1450 1451 // FIXME: Doesn't handle extract of illegal sizes. 1452 getActionDefinitionsBuilder(Op) 1453 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1454 // FIXME: Multiples of 16 should not be legal. 1455 .legalIf([=](const LegalityQuery &Query) { 1456 const LLT BigTy = Query.Types[BigTyIdx]; 1457 const LLT LitTy = Query.Types[LitTyIdx]; 1458 return (BigTy.getSizeInBits() % 32 == 0) && 1459 (LitTy.getSizeInBits() % 16 == 0); 1460 }) 1461 .widenScalarIf( 1462 [=](const LegalityQuery &Query) { 1463 const LLT BigTy = Query.Types[BigTyIdx]; 1464 return (BigTy.getScalarSizeInBits() < 16); 1465 }, 1466 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1467 .widenScalarIf( 1468 [=](const LegalityQuery &Query) { 1469 const LLT LitTy = Query.Types[LitTyIdx]; 1470 return (LitTy.getScalarSizeInBits() < 16); 1471 }, 1472 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1473 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1474 .widenScalarToNextPow2(BigTyIdx, 32); 1475 1476 } 1477 1478 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1479 .legalForCartesianProduct(AllS32Vectors, {S32}) 1480 .legalForCartesianProduct(AllS64Vectors, {S64}) 1481 .clampNumElements(0, V16S32, V32S32) 1482 .clampNumElements(0, V2S64, V16S64) 1483 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1484 1485 if (ST.hasScalarPackInsts()) { 1486 BuildVector 1487 // FIXME: Should probably widen s1 vectors straight to s32 1488 .minScalarOrElt(0, S16) 1489 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1490 .minScalar(1, S32); 1491 1492 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1493 .legalFor({V2S16, S32}) 1494 .lower(); 1495 BuildVector.minScalarOrElt(0, S32); 1496 } else { 1497 BuildVector.customFor({V2S16, S16}); 1498 BuildVector.minScalarOrElt(0, S32); 1499 1500 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1501 .customFor({V2S16, S32}) 1502 .lower(); 1503 } 1504 1505 BuildVector.legalIf(isRegisterType(0)); 1506 1507 // FIXME: Clamp maximum size 1508 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1509 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1510 .clampMaxNumElements(0, S32, 32) 1511 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1512 .clampMaxNumElements(0, S16, 64); 1513 1514 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1515 // pre-legalize. 
1516 if (ST.hasVOP3PInsts()) { 1517 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1518 .customFor({V2S16, V2S16}) 1519 .lower(); 1520 } else 1521 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1522 1523 // Merge/Unmerge 1524 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1525 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1526 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1527 1528 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1529 const LLT Ty = Query.Types[TypeIdx]; 1530 if (Ty.isVector()) { 1531 const LLT &EltTy = Ty.getElementType(); 1532 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1533 return true; 1534 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1535 return true; 1536 } 1537 return false; 1538 }; 1539 1540 auto &Builder = getActionDefinitionsBuilder(Op) 1541 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1542 .lowerFor({{S16, V2S16}}) 1543 .lowerIf([=](const LegalityQuery &Query) { 1544 const LLT BigTy = Query.Types[BigTyIdx]; 1545 return BigTy.getSizeInBits() == 32; 1546 }) 1547 // Try to widen to s16 first for small types. 1548 // TODO: Only do this on targets with legal s16 shifts 1549 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1550 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1551 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1552 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1553 elementTypeIs(1, S16)), 1554 changeTo(1, V2S16)) 1555 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1556 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1557 // valid. 1558 .clampScalar(LitTyIdx, S32, S512) 1559 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1560 // Break up vectors with weird elements into scalars 1561 .fewerElementsIf( 1562 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1563 scalarize(0)) 1564 .fewerElementsIf( 1565 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1566 scalarize(1)) 1567 .clampScalar(BigTyIdx, S32, MaxScalar); 1568 1569 if (Op == G_MERGE_VALUES) { 1570 Builder.widenScalarIf( 1571 // TODO: Use 16-bit shifts if legal for 8-bit values? 1572 [=](const LegalityQuery &Query) { 1573 const LLT Ty = Query.Types[LitTyIdx]; 1574 return Ty.getSizeInBits() < 32; 1575 }, 1576 changeTo(LitTyIdx, S32)); 1577 } 1578 1579 Builder.widenScalarIf( 1580 [=](const LegalityQuery &Query) { 1581 const LLT Ty = Query.Types[BigTyIdx]; 1582 return !isPowerOf2_32(Ty.getSizeInBits()) && 1583 Ty.getSizeInBits() % 16 != 0; 1584 }, 1585 [=](const LegalityQuery &Query) { 1586 // Pick the next power of 2, or a multiple of 64 over 128. 1587 // Whichever is smaller. 1588 const LLT &Ty = Query.Types[BigTyIdx]; 1589 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1590 if (NewSizeInBits >= 256) { 1591 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1592 if (RoundedTo < NewSizeInBits) 1593 NewSizeInBits = RoundedTo; 1594 } 1595 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1596 }) 1597 // Any vectors left are the wrong size. Scalarize them. 1598 .scalarize(0) 1599 .scalarize(1); 1600 } 1601 1602 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1603 // RegBankSelect. 
1604 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1605 .legalFor({{S32}, {S64}}); 1606 1607 if (ST.hasVOP3PInsts()) { 1608 SextInReg.lowerFor({{V2S16}}) 1609 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1610 // get more vector shift opportunities, since we'll get those when 1611 // expanded. 1612 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1613 } else if (ST.has16BitInsts()) { 1614 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1615 } else { 1616 // Prefer to promote to s32 before lowering if we don't have 16-bit 1617 // shifts. This avoid a lot of intermediate truncate and extend operations. 1618 SextInReg.lowerFor({{S32}, {S64}}); 1619 } 1620 1621 SextInReg 1622 .scalarize(0) 1623 .clampScalar(0, S32, S64) 1624 .lower(); 1625 1626 // TODO: Only Try to form v2s16 with legal packed instructions. 1627 getActionDefinitionsBuilder(G_FSHR) 1628 .legalFor({{S32, S32}}) 1629 .lowerFor({{V2S16, V2S16}}) 1630 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1631 .scalarize(0) 1632 .lower(); 1633 1634 if (ST.hasVOP3PInsts()) { 1635 getActionDefinitionsBuilder(G_FSHL) 1636 .lowerFor({{V2S16, V2S16}}) 1637 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1638 .scalarize(0) 1639 .lower(); 1640 } else { 1641 getActionDefinitionsBuilder(G_FSHL) 1642 .scalarize(0) 1643 .lower(); 1644 } 1645 1646 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1647 .legalFor({S64}); 1648 1649 getActionDefinitionsBuilder(G_FENCE) 1650 .alwaysLegal(); 1651 1652 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1653 .scalarize(0) 1654 .minScalar(0, S32) 1655 .lower(); 1656 1657 getActionDefinitionsBuilder({ 1658 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1659 G_FCOPYSIGN, 1660 1661 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1662 G_ATOMICRMW_NAND, 1663 G_ATOMICRMW_FSUB, 1664 G_READ_REGISTER, 1665 G_WRITE_REGISTER, 1666 1667 G_SADDO, G_SSUBO, 1668 1669 // TODO: Implement 1670 G_FMINIMUM, G_FMAXIMUM}).lower(); 1671 1672 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1673 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1674 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1675 .unsupported(); 1676 1677 getLegacyLegalizerInfo().computeTables(); 1678 verify(*ST.getInstrInfo()); 1679 } 1680 1681 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1682 MachineInstr &MI) const { 1683 MachineIRBuilder &B = Helper.MIRBuilder; 1684 MachineRegisterInfo &MRI = *B.getMRI(); 1685 1686 switch (MI.getOpcode()) { 1687 case TargetOpcode::G_ADDRSPACE_CAST: 1688 return legalizeAddrSpaceCast(MI, MRI, B); 1689 case TargetOpcode::G_FRINT: 1690 return legalizeFrint(MI, MRI, B); 1691 case TargetOpcode::G_FCEIL: 1692 return legalizeFceil(MI, MRI, B); 1693 case TargetOpcode::G_FREM: 1694 return legalizeFrem(MI, MRI, B); 1695 case TargetOpcode::G_INTRINSIC_TRUNC: 1696 return legalizeIntrinsicTrunc(MI, MRI, B); 1697 case TargetOpcode::G_SITOFP: 1698 return legalizeITOFP(MI, MRI, B, true); 1699 case TargetOpcode::G_UITOFP: 1700 return legalizeITOFP(MI, MRI, B, false); 1701 case TargetOpcode::G_FPTOSI: 1702 return legalizeFPTOI(MI, MRI, B, true); 1703 case TargetOpcode::G_FPTOUI: 1704 return legalizeFPTOI(MI, MRI, B, false); 1705 case TargetOpcode::G_FMINNUM: 1706 case TargetOpcode::G_FMAXNUM: 1707 case TargetOpcode::G_FMINNUM_IEEE: 1708 case TargetOpcode::G_FMAXNUM_IEEE: 1709 return legalizeMinNumMaxNum(Helper, MI); 1710 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1711 return legalizeExtractVectorElt(MI, MRI, B); 1712 case TargetOpcode::G_INSERT_VECTOR_ELT: 1713 
return legalizeInsertVectorElt(MI, MRI, B); 1714 case TargetOpcode::G_SHUFFLE_VECTOR: 1715 return legalizeShuffleVector(MI, MRI, B); 1716 case TargetOpcode::G_FSIN: 1717 case TargetOpcode::G_FCOS: 1718 return legalizeSinCos(MI, MRI, B); 1719 case TargetOpcode::G_GLOBAL_VALUE: 1720 return legalizeGlobalValue(MI, MRI, B); 1721 case TargetOpcode::G_LOAD: 1722 case TargetOpcode::G_SEXTLOAD: 1723 case TargetOpcode::G_ZEXTLOAD: 1724 return legalizeLoad(Helper, MI); 1725 case TargetOpcode::G_FMAD: 1726 return legalizeFMad(MI, MRI, B); 1727 case TargetOpcode::G_FDIV: 1728 return legalizeFDIV(MI, MRI, B); 1729 case TargetOpcode::G_UDIV: 1730 case TargetOpcode::G_UREM: 1731 case TargetOpcode::G_UDIVREM: 1732 return legalizeUnsignedDIV_REM(MI, MRI, B); 1733 case TargetOpcode::G_SDIV: 1734 case TargetOpcode::G_SREM: 1735 case TargetOpcode::G_SDIVREM: 1736 return legalizeSignedDIV_REM(MI, MRI, B); 1737 case TargetOpcode::G_ATOMIC_CMPXCHG: 1738 return legalizeAtomicCmpXChg(MI, MRI, B); 1739 case TargetOpcode::G_FLOG: 1740 return legalizeFlog(MI, B, numbers::ln2f); 1741 case TargetOpcode::G_FLOG10: 1742 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1743 case TargetOpcode::G_FEXP: 1744 return legalizeFExp(MI, B); 1745 case TargetOpcode::G_FPOW: 1746 return legalizeFPow(MI, B); 1747 case TargetOpcode::G_FFLOOR: 1748 return legalizeFFloor(MI, MRI, B); 1749 case TargetOpcode::G_BUILD_VECTOR: 1750 return legalizeBuildVector(MI, MRI, B); 1751 default: 1752 return false; 1753 } 1754 1755 llvm_unreachable("expected switch to return"); 1756 } 1757 1758 Register AMDGPULegalizerInfo::getSegmentAperture( 1759 unsigned AS, 1760 MachineRegisterInfo &MRI, 1761 MachineIRBuilder &B) const { 1762 MachineFunction &MF = B.getMF(); 1763 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1764 const LLT S32 = LLT::scalar(32); 1765 1766 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1767 1768 if (ST.hasApertureRegs()) { 1769 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1770 // getreg. 1771 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1772 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1773 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1774 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1775 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1776 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1777 unsigned Encoding = 1778 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1779 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1780 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1781 1782 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1783 1784 B.buildInstr(AMDGPU::S_GETREG_B32) 1785 .addDef(GetReg) 1786 .addImm(Encoding); 1787 MRI.setType(GetReg, S32); 1788 1789 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1790 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1791 } 1792 1793 Register QueuePtr = MRI.createGenericVirtualRegister( 1794 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1795 1796 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1797 return Register(); 1798 1799 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1800 // private_segment_aperture_base_hi. 1801 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1802 1803 // TODO: can we be smarter about machine pointer info? 
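  // Roughly, the path below materializes (illustrative sketch, not the exact
  // emitted MIR):
  //   %addr:_(p4)     = G_PTR_ADD %queue_ptr, (0x40 or 0x44)
  //   %aperture:_(s32) = G_LOAD %addr ; dereferenceable, invariant, 4 bytes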
1804 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1805 MachineMemOperand *MMO = MF.getMachineMemOperand( 1806 PtrInfo, 1807 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1808 MachineMemOperand::MOInvariant, 1809 4, commonAlignment(Align(64), StructOffset)); 1810 1811 Register LoadAddr; 1812 1813 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1814 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1815 } 1816 1817 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1818 MachineInstr &MI, MachineRegisterInfo &MRI, 1819 MachineIRBuilder &B) const { 1820 MachineFunction &MF = B.getMF(); 1821 1822 const LLT S32 = LLT::scalar(32); 1823 Register Dst = MI.getOperand(0).getReg(); 1824 Register Src = MI.getOperand(1).getReg(); 1825 1826 LLT DstTy = MRI.getType(Dst); 1827 LLT SrcTy = MRI.getType(Src); 1828 unsigned DestAS = DstTy.getAddressSpace(); 1829 unsigned SrcAS = SrcTy.getAddressSpace(); 1830 1831 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1832 // vector element. 1833 assert(!DstTy.isVector()); 1834 1835 const AMDGPUTargetMachine &TM 1836 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1837 1838 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1839 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1840 return true; 1841 } 1842 1843 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1844 // Truncate. 1845 B.buildExtract(Dst, Src, 0); 1846 MI.eraseFromParent(); 1847 return true; 1848 } 1849 1850 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1851 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1852 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1853 1854 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1855 // another. Merge operands are required to be the same type, but creating an 1856 // extra ptrtoint would be kind of pointless. 1857 auto HighAddr = B.buildConstant( 1858 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1859 B.buildMerge(Dst, {Src, HighAddr}); 1860 MI.eraseFromParent(); 1861 return true; 1862 } 1863 1864 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1865 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1866 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1867 unsigned NullVal = TM.getNullPointerValue(DestAS); 1868 1869 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1870 auto FlatNull = B.buildConstant(SrcTy, 0); 1871 1872 // Extract low 32-bits of the pointer. 1873 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1874 1875 auto CmpRes = 1876 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1877 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1878 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1884 return false; 1885 1886 if (!ST.hasFlatAddressSpace()) 1887 return false; 1888 1889 auto SegmentNull = 1890 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1891 auto FlatNull = 1892 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1893 1894 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1895 if (!ApertureReg.isValid()) 1896 return false; 1897 1898 auto CmpRes = 1899 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1900 1901 // Coerce the type of the low half of the result so we can use merge_values. 
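  // Shape of what gets built here (illustrative, local/private -> flat):
  //   %lo:_(s32)  = G_PTRTOINT %src
  //   %full:_(p0) = G_MERGE_VALUES %lo(s32), %aperture(s32)
  //   %dst:_(p0)  = G_SELECT (%src != segment null), %full, flat null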
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
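  // Worked example (illustrative): ceil(2.3) gives trunc = 2.0, src > 0 and
  // src != trunc, so Add = 1.0 and the result is 3.0. ceil(-2.3) gives
  // trunc = -2.0, the compare fails, Add = 0.0 and the result stays -2.0.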
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2061 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2062 B.buildUITOFP(S64, Unmerge.getReg(1)); 2063 2064 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2065 2066 auto ThirtyTwo = B.buildConstant(S32, 32); 2067 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2068 .addUse(CvtHi.getReg(0)) 2069 .addUse(ThirtyTwo.getReg(0)); 2070 2071 // TODO: Should this propagate fast-math-flags? 2072 B.buildFAdd(Dst, LdExp, CvtLo); 2073 MI.eraseFromParent(); 2074 return true; 2075 } 2076 2077 // TODO: Copied from DAG implementation. Verify logic and document how this 2078 // actually works. 2079 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2080 MachineRegisterInfo &MRI, 2081 MachineIRBuilder &B, 2082 bool Signed) const { 2083 2084 Register Dst = MI.getOperand(0).getReg(); 2085 Register Src = MI.getOperand(1).getReg(); 2086 2087 const LLT S64 = LLT::scalar(64); 2088 const LLT S32 = LLT::scalar(32); 2089 2090 const LLT SrcLT = MRI.getType(Src); 2091 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2092 2093 unsigned Flags = MI.getFlags(); 2094 2095 // The basic idea of converting a floating point number into a pair of 32-bit 2096 // integers is illustrated as follows: 2097 // 2098 // tf := trunc(val); 2099 // hif := floor(tf * 2^-32); 2100 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2101 // hi := fptoi(hif); 2102 // lo := fptoi(lof); 2103 // 2104 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2105 MachineInstrBuilder Sign; 2106 if (Signed && SrcLT == S32) { 2107 // However, a 32-bit floating point number has only 23 bits mantissa and 2108 // it's not enough to hold all the significant bits of `lof` if val is 2109 // negative. To avoid the loss of precision, We need to take the absolute 2110 // value after truncating and flip the result back based on the original 2111 // signedness. 2112 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2113 Trunc = B.buildFAbs(S32, Trunc, Flags); 2114 } 2115 MachineInstrBuilder K0, K1; 2116 if (SrcLT == S64) { 2117 K0 = B.buildFConstant(S64, 2118 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2119 K1 = B.buildFConstant(S64, 2120 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2121 } else { 2122 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2123 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2124 } 2125 2126 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2127 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2128 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2129 2130 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2131 : B.buildFPTOUI(S32, FloorMul); 2132 auto Lo = B.buildFPTOUI(S32, Fma); 2133 2134 if (Signed && SrcLT == S32) { 2135 // Flip the result based on the signedness, which is either all 0s or 1s. 
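    // With Sign being either 0 or all ones, (x ^ Sign) - Sign is x when
    // Sign == 0 and the two's complement negation ~x + 1 when Sign == -1.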
2136 Sign = B.buildMerge(S64, {Sign, Sign}); 2137 // r := xor({lo, hi}, sign) - sign; 2138 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2139 } else 2140 B.buildMerge(Dst, {Lo, Hi}); 2141 MI.eraseFromParent(); 2142 2143 return true; 2144 } 2145 2146 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2147 MachineInstr &MI) const { 2148 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2149 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2150 2151 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2152 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2153 2154 // With ieee_mode disabled, the instructions have the correct behavior 2155 // already for G_FMINNUM/G_FMAXNUM 2156 if (!MFI->getMode().IEEE) 2157 return !IsIEEEOp; 2158 2159 if (IsIEEEOp) 2160 return true; 2161 2162 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2163 } 2164 2165 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2166 MachineInstr &MI, MachineRegisterInfo &MRI, 2167 MachineIRBuilder &B) const { 2168 // TODO: Should move some of this into LegalizerHelper. 2169 2170 // TODO: Promote dynamic indexing of s16 to s32 2171 2172 // FIXME: Artifact combiner probably should have replaced the truncated 2173 // constant before this, so we shouldn't need 2174 // getConstantVRegValWithLookThrough. 2175 Optional<ValueAndVReg> MaybeIdxVal = 2176 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2177 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2178 return true; 2179 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2180 2181 Register Dst = MI.getOperand(0).getReg(); 2182 Register Vec = MI.getOperand(1).getReg(); 2183 2184 LLT VecTy = MRI.getType(Vec); 2185 LLT EltTy = VecTy.getElementType(); 2186 assert(EltTy == MRI.getType(Dst)); 2187 2188 if (IdxVal < VecTy.getNumElements()) 2189 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2190 else 2191 B.buildUndef(Dst); 2192 2193 MI.eraseFromParent(); 2194 return true; 2195 } 2196 2197 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2198 MachineInstr &MI, MachineRegisterInfo &MRI, 2199 MachineIRBuilder &B) const { 2200 // TODO: Should move some of this into LegalizerHelper. 2201 2202 // TODO: Promote dynamic indexing of s16 to s32 2203 2204 // FIXME: Artifact combiner probably should have replaced the truncated 2205 // constant before this, so we shouldn't need 2206 // getConstantVRegValWithLookThrough. 2207 Optional<ValueAndVReg> MaybeIdxVal = 2208 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2209 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2210 return true; 2211 2212 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2213 Register Dst = MI.getOperand(0).getReg(); 2214 Register Vec = MI.getOperand(1).getReg(); 2215 Register Ins = MI.getOperand(2).getReg(); 2216 2217 LLT VecTy = MRI.getType(Vec); 2218 LLT EltTy = VecTy.getElementType(); 2219 assert(EltTy == MRI.getType(Ins)); 2220 2221 if (IdxVal < VecTy.getNumElements()) 2222 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2223 else 2224 B.buildUndef(Dst); 2225 2226 MI.eraseFromParent(); 2227 return true; 2228 } 2229 2230 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2231 MachineInstr &MI, MachineRegisterInfo &MRI, 2232 MachineIRBuilder &B) const { 2233 const LLT V2S16 = LLT::fixed_vector(2, 16); 2234 2235 Register Dst = MI.getOperand(0).getReg(); 2236 Register Src0 = MI.getOperand(1).getReg(); 2237 LLT DstTy = MRI.getType(Dst); 2238 LLT SrcTy = MRI.getType(Src0); 2239 2240 if (SrcTy == V2S16 && DstTy == V2S16 && 2241 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2242 return true; 2243 2244 MachineIRBuilder HelperBuilder(MI); 2245 GISelObserverWrapper DummyObserver; 2246 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2247 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2248 } 2249 2250 bool AMDGPULegalizerInfo::legalizeSinCos( 2251 MachineInstr &MI, MachineRegisterInfo &MRI, 2252 MachineIRBuilder &B) const { 2253 2254 Register DstReg = MI.getOperand(0).getReg(); 2255 Register SrcReg = MI.getOperand(1).getReg(); 2256 LLT Ty = MRI.getType(DstReg); 2257 unsigned Flags = MI.getFlags(); 2258 2259 Register TrigVal; 2260 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2261 if (ST.hasTrigReducedRange()) { 2262 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2263 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2264 .addUse(MulVal.getReg(0)) 2265 .setMIFlags(Flags).getReg(0); 2266 } else 2267 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2268 2269 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2270 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2271 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2272 .addUse(TrigVal) 2273 .setMIFlags(Flags); 2274 MI.eraseFromParent(); 2275 return true; 2276 } 2277 2278 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2279 MachineIRBuilder &B, 2280 const GlobalValue *GV, 2281 int64_t Offset, 2282 unsigned GAFlags) const { 2283 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2284 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2285 // to the following code sequence: 2286 // 2287 // For constant address space: 2288 // s_getpc_b64 s[0:1] 2289 // s_add_u32 s0, s0, $symbol 2290 // s_addc_u32 s1, s1, 0 2291 // 2292 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2293 // a fixup or relocation is emitted to replace $symbol with a literal 2294 // constant, which is a pc-relative offset from the encoding of the $symbol 2295 // operand to the global variable. 
2296 // 2297 // For global address space: 2298 // s_getpc_b64 s[0:1] 2299 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2300 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2301 // 2302 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2303 // fixups or relocations are emitted to replace $symbol@*@lo and 2304 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2305 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2306 // operand to the global variable. 2307 // 2308 // What we want here is an offset from the value returned by s_getpc 2309 // (which is the address of the s_add_u32 instruction) to the global 2310 // variable, but since the encoding of $symbol starts 4 bytes after the start 2311 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2312 // small. This requires us to add 4 to the global variable offset in order to 2313 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2314 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2315 // instruction. 2316 2317 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2318 2319 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2320 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2321 2322 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2323 .addDef(PCReg); 2324 2325 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2326 if (GAFlags == SIInstrInfo::MO_NONE) 2327 MIB.addImm(0); 2328 else 2329 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2330 2331 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2332 2333 if (PtrTy.getSizeInBits() == 32) 2334 B.buildExtract(DstReg, PCReg, 0); 2335 return true; 2336 } 2337 2338 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2339 MachineInstr &MI, MachineRegisterInfo &MRI, 2340 MachineIRBuilder &B) const { 2341 Register DstReg = MI.getOperand(0).getReg(); 2342 LLT Ty = MRI.getType(DstReg); 2343 unsigned AS = Ty.getAddressSpace(); 2344 2345 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2346 MachineFunction &MF = B.getMF(); 2347 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2348 2349 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2350 if (!MFI->isModuleEntryFunction() && 2351 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2352 const Function &Fn = MF.getFunction(); 2353 DiagnosticInfoUnsupported BadLDSDecl( 2354 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2355 DS_Warning); 2356 Fn.getContext().diagnose(BadLDSDecl); 2357 2358 // We currently don't have a way to correctly allocate LDS objects that 2359 // aren't directly associated with a kernel. We do force inlining of 2360 // functions that use local objects. However, if these dead functions are 2361 // not eliminated, we don't want a compile time error. Just emit a warning 2362 // and a trap, since there should be no callable path here. 2363 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2364 B.buildUndef(DstReg); 2365 MI.eraseFromParent(); 2366 return true; 2367 } 2368 2369 // TODO: We could emit code to handle the initialization somewhere. 
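    // For LDS globals, the "address" materialized below is just the byte
    // offset assigned by allocateLDSGlobal() within the kernel's statically
    // allocated LDS block; dynamically sized (extern __shared__) arrays
    // instead start at the end of the static allocation, which is the value
    // llvm.amdgcn.groupstaticsize returns.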
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. Such arrays are allocated
        // by the runtime and placed directly after the statically allocated
        // ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
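    // The GOT entry holds the variable's full 64-bit address: the PC-relative
    // sequence above computes the GOT slot's address and the load below reads
    // the real address out of it (an invariant, dereferenceable 8-byte load).
    // For 32-bit pointer types, only the low half of that value is kept.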
2437 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2438 B.buildExtract(DstReg, Load, 0); 2439 } else 2440 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2441 2442 MI.eraseFromParent(); 2443 return true; 2444 } 2445 2446 static LLT widenToNextPowerOf2(LLT Ty) { 2447 if (Ty.isVector()) 2448 return Ty.changeElementCount( 2449 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2450 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2451 } 2452 2453 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2454 MachineInstr &MI) const { 2455 MachineIRBuilder &B = Helper.MIRBuilder; 2456 MachineRegisterInfo &MRI = *B.getMRI(); 2457 GISelChangeObserver &Observer = Helper.Observer; 2458 2459 Register PtrReg = MI.getOperand(1).getReg(); 2460 LLT PtrTy = MRI.getType(PtrReg); 2461 unsigned AddrSpace = PtrTy.getAddressSpace(); 2462 2463 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2464 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2465 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2466 Observer.changingInstr(MI); 2467 MI.getOperand(1).setReg(Cast.getReg(0)); 2468 Observer.changedInstr(MI); 2469 return true; 2470 } 2471 2472 if (MI.getOpcode() != AMDGPU::G_LOAD) 2473 return false; 2474 2475 Register ValReg = MI.getOperand(0).getReg(); 2476 LLT ValTy = MRI.getType(ValReg); 2477 2478 MachineMemOperand *MMO = *MI.memoperands_begin(); 2479 const unsigned ValSize = ValTy.getSizeInBits(); 2480 const unsigned MemSize = 8 * MMO->getSize(); 2481 const Align MemAlign = MMO->getAlign(); 2482 const unsigned AlignInBits = 8 * MemAlign.value(); 2483 2484 // Widen non-power-of-2 loads to the alignment if needed 2485 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2486 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2487 2488 // This was already the correct extending load result type, so just adjust 2489 // the memory type. 2490 if (WideMemSize == ValSize) { 2491 MachineFunction &MF = B.getMF(); 2492 2493 MachineMemOperand *WideMMO = 2494 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2495 Observer.changingInstr(MI); 2496 MI.setMemRefs(MF, {WideMMO}); 2497 Observer.changedInstr(MI); 2498 return true; 2499 } 2500 2501 // Don't bother handling edge case that should probably never be produced. 2502 if (ValSize > WideMemSize) 2503 return false; 2504 2505 LLT WideTy = widenToNextPowerOf2(ValTy); 2506 2507 Register WideLoad; 2508 if (!WideTy.isVector()) { 2509 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2510 B.buildTrunc(ValReg, WideLoad).getReg(0); 2511 } else { 2512 // Extract the subvector. 2513 2514 if (isRegisterType(ValTy)) { 2515 // If this a case where G_EXTRACT is legal, use it. 2516 // (e.g. <3 x s32> -> <4 x s32>) 2517 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2518 B.buildExtract(ValReg, WideLoad, 0); 2519 } else { 2520 // For cases where the widened type isn't a nice register value, unmerge 2521 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2522 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2523 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2524 B.setInsertPt(B.getMBB(), MI.getIterator()); 2525 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2526 } 2527 } 2528 2529 MI.eraseFromParent(); 2530 return true; 2531 } 2532 2533 return false; 2534 } 2535 2536 bool AMDGPULegalizerInfo::legalizeFMad( 2537 MachineInstr &MI, MachineRegisterInfo &MRI, 2538 MachineIRBuilder &B) const { 2539 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2540 assert(Ty.isScalar()); 2541 2542 MachineFunction &MF = B.getMF(); 2543 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2544 2545 // TODO: Always legal with future ftz flag. 2546 // FIXME: Do we need just output? 2547 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2548 return true; 2549 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2550 return true; 2551 2552 MachineIRBuilder HelperBuilder(MI); 2553 GISelObserverWrapper DummyObserver; 2554 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2555 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2556 } 2557 2558 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2559 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2560 Register DstReg = MI.getOperand(0).getReg(); 2561 Register PtrReg = MI.getOperand(1).getReg(); 2562 Register CmpVal = MI.getOperand(2).getReg(); 2563 Register NewVal = MI.getOperand(3).getReg(); 2564 2565 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2566 "this should not have been custom lowered"); 2567 2568 LLT ValTy = MRI.getType(CmpVal); 2569 LLT VecTy = LLT::fixed_vector(2, ValTy); 2570 2571 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2572 2573 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2574 .addDef(DstReg) 2575 .addUse(PtrReg) 2576 .addUse(PackedVal) 2577 .setMemRefs(MI.memoperands()); 2578 2579 MI.eraseFromParent(); 2580 return true; 2581 } 2582 2583 bool AMDGPULegalizerInfo::legalizeFlog( 2584 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2585 Register Dst = MI.getOperand(0).getReg(); 2586 Register Src = MI.getOperand(1).getReg(); 2587 LLT Ty = B.getMRI()->getType(Dst); 2588 unsigned Flags = MI.getFlags(); 2589 2590 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2591 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2592 2593 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2594 MI.eraseFromParent(); 2595 return true; 2596 } 2597 2598 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2599 MachineIRBuilder &B) const { 2600 Register Dst = MI.getOperand(0).getReg(); 2601 Register Src = MI.getOperand(1).getReg(); 2602 unsigned Flags = MI.getFlags(); 2603 LLT Ty = B.getMRI()->getType(Dst); 2604 2605 auto K = B.buildFConstant(Ty, numbers::log2e); 2606 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2607 B.buildFExp2(Dst, Mul, Flags); 2608 MI.eraseFromParent(); 2609 return true; 2610 } 2611 2612 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2613 MachineIRBuilder &B) const { 2614 Register Dst = MI.getOperand(0).getReg(); 2615 Register Src0 = MI.getOperand(1).getReg(); 2616 Register Src1 = MI.getOperand(2).getReg(); 2617 unsigned Flags = MI.getFlags(); 2618 LLT Ty = B.getMRI()->getType(Dst); 2619 const LLT S16 = LLT::scalar(16); 2620 const LLT S32 = LLT::scalar(32); 2621 2622 if (Ty == S32) { 2623 auto Log = B.buildFLog2(S32, Src0, Flags); 2624 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2625 .addUse(Log.getReg(0)) 2626 .addUse(Src1) 2627 .setMIFlags(Flags); 2628 B.buildFExp2(Dst, Mul, Flags); 2629 } else if (Ty == S16) { 2630 // There's no f16 fmul_legacy, so we need to convert for it. 2631 auto Log = B.buildFLog2(S16, Src0, Flags); 2632 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2633 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2634 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2635 .addUse(Ext0.getReg(0)) 2636 .addUse(Ext1.getReg(0)) 2637 .setMIFlags(Flags); 2638 2639 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2640 } else 2641 return false; 2642 2643 MI.eraseFromParent(); 2644 return true; 2645 } 2646 2647 // Find a source register, ignoring any possible source modifiers. 2648 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2649 Register ModSrc = OrigSrc; 2650 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2651 ModSrc = SrcFNeg->getOperand(1).getReg(); 2652 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2653 ModSrc = SrcFAbs->getOperand(1).getReg(); 2654 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2655 ModSrc = SrcFAbs->getOperand(1).getReg(); 2656 return ModSrc; 2657 } 2658 2659 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2660 MachineRegisterInfo &MRI, 2661 MachineIRBuilder &B) const { 2662 2663 const LLT S1 = LLT::scalar(1); 2664 const LLT S64 = LLT::scalar(64); 2665 Register Dst = MI.getOperand(0).getReg(); 2666 Register OrigSrc = MI.getOperand(1).getReg(); 2667 unsigned Flags = MI.getFlags(); 2668 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2669 "this should not have been custom lowered"); 2670 2671 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2672 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2673 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2674 // V_FRACT bug is: 2675 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2676 // 2677 // Convert floor(x) to (x - fract(x)) 2678 2679 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2680 .addUse(OrigSrc) 2681 .setMIFlags(Flags); 2682 2683 // Give source modifier matching some assistance before obscuring a foldable 2684 // pattern. 2685 2686 // TODO: We can avoid the neg on the fract? The input sign to fract 2687 // shouldn't matter? 2688 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2689 2690 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2691 2692 Register Min = MRI.createGenericVirtualRegister(S64); 2693 2694 // We don't need to concern ourselves with the snan handling difference, so 2695 // use the one which will directly select. 
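  // (The two opcodes differ only in how a signaling NaN input is treated;
  // either is acceptable here because the explicit isnan select below already
  // returns the original source for NaN inputs when nnan is not set.)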
2696 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2697 if (MFI->getMode().IEEE) 2698 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2699 else 2700 B.buildFMinNum(Min, Fract, Const, Flags); 2701 2702 Register CorrectedFract = Min; 2703 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2704 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2705 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2706 } 2707 2708 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2709 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2710 2711 MI.eraseFromParent(); 2712 return true; 2713 } 2714 2715 // Turn an illegal packed v2s16 build vector into bit operations. 2716 // TODO: This should probably be a bitcast action in LegalizerHelper. 2717 bool AMDGPULegalizerInfo::legalizeBuildVector( 2718 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2719 Register Dst = MI.getOperand(0).getReg(); 2720 const LLT S32 = LLT::scalar(32); 2721 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2722 2723 Register Src0 = MI.getOperand(1).getReg(); 2724 Register Src1 = MI.getOperand(2).getReg(); 2725 assert(MRI.getType(Src0) == LLT::scalar(16)); 2726 2727 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2728 B.buildBitcast(Dst, Merge); 2729 2730 MI.eraseFromParent(); 2731 return true; 2732 } 2733 2734 // Check that this is a G_XOR x, -1 2735 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2736 if (MI.getOpcode() != TargetOpcode::G_XOR) 2737 return false; 2738 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2739 return ConstVal && *ConstVal == -1; 2740 } 2741 2742 // Return the use branch instruction, otherwise null if the usage is invalid. 2743 static MachineInstr * 2744 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2745 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2746 Register CondDef = MI.getOperand(0).getReg(); 2747 if (!MRI.hasOneNonDBGUse(CondDef)) 2748 return nullptr; 2749 2750 MachineBasicBlock *Parent = MI.getParent(); 2751 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2752 2753 if (isNot(MRI, *UseMI)) { 2754 Register NegatedCond = UseMI->getOperand(0).getReg(); 2755 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2756 return nullptr; 2757 2758 // We're deleting the def of this value, so we need to remove it. 2759 UseMI->eraseFromParent(); 2760 2761 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2762 Negated = true; 2763 } 2764 2765 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2766 return nullptr; 2767 2768 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2769 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2770 if (Next == Parent->end()) { 2771 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2772 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2773 return nullptr; 2774 UncondBrTarget = &*NextMBB; 2775 } else { 2776 if (Next->getOpcode() != AMDGPU::G_BR) 2777 return nullptr; 2778 Br = &*Next; 2779 UncondBrTarget = Br->getOperand(0).getMBB(); 2780 } 2781 2782 return UseMI; 2783 } 2784 2785 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2786 const ArgDescriptor *Arg, 2787 const TargetRegisterClass *ArgRC, 2788 LLT ArgTy) const { 2789 MCRegister SrcReg = Arg->getRegister(); 2790 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2791 assert(DstReg.isVirtual() && "Virtual register expected"); 2792 2793 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2794 ArgTy); 2795 if (Arg->isMasked()) { 2796 // TODO: Should we try to emit this once in the entry block? 2797 const LLT S32 = LLT::scalar(32); 2798 const unsigned Mask = Arg->getMask(); 2799 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2800 2801 Register AndMaskSrc = LiveIn; 2802 2803 if (Shift != 0) { 2804 auto ShiftAmt = B.buildConstant(S32, Shift); 2805 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2806 } 2807 2808 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2809 } else { 2810 B.buildCopy(DstReg, LiveIn); 2811 } 2812 2813 return true; 2814 } 2815 2816 bool AMDGPULegalizerInfo::loadInputValue( 2817 Register DstReg, MachineIRBuilder &B, 2818 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2819 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2820 const ArgDescriptor *Arg; 2821 const TargetRegisterClass *ArgRC; 2822 LLT ArgTy; 2823 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2824 2825 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2826 return false; // TODO: Handle these 2827 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2828 } 2829 2830 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2831 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2832 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2833 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2834 return false; 2835 2836 MI.eraseFromParent(); 2837 return true; 2838 } 2839 2840 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2841 MachineRegisterInfo &MRI, 2842 MachineIRBuilder &B) const { 2843 Register Dst = MI.getOperand(0).getReg(); 2844 LLT DstTy = MRI.getType(Dst); 2845 LLT S16 = LLT::scalar(16); 2846 LLT S32 = LLT::scalar(32); 2847 LLT S64 = LLT::scalar(64); 2848 2849 if (DstTy == S16) 2850 return legalizeFDIV16(MI, MRI, B); 2851 if (DstTy == S32) 2852 return legalizeFDIV32(MI, MRI, B); 2853 if (DstTy == S64) 2854 return legalizeFDIV64(MI, MRI, B); 2855 2856 return false; 2857 } 2858 2859 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2860 Register DstDivReg, 2861 Register DstRemReg, 2862 Register X, 2863 Register Y) const { 2864 const LLT S1 = LLT::scalar(1); 2865 const LLT S32 = LLT::scalar(32); 2866 2867 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2868 // algorithm used here. 2869 2870 // Initial estimate of inv(y). 2871 auto FloatY = B.buildUITOFP(S32, Y); 2872 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2873 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2874 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2875 auto Z = B.buildFPTOUI(S32, ScaledY); 2876 2877 // One round of UNR. 
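  // UNR = unsigned Newton-Raphson: with e = -y*z (mod 2^32) as the error
  // term, the estimate z of 2^32/y is refined as z += umulh(z, e).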
2878 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2879 auto NegYZ = B.buildMul(S32, NegY, Z); 2880 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2881 2882 // Quotient/remainder estimate. 2883 auto Q = B.buildUMulH(S32, X, Z); 2884 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2885 2886 // First quotient/remainder refinement. 2887 auto One = B.buildConstant(S32, 1); 2888 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2889 if (DstDivReg) 2890 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2891 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2892 2893 // Second quotient/remainder refinement. 2894 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2895 if (DstDivReg) 2896 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2897 2898 if (DstRemReg) 2899 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2900 } 2901 2902 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2903 // 2904 // Return lo, hi of result 2905 // 2906 // %cvt.lo = G_UITOFP Val.lo 2907 // %cvt.hi = G_UITOFP Val.hi 2908 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2909 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2910 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2911 // %mul2 = G_FMUL %mul1, 2**(-32) 2912 // %trunc = G_INTRINSIC_TRUNC %mul2 2913 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2914 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2915 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2916 Register Val) { 2917 const LLT S32 = LLT::scalar(32); 2918 auto Unmerge = B.buildUnmerge(S32, Val); 2919 2920 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2921 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2922 2923 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2924 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2925 2926 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2927 auto Mul1 = 2928 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2929 2930 // 2**(-32) 2931 auto Mul2 = 2932 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2933 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2934 2935 // -(2**32) 2936 auto Mad2 = B.buildFMAD(S32, Trunc, 2937 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2938 2939 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2940 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2941 2942 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2943 } 2944 2945 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2946 Register DstDivReg, 2947 Register DstRemReg, 2948 Register Numer, 2949 Register Denom) const { 2950 const LLT S32 = LLT::scalar(32); 2951 const LLT S64 = LLT::scalar(64); 2952 const LLT S1 = LLT::scalar(1); 2953 Register RcpLo, RcpHi; 2954 2955 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2956 2957 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2958 2959 auto Zero64 = B.buildConstant(S64, 0); 2960 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2961 2962 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2963 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2964 2965 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2966 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2967 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2968 2969 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2970 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2971 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2972 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2973 2974 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2975 
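  // Second refinement of the 64-bit reciprocal estimate, continuing from
  // MulLo2 above and mirroring the first round.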
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
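  // (Both correction paths are computed unconditionally; C3 and C6 are
  // zero/all-ones masks from the sign-extended compares, and the final
  // selects stand in for the PHIs an if/endif structure would require.)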
3019 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3020 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3021 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3022 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3023 3024 auto One64 = B.buildConstant(S64, 1); 3025 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3026 3027 auto C4 = 3028 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3029 auto C5 = 3030 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3031 auto C6 = B.buildSelect( 3032 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3033 3034 // if (C6 != 0) 3035 auto Add4 = B.buildAdd(S64, Add3, One64); 3036 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3037 3038 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3039 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3040 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3041 3042 // endif C6 3043 // endif C3 3044 3045 if (DstDivReg) { 3046 auto Sel1 = B.buildSelect( 3047 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3048 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3049 Sel1, MulHi3); 3050 } 3051 3052 if (DstRemReg) { 3053 auto Sel2 = B.buildSelect( 3054 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3055 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3056 Sel2, Sub1); 3057 } 3058 } 3059 3060 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3061 MachineRegisterInfo &MRI, 3062 MachineIRBuilder &B) const { 3063 Register DstDivReg, DstRemReg; 3064 switch (MI.getOpcode()) { 3065 default: 3066 llvm_unreachable("Unexpected opcode!"); 3067 case AMDGPU::G_UDIV: { 3068 DstDivReg = MI.getOperand(0).getReg(); 3069 break; 3070 } 3071 case AMDGPU::G_UREM: { 3072 DstRemReg = MI.getOperand(0).getReg(); 3073 break; 3074 } 3075 case AMDGPU::G_UDIVREM: { 3076 DstDivReg = MI.getOperand(0).getReg(); 3077 DstRemReg = MI.getOperand(1).getReg(); 3078 break; 3079 } 3080 } 3081 3082 const LLT S64 = LLT::scalar(64); 3083 const LLT S32 = LLT::scalar(32); 3084 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3085 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3086 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3087 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3088 3089 if (Ty == S32) 3090 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3091 else if (Ty == S64) 3092 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3093 else 3094 return false; 3095 3096 MI.eraseFromParent(); 3097 return true; 3098 } 3099 3100 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3101 MachineRegisterInfo &MRI, 3102 MachineIRBuilder &B) const { 3103 const LLT S64 = LLT::scalar(64); 3104 const LLT S32 = LLT::scalar(32); 3105 3106 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3107 if (Ty != S32 && Ty != S64) 3108 return false; 3109 3110 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3111 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3112 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3113 3114 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3115 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3116 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3117 3118 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3119 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3120 3121 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3122 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3123 3124 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3125 switch (MI.getOpcode()) { 3126 default: 3127 llvm_unreachable("Unexpected opcode!"); 3128 case AMDGPU::G_SDIV: { 3129 DstDivReg = MI.getOperand(0).getReg(); 3130 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3131 break; 3132 } 3133 case AMDGPU::G_SREM: { 3134 DstRemReg = MI.getOperand(0).getReg(); 3135 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3136 break; 3137 } 3138 case AMDGPU::G_SDIVREM: { 3139 DstDivReg = MI.getOperand(0).getReg(); 3140 DstRemReg = MI.getOperand(1).getReg(); 3141 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3142 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3143 break; 3144 } 3145 } 3146 3147 if (Ty == S32) 3148 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3149 else 3150 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3151 3152 if (DstDivReg) { 3153 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3154 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3155 B.buildSub(DstDivReg, SignXor, Sign); 3156 } 3157 3158 if (DstRemReg) { 3159 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3160 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3161 B.buildSub(DstRemReg, SignXor, Sign); 3162 } 3163 3164 MI.eraseFromParent(); 3165 return true; 3166 } 3167 3168 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3169 MachineRegisterInfo &MRI, 3170 MachineIRBuilder &B) const { 3171 Register Res = MI.getOperand(0).getReg(); 3172 Register LHS = MI.getOperand(1).getReg(); 3173 Register RHS = MI.getOperand(2).getReg(); 3174 uint16_t Flags = MI.getFlags(); 3175 LLT ResTy = MRI.getType(Res); 3176 3177 const MachineFunction &MF = B.getMF(); 3178 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3179 MI.getFlag(MachineInstr::FmAfn); 3180 3181 if (!AllowInaccurateRcp) 3182 return false; 3183 3184 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3185 // 1 / x -> RCP(x) 3186 if (CLHS->isExactlyValue(1.0)) { 3187 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3188 .addUse(RHS) 3189 .setMIFlags(Flags); 3190 3191 MI.eraseFromParent(); 3192 return true; 3193 } 3194 3195 // -1 / x -> RCP( FNEG(x) ) 3196 if (CLHS->isExactlyValue(-1.0)) { 3197 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3198 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3199 .addUse(FNeg.getReg(0)) 3200 .setMIFlags(Flags); 3201 3202 MI.eraseFromParent(); 3203 return true; 3204 } 3205 } 3206 3207 // x / y -> x * (1.0 / y) 3208 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3209 .addUse(RHS) 3210 .setMIFlags(Flags); 3211 B.buildFMul(Res, LHS, RCP, Flags); 3212 3213 MI.eraseFromParent(); 3214 return true; 3215 } 3216 3217 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3218 MachineRegisterInfo &MRI, 3219 MachineIRBuilder &B) const { 3220 Register Res = MI.getOperand(0).getReg(); 3221 Register X = MI.getOperand(1).getReg(); 3222 Register Y = MI.getOperand(2).getReg(); 3223 uint16_t Flags = MI.getFlags(); 3224 LLT ResTy = MRI.getType(Res); 3225 3226 const MachineFunction &MF = B.getMF(); 3227 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3228 MI.getFlag(MachineInstr::FmAfn); 3229 3230 if (!AllowInaccurateRcp) 3231 return false; 3232 3233 auto NegY = B.buildFNeg(ResTy, Y); 3234 auto One = B.buildFConstant(ResTy, 1.0); 3235 3236 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3237 .addUse(Y) 3238 .setMIFlags(Flags); 3239 3240 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3241 R = B.buildFMA(ResTy, Tmp0, R, R); 3242 3243 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3244 R = B.buildFMA(ResTy, Tmp1, R, R); 3245 3246 auto Ret = B.buildFMul(ResTy, X, R); 3247 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3248 3249 B.buildFMA(Res, Tmp2, R, Ret); 3250 MI.eraseFromParent(); 3251 return true; 3252 } 3253 3254 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3255 MachineRegisterInfo &MRI, 3256 MachineIRBuilder &B) const { 3257 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3258 return true; 3259 3260 Register Res = MI.getOperand(0).getReg(); 3261 Register LHS = MI.getOperand(1).getReg(); 3262 Register RHS = MI.getOperand(2).getReg(); 3263 3264 uint16_t Flags = MI.getFlags(); 3265 3266 LLT S16 = LLT::scalar(16); 3267 LLT S32 = LLT::scalar(32); 3268 3269 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3270 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3271 3272 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3273 .addUse(RHSExt.getReg(0)) 3274 .setMIFlags(Flags); 3275 3276 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3277 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3278 3279 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3280 .addUse(RDst.getReg(0)) 3281 .addUse(RHS) 3282 .addUse(LHS) 3283 .setMIFlags(Flags); 3284 3285 MI.eraseFromParent(); 3286 return true; 3287 } 3288 3289 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3290 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3291 static void toggleSPDenormMode(bool Enable, 3292 MachineIRBuilder &B, 3293 const GCNSubtarget &ST, 3294 AMDGPU::SIModeRegisterDefaults Mode) { 3295 // Set SP denorm mode to this value. 3296 unsigned SPDenormMode = 3297 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3298 3299 if (ST.hasDenormModeInst()) { 3300 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3301 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3302 3303 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3304 B.buildInstr(AMDGPU::S_DENORM_MODE) 3305 .addImm(NewDenormModeValue); 3306 3307 } else { 3308 // Select FP32 bit field in mode register. 
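    // The FP32 denorm control occupies bits [5:4] of the MODE register, so
    // the operand below encodes hwreg(HW_REG_MODE, offset = 4, width = 2),
    // with the width stored as width-minus-one.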
3309 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3310 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3311 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3312 3313 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3314 .addImm(SPDenormMode) 3315 .addImm(SPDenormModeBitField); 3316 } 3317 } 3318 3319 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3320 MachineRegisterInfo &MRI, 3321 MachineIRBuilder &B) const { 3322 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3323 return true; 3324 3325 Register Res = MI.getOperand(0).getReg(); 3326 Register LHS = MI.getOperand(1).getReg(); 3327 Register RHS = MI.getOperand(2).getReg(); 3328 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3329 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3330 3331 uint16_t Flags = MI.getFlags(); 3332 3333 LLT S32 = LLT::scalar(32); 3334 LLT S1 = LLT::scalar(1); 3335 3336 auto One = B.buildFConstant(S32, 1.0f); 3337 3338 auto DenominatorScaled = 3339 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3340 .addUse(LHS) 3341 .addUse(RHS) 3342 .addImm(0) 3343 .setMIFlags(Flags); 3344 auto NumeratorScaled = 3345 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3346 .addUse(LHS) 3347 .addUse(RHS) 3348 .addImm(1) 3349 .setMIFlags(Flags); 3350 3351 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3352 .addUse(DenominatorScaled.getReg(0)) 3353 .setMIFlags(Flags); 3354 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3355 3356 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3357 // aren't modeled as reading it. 3358 if (!Mode.allFP32Denormals()) 3359 toggleSPDenormMode(true, B, ST, Mode); 3360 3361 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3362 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3363 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3364 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3365 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3366 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3367 3368 if (!Mode.allFP32Denormals()) 3369 toggleSPDenormMode(false, B, ST, Mode); 3370 3371 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3372 .addUse(Fma4.getReg(0)) 3373 .addUse(Fma1.getReg(0)) 3374 .addUse(Fma3.getReg(0)) 3375 .addUse(NumeratorScaled.getReg(1)) 3376 .setMIFlags(Flags); 3377 3378 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3379 .addUse(Fmas.getReg(0)) 3380 .addUse(RHS) 3381 .addUse(LHS) 3382 .setMIFlags(Flags); 3383 3384 MI.eraseFromParent(); 3385 return true; 3386 } 3387 3388 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3389 MachineRegisterInfo &MRI, 3390 MachineIRBuilder &B) const { 3391 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3392 return true; 3393 3394 Register Res = MI.getOperand(0).getReg(); 3395 Register LHS = MI.getOperand(1).getReg(); 3396 Register RHS = MI.getOperand(2).getReg(); 3397 3398 uint16_t Flags = MI.getFlags(); 3399 3400 LLT S64 = LLT::scalar(64); 3401 LLT S1 = LLT::scalar(1); 3402 3403 auto One = B.buildFConstant(S64, 1.0); 3404 3405 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3406 .addUse(LHS) 3407 .addUse(RHS) 3408 .addImm(0) 3409 .setMIFlags(Flags); 3410 3411 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3412 3413 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3414 .addUse(DivScale0.getReg(0)) 3415 .setMIFlags(Flags); 3416 3417 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3418 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3419 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3420 3421 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3422 .addUse(LHS) 3423 .addUse(RHS) 3424 .addImm(1) 3425 .setMIFlags(Flags); 3426 3427 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3428 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3429 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3430 3431 Register Scale; 3432 if (!ST.hasUsableDivScaleConditionOutput()) { 3433 // Workaround a hardware bug on SI where the condition output from div_scale 3434 // is not usable. 3435 3436 LLT S32 = LLT::scalar(32); 3437 3438 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3439 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3440 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3441 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3442 3443 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3444 Scale1Unmerge.getReg(1)); 3445 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3446 Scale0Unmerge.getReg(1)); 3447 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3448 } else { 3449 Scale = DivScale1.getReg(1); 3450 } 3451 3452 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3453 .addUse(Fma4.getReg(0)) 3454 .addUse(Fma3.getReg(0)) 3455 .addUse(Mul.getReg(0)) 3456 .addUse(Scale) 3457 .setMIFlags(Flags); 3458 3459 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3460 .addUse(Fmas.getReg(0)) 3461 .addUse(RHS) 3462 .addUse(LHS) 3463 .setMIFlags(Flags); 3464 3465 MI.eraseFromParent(); 3466 return true; 3467 } 3468 3469 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3470 MachineRegisterInfo &MRI, 3471 MachineIRBuilder &B) const { 3472 Register Res = MI.getOperand(0).getReg(); 3473 Register LHS = MI.getOperand(2).getReg(); 3474 Register RHS = MI.getOperand(3).getReg(); 3475 uint16_t Flags = MI.getFlags(); 3476 3477 LLT S32 = LLT::scalar(32); 3478 LLT S1 = LLT::scalar(1); 3479 3480 auto Abs = B.buildFAbs(S32, RHS, Flags); 3481 const APFloat C0Val(1.0f); 3482 3483 auto C0 = B.buildConstant(S32, 0x6f800000); 3484 auto C1 = B.buildConstant(S32, 0x2f800000); 3485 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3486 3487 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3488 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3489 3490 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3491 3492 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3493 .addUse(Mul0.getReg(0)) 3494 .setMIFlags(Flags); 3495 3496 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3497 3498 B.buildFMul(Res, Sel, Mul1, Flags); 3499 3500 MI.eraseFromParent(); 3501 return true; 3502 } 3503 3504 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3505 // FIXME: Why do we handle this one but not other removed instructions? 3506 // 3507 // Reciprocal square root. The clamp prevents infinite results, clamping 3508 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3509 // +-max_float. 
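// A sketch of the expansion emitted below:
//   %rsq     = llvm.amdgcn.rsq(%src)
//   %clamped = fmaxnum(fminnum(%rsq, +max_float), -max_float)
// using the IEEE variants of min/max when the function runs in IEEE mode.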
3510 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3511 MachineRegisterInfo &MRI, 3512 MachineIRBuilder &B) const { 3513 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3514 return true; 3515 3516 Register Dst = MI.getOperand(0).getReg(); 3517 Register Src = MI.getOperand(2).getReg(); 3518 auto Flags = MI.getFlags(); 3519 3520 LLT Ty = MRI.getType(Dst); 3521 3522 const fltSemantics *FltSemantics; 3523 if (Ty == LLT::scalar(32)) 3524 FltSemantics = &APFloat::IEEEsingle(); 3525 else if (Ty == LLT::scalar(64)) 3526 FltSemantics = &APFloat::IEEEdouble(); 3527 else 3528 return false; 3529 3530 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3531 .addUse(Src) 3532 .setMIFlags(Flags); 3533 3534 // We don't need to concern ourselves with the snan handling difference, since 3535 // the rsq quieted (or not) so use the one which will directly select. 3536 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3537 const bool UseIEEE = MFI->getMode().IEEE; 3538 3539 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3540 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3541 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3542 3543 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3544 3545 if (UseIEEE) 3546 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3547 else 3548 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3549 MI.eraseFromParent(); 3550 return true; 3551 } 3552 3553 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3554 switch (IID) { 3555 case Intrinsic::amdgcn_ds_fadd: 3556 return AMDGPU::G_ATOMICRMW_FADD; 3557 case Intrinsic::amdgcn_ds_fmin: 3558 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3559 case Intrinsic::amdgcn_ds_fmax: 3560 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3561 default: 3562 llvm_unreachable("not a DS FP intrinsic"); 3563 } 3564 } 3565 3566 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3567 MachineInstr &MI, 3568 Intrinsic::ID IID) const { 3569 GISelChangeObserver &Observer = Helper.Observer; 3570 Observer.changingInstr(MI); 3571 3572 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3573 3574 // The remaining operands were used to set fields in the MemOperand on 3575 // construction. 3576 for (int I = 6; I > 3; --I) 3577 MI.RemoveOperand(I); 3578 3579 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3580 Observer.changedInstr(MI); 3581 return true; 3582 } 3583 3584 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3585 MachineRegisterInfo &MRI, 3586 MachineIRBuilder &B) const { 3587 uint64_t Offset = 3588 ST.getTargetLowering()->getImplicitParameterOffset( 3589 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3590 LLT DstTy = MRI.getType(DstReg); 3591 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3592 3593 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3594 if (!loadInputValue(KernargPtrReg, B, 3595 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3596 return false; 3597 3598 // FIXME: This should be nuw 3599 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3600 return true; 3601 } 3602 3603 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3604 MachineRegisterInfo &MRI, 3605 MachineIRBuilder &B) const { 3606 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3607 if (!MFI->isEntryFunction()) { 3608 return legalizePreloadedArgIntrin(MI, MRI, B, 3609 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3610 } 3611 3612 Register DstReg = MI.getOperand(0).getReg(); 3613 if (!getImplicitArgPtr(DstReg, MRI, B)) 3614 return false; 3615 3616 MI.eraseFromParent(); 3617 return true; 3618 } 3619 3620 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3621 MachineRegisterInfo &MRI, 3622 MachineIRBuilder &B, 3623 unsigned AddrSpace) const { 3624 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3625 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3626 Register Hi32 = Unmerge.getReg(1); 3627 3628 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3629 MI.eraseFromParent(); 3630 return true; 3631 } 3632 3633 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3634 // offset (the offset that is included in bounds checking and swizzling, to be 3635 // split between the instruction's voffset and immoffset fields) and soffset 3636 // (the offset that is excluded from bounds checking and swizzling, to go in 3637 // the instruction's soffset field). This function takes the first kind of 3638 // offset and figures out how to split it between voffset and immoffset. 3639 std::tuple<Register, unsigned, unsigned> 3640 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3641 Register OrigOffset) const { 3642 const unsigned MaxImm = 4095; 3643 Register BaseReg; 3644 unsigned TotalConstOffset; 3645 const LLT S32 = LLT::scalar(32); 3646 MachineRegisterInfo &MRI = *B.getMRI(); 3647 3648 std::tie(BaseReg, TotalConstOffset) = 3649 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3650 3651 unsigned ImmOffset = TotalConstOffset; 3652 3653 // If BaseReg is a pointer, convert it to int. 3654 if (MRI.getType(BaseReg).isPointer()) 3655 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3656 3657 // If the immediate value is too big for the immoffset field, put the value 3658 // and -4096 into the immoffset field so that the value that is copied/added 3659 // for the voffset field is a multiple of 4096, and it stands more chance 3660 // of being CSEd with the copy/add for another similar load/store. 3661 // However, do not do that rounding down to a multiple of 4096 if that is a 3662 // negative number, as it appears to be illegal to have a negative offset 3663 // in the vgpr, even if adding the immediate offset makes it positive. 
3664 unsigned Overflow = ImmOffset & ~MaxImm; 3665 ImmOffset -= Overflow; 3666 if ((int32_t)Overflow < 0) { 3667 Overflow += ImmOffset; 3668 ImmOffset = 0; 3669 } 3670 3671 if (Overflow != 0) { 3672 if (!BaseReg) { 3673 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3674 } else { 3675 auto OverflowVal = B.buildConstant(S32, Overflow); 3676 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3677 } 3678 } 3679 3680 if (!BaseReg) 3681 BaseReg = B.buildConstant(S32, 0).getReg(0); 3682 3683 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3684 } 3685 3686 /// Handle register layout difference for f16 images for some subtargets. 3687 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3688 MachineRegisterInfo &MRI, 3689 Register Reg, 3690 bool ImageStore) const { 3691 const LLT S16 = LLT::scalar(16); 3692 const LLT S32 = LLT::scalar(32); 3693 LLT StoreVT = MRI.getType(Reg); 3694 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3695 3696 if (ST.hasUnpackedD16VMem()) { 3697 auto Unmerge = B.buildUnmerge(S16, Reg); 3698 3699 SmallVector<Register, 4> WideRegs; 3700 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3701 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3702 3703 int NumElts = StoreVT.getNumElements(); 3704 3705 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3706 .getReg(0); 3707 } 3708 3709 if (ImageStore && ST.hasImageStoreD16Bug()) { 3710 if (StoreVT.getNumElements() == 2) { 3711 SmallVector<Register, 4> PackedRegs; 3712 Reg = B.buildBitcast(S32, Reg).getReg(0); 3713 PackedRegs.push_back(Reg); 3714 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3715 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3716 .getReg(0); 3717 } 3718 3719 if (StoreVT.getNumElements() == 3) { 3720 SmallVector<Register, 4> PackedRegs; 3721 auto Unmerge = B.buildUnmerge(S16, Reg); 3722 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3723 PackedRegs.push_back(Unmerge.getReg(I)); 3724 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3725 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3726 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3727 } 3728 3729 if (StoreVT.getNumElements() == 4) { 3730 SmallVector<Register, 4> PackedRegs; 3731 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3732 auto Unmerge = B.buildUnmerge(S32, Reg); 3733 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3734 PackedRegs.push_back(Unmerge.getReg(I)); 3735 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3736 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3737 .getReg(0); 3738 } 3739 3740 llvm_unreachable("invalid data type"); 3741 } 3742 3743 return Reg; 3744 } 3745 3746 Register AMDGPULegalizerInfo::fixStoreSourceType( 3747 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3748 MachineRegisterInfo *MRI = B.getMRI(); 3749 LLT Ty = MRI->getType(VData); 3750 3751 const LLT S16 = LLT::scalar(16); 3752 3753 // Fixup illegal register types for i8 stores. 
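  // s8 and s16 store data is simply any-extended to s32 here; the
  // BUFFER_STORE_BYTE/SHORT opcode chosen later from the memory size still
  // writes only the original narrow value.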
3754 if (Ty == LLT::scalar(8) || Ty == S16) { 3755 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3756 return AnyExt; 3757 } 3758 3759 if (Ty.isVector()) { 3760 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3761 if (IsFormat) 3762 return handleD16VData(B, *MRI, VData); 3763 } 3764 } 3765 3766 return VData; 3767 } 3768 3769 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3770 MachineRegisterInfo &MRI, 3771 MachineIRBuilder &B, 3772 bool IsTyped, 3773 bool IsFormat) const { 3774 Register VData = MI.getOperand(1).getReg(); 3775 LLT Ty = MRI.getType(VData); 3776 LLT EltTy = Ty.getScalarType(); 3777 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3778 const LLT S32 = LLT::scalar(32); 3779 3780 VData = fixStoreSourceType(B, VData, IsFormat); 3781 Register RSrc = MI.getOperand(2).getReg(); 3782 3783 MachineMemOperand *MMO = *MI.memoperands_begin(); 3784 const int MemSize = MMO->getSize(); 3785 3786 unsigned ImmOffset; 3787 unsigned TotalOffset; 3788 3789 // The typed intrinsics add an immediate after the registers. 3790 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3791 3792 // The struct intrinsic variants add one additional operand over raw. 3793 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3794 Register VIndex; 3795 int OpOffset = 0; 3796 if (HasVIndex) { 3797 VIndex = MI.getOperand(3).getReg(); 3798 OpOffset = 1; 3799 } 3800 3801 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3802 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3803 3804 unsigned Format = 0; 3805 if (IsTyped) { 3806 Format = MI.getOperand(5 + OpOffset).getImm(); 3807 ++OpOffset; 3808 } 3809 3810 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3811 3812 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3813 if (TotalOffset != 0) 3814 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3815 3816 unsigned Opc; 3817 if (IsTyped) { 3818 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3819 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3820 } else if (IsFormat) { 3821 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3822 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3823 } else { 3824 switch (MemSize) { 3825 case 1: 3826 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3827 break; 3828 case 2: 3829 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3830 break; 3831 default: 3832 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3833 break; 3834 } 3835 } 3836 3837 if (!VIndex) 3838 VIndex = B.buildConstant(S32, 0).getReg(0); 3839 3840 auto MIB = B.buildInstr(Opc) 3841 .addUse(VData) // vdata 3842 .addUse(RSrc) // rsrc 3843 .addUse(VIndex) // vindex 3844 .addUse(VOffset) // voffset 3845 .addUse(SOffset) // soffset 3846 .addImm(ImmOffset); // offset(imm) 3847 3848 if (IsTyped) 3849 MIB.addImm(Format); 3850 3851 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3852 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3853 .addMemOperand(MMO); 3854 3855 MI.eraseFromParent(); 3856 return true; 3857 } 3858 3859 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3860 MachineRegisterInfo &MRI, 3861 MachineIRBuilder &B, 3862 bool IsFormat, 3863 bool IsTyped) const { 3864 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
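  // Operand layout assumed below: the raw forms are
  // (dst, intrinsic_id, rsrc, voffset, soffset, aux), the struct forms add a
  // vindex after rsrc, and the typed (tbuffer) forms add a format immediate
  // before aux, so whether a vindex is present is inferred from the operand
  // count.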
3865 MachineMemOperand *MMO = *MI.memoperands_begin(); 3866 const int MemSize = MMO->getSize(); 3867 const LLT S32 = LLT::scalar(32); 3868 3869 Register Dst = MI.getOperand(0).getReg(); 3870 Register RSrc = MI.getOperand(2).getReg(); 3871 3872 // The typed intrinsics add an immediate after the registers. 3873 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3874 3875 // The struct intrinsic variants add one additional operand over raw. 3876 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3877 Register VIndex; 3878 int OpOffset = 0; 3879 if (HasVIndex) { 3880 VIndex = MI.getOperand(3).getReg(); 3881 OpOffset = 1; 3882 } 3883 3884 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3885 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3886 3887 unsigned Format = 0; 3888 if (IsTyped) { 3889 Format = MI.getOperand(5 + OpOffset).getImm(); 3890 ++OpOffset; 3891 } 3892 3893 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3894 unsigned ImmOffset; 3895 unsigned TotalOffset; 3896 3897 LLT Ty = MRI.getType(Dst); 3898 LLT EltTy = Ty.getScalarType(); 3899 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3900 const bool Unpacked = ST.hasUnpackedD16VMem(); 3901 3902 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3903 if (TotalOffset != 0) 3904 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3905 3906 unsigned Opc; 3907 3908 if (IsTyped) { 3909 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3910 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3911 } else if (IsFormat) { 3912 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3913 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3914 } else { 3915 switch (MemSize) { 3916 case 1: 3917 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3918 break; 3919 case 2: 3920 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3921 break; 3922 default: 3923 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3924 break; 3925 } 3926 } 3927 3928 Register LoadDstReg; 3929 3930 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3931 LLT UnpackedTy = Ty.changeElementSize(32); 3932 3933 if (IsExtLoad) 3934 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3935 else if (Unpacked && IsD16 && Ty.isVector()) 3936 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3937 else 3938 LoadDstReg = Dst; 3939 3940 if (!VIndex) 3941 VIndex = B.buildConstant(S32, 0).getReg(0); 3942 3943 auto MIB = B.buildInstr(Opc) 3944 .addDef(LoadDstReg) // vdata 3945 .addUse(RSrc) // rsrc 3946 .addUse(VIndex) // vindex 3947 .addUse(VOffset) // voffset 3948 .addUse(SOffset) // soffset 3949 .addImm(ImmOffset); // offset(imm) 3950 3951 if (IsTyped) 3952 MIB.addImm(Format); 3953 3954 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3955 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3956 .addMemOperand(MMO); 3957 3958 if (LoadDstReg != Dst) { 3959 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3960 3961 // Widen result for extending loads was widened. 
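    // That is, the narrow load was done into a wider s32 temporary, so
    // truncate back to the requested result type; unpacked D16 vector
    // results are instead repacked below.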
3962 if (IsExtLoad) 3963 B.buildTrunc(Dst, LoadDstReg); 3964 else { 3965 // Repack to original 16-bit vector result 3966 // FIXME: G_TRUNC should work, but legalization currently fails 3967 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3968 SmallVector<Register, 4> Repack; 3969 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3970 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3971 B.buildMerge(Dst, Repack); 3972 } 3973 } 3974 3975 MI.eraseFromParent(); 3976 return true; 3977 } 3978 3979 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3980 MachineIRBuilder &B, 3981 bool IsInc) const { 3982 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3983 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3984 B.buildInstr(Opc) 3985 .addDef(MI.getOperand(0).getReg()) 3986 .addUse(MI.getOperand(2).getReg()) 3987 .addUse(MI.getOperand(3).getReg()) 3988 .cloneMemRefs(MI); 3989 MI.eraseFromParent(); 3990 return true; 3991 } 3992 3993 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3994 switch (IntrID) { 3995 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3996 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3997 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3998 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3999 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4000 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4001 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4002 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4003 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4004 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4005 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4006 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4007 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4008 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4009 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4010 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4011 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4012 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4013 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4014 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4015 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4016 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4017 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4018 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4019 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4020 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4021 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4022 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4023 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4024 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4025 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4026 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4027 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4028 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4029 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4030 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4031 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4032 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4033 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4034 case Intrinsic::amdgcn_buffer_atomic_fadd: 4035 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4036 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4037 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4038 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4039 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4040 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4041 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4042 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4043 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4044 default: 4045 llvm_unreachable("unhandled atomic opcode"); 4046 } 4047 } 4048 4049 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4050 MachineIRBuilder &B, 4051 Intrinsic::ID IID) const { 4052 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4053 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4054 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4055 4056 Register Dst; 4057 4058 int OpOffset = 0; 4059 if (HasReturn) { 4060 // A few FP atomics do not support return values. 4061 Dst = MI.getOperand(0).getReg(); 4062 } else { 4063 OpOffset = -1; 4064 } 4065 4066 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4067 Register CmpVal; 4068 4069 if (IsCmpSwap) { 4070 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4071 ++OpOffset; 4072 } 4073 4074 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4075 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4076 4077 // The struct intrinsic variants add one additional operand over raw. 4078 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4079 Register VIndex; 4080 if (HasVIndex) { 4081 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4082 ++OpOffset; 4083 } 4084 4085 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4086 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4087 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4088 4089 MachineMemOperand *MMO = *MI.memoperands_begin(); 4090 4091 unsigned ImmOffset; 4092 unsigned TotalOffset; 4093 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4094 if (TotalOffset != 0) 4095 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4096 4097 if (!VIndex) 4098 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4099 4100 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4101 4102 if (HasReturn) 4103 MIB.addDef(Dst); 4104 4105 MIB.addUse(VData); // vdata 4106 4107 if (IsCmpSwap) 4108 MIB.addReg(CmpVal); 4109 4110 MIB.addUse(RSrc) // rsrc 4111 .addUse(VIndex) // vindex 4112 .addUse(VOffset) // voffset 4113 .addUse(SOffset) // soffset 4114 .addImm(ImmOffset) // offset(imm) 4115 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4116 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4117 .addMemOperand(MMO); 4118 4119 MI.eraseFromParent(); 4120 return true; 4121 } 4122 4123 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4124 /// vector with s16 typed elements. 4125 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4126 SmallVectorImpl<Register> &PackedAddrs, 4127 unsigned ArgOffset, 4128 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4129 bool IsA16, bool IsG16) { 4130 const LLT S16 = LLT::scalar(16); 4131 const LLT V2S16 = LLT::fixed_vector(2, 16); 4132 auto EndIdx = Intr->VAddrEnd; 4133 4134 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4135 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4136 if (!SrcOp.isReg()) 4137 continue; // _L to _LZ may have eliminated this. 
4138
4139     Register AddrReg = SrcOp.getReg();
4140
4141     if (I < Intr->GradientStart) {
4142       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4143       PackedAddrs.push_back(AddrReg);
4144     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4145                (I >= Intr->CoordStart && !IsA16)) {
4146       // Handle any gradient or coordinate operands that should not be packed
4147       PackedAddrs.push_back(AddrReg);
4148     } else {
4149       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4150       // derivatives dx/dh and dx/dv are packed with undef.
4151       if (((I + 1) >= EndIdx) ||
4152           ((Intr->NumGradients / 2) % 2 == 1 &&
4153            (I == static_cast<unsigned>(Intr->GradientStart +
4154                                        (Intr->NumGradients / 2) - 1) ||
4155             I == static_cast<unsigned>(Intr->GradientStart +
4156                                        Intr->NumGradients - 1))) ||
4157           // Check for _L to _LZ optimization
4158           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4159         PackedAddrs.push_back(
4160             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4161                 .getReg(0));
4162       } else {
4163         PackedAddrs.push_back(
4164             B.buildBuildVector(
4165                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4166                 .getReg(0));
4167         ++I;
4168       }
4169     }
4170   }
4171 }
4172
4173 /// Convert from separate vaddr components to a single vector address register,
4174 /// and replace the remaining operands with $noreg.
4175 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4176                                      int DimIdx, int NumVAddrs) {
4177   const LLT S32 = LLT::scalar(32);
4178
4179   SmallVector<Register, 8> AddrRegs;
4180   for (int I = 0; I != NumVAddrs; ++I) {
4181     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4182     if (SrcOp.isReg()) {
4183       AddrRegs.push_back(SrcOp.getReg());
4184       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4185     }
4186   }
4187
4188   int NumAddrRegs = AddrRegs.size();
4189   if (NumAddrRegs != 1) {
4190     // Round up to 8 elements for v5-v7
4191     // FIXME: Missing intermediate sized register classes and instructions.
4192     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4193       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4194       auto Undef = B.buildUndef(S32);
4195       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4196       NumAddrRegs = RoundedNumRegs;
4197     }
4198
4199     auto VAddr =
4200         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4201     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4202   }
4203
4204   for (int I = 1; I != NumVAddrs; ++I) {
4205     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4206     if (SrcOp.isReg())
4207       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4208   }
4209 }
4210
4211 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4212 ///
4213 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4214 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4215 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4216 /// registers.
4217 ///
4218 /// We don't want to directly select image instructions just yet, but also want
4219 /// to expose all register repacking to the legalizer/combiners. We also don't
4220 /// want a selected instruction entering RegBankSelect. In order to avoid
4221 /// defining a multitude of intermediate image instructions, directly hack on
4222 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4223 /// padding now unnecessary arguments with $noreg.
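/// Rough illustration (operand details elided): with a16, a 2D sample whose
/// s16 coordinates are %s and %t has them packed as
///   %st:_(<2 x s16>) = G_BUILD_VECTOR %s:_(s16), %t:_(s16)
/// and the now-unused second coordinate slot is replaced with $noreg.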
4224 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4225 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4226 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4227 4228 const unsigned NumDefs = MI.getNumExplicitDefs(); 4229 const unsigned ArgOffset = NumDefs + 1; 4230 bool IsTFE = NumDefs == 2; 4231 // We are only processing the operands of d16 image operations on subtargets 4232 // that use the unpacked register layout, or need to repack the TFE result. 4233 4234 // TODO: Do we need to guard against already legalized intrinsics? 4235 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4236 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4237 4238 MachineRegisterInfo *MRI = B.getMRI(); 4239 const LLT S32 = LLT::scalar(32); 4240 const LLT S16 = LLT::scalar(16); 4241 const LLT V2S16 = LLT::fixed_vector(2, 16); 4242 4243 unsigned DMask = 0; 4244 4245 // Check for 16 bit addresses and pack if true. 4246 LLT GradTy = 4247 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4248 LLT AddrTy = 4249 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4250 const bool IsG16 = GradTy == S16; 4251 const bool IsA16 = AddrTy == S16; 4252 4253 int DMaskLanes = 0; 4254 if (!BaseOpcode->Atomic) { 4255 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4256 if (BaseOpcode->Gather4) { 4257 DMaskLanes = 4; 4258 } else if (DMask != 0) { 4259 DMaskLanes = countPopulation(DMask); 4260 } else if (!IsTFE && !BaseOpcode->Store) { 4261 // If dmask is 0, this is a no-op load. This can be eliminated. 4262 B.buildUndef(MI.getOperand(0)); 4263 MI.eraseFromParent(); 4264 return true; 4265 } 4266 } 4267 4268 Observer.changingInstr(MI); 4269 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4270 4271 unsigned NewOpcode = NumDefs == 0 ? 4272 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4273 4274 // Track that we legalized this 4275 MI.setDesc(B.getTII().get(NewOpcode)); 4276 4277 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4278 // dmask to be at least 1 otherwise the instruction will fail 4279 if (IsTFE && DMask == 0) { 4280 DMask = 0x1; 4281 DMaskLanes = 1; 4282 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4283 } 4284 4285 if (BaseOpcode->Atomic) { 4286 Register VData0 = MI.getOperand(2).getReg(); 4287 LLT Ty = MRI->getType(VData0); 4288 4289 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4290 if (Ty.isVector()) 4291 return false; 4292 4293 if (BaseOpcode->AtomicX2) { 4294 Register VData1 = MI.getOperand(3).getReg(); 4295 // The two values are packed in one register. 4296 LLT PackedTy = LLT::fixed_vector(2, Ty); 4297 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4298 MI.getOperand(2).setReg(Concat.getReg(0)); 4299 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4300 } 4301 } 4302 4303 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4304 4305 // Optimize _L to _LZ when _L is zero 4306 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4307 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4308 const ConstantFP *ConstantLod; 4309 4310 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4311 m_GFCst(ConstantLod))) { 4312 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4313 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
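        // e.g. image.sample.l with a constant lod of 0 (or a negative lod)
        // becomes image.sample.lz and the lod operand is dropped.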
4314 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4315 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4316 Intr->Dim); 4317 4318 // The starting indexes should remain in the same place. 4319 --CorrectedNumVAddrs; 4320 4321 MI.getOperand(MI.getNumExplicitDefs()) 4322 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4323 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4324 Intr = NewImageDimIntr; 4325 } 4326 } 4327 } 4328 4329 // Optimize _mip away, when 'lod' is zero 4330 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4331 int64_t ConstantLod; 4332 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4333 m_ICst(ConstantLod))) { 4334 if (ConstantLod == 0) { 4335 // TODO: Change intrinsic opcode and remove operand instead or replacing 4336 // it with 0, as the _L to _LZ handling is done above. 4337 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4338 --CorrectedNumVAddrs; 4339 } 4340 } 4341 } 4342 4343 // Rewrite the addressing register layout before doing anything else. 4344 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4345 // 16 bit gradients are supported, but are tied to the A16 control 4346 // so both gradients and addresses must be 16 bit 4347 return false; 4348 } 4349 4350 if (IsA16 && !ST.hasA16()) { 4351 // A16 not supported 4352 return false; 4353 } 4354 4355 if (IsA16 || IsG16) { 4356 if (Intr->NumVAddrs > 1) { 4357 SmallVector<Register, 4> PackedRegs; 4358 4359 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4360 IsG16); 4361 4362 // See also below in the non-a16 branch 4363 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4364 4365 if (!UseNSA && PackedRegs.size() > 1) { 4366 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4367 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4368 PackedRegs[0] = Concat.getReg(0); 4369 PackedRegs.resize(1); 4370 } 4371 4372 const unsigned NumPacked = PackedRegs.size(); 4373 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4374 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4375 if (!SrcOp.isReg()) { 4376 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4377 continue; 4378 } 4379 4380 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4381 4382 if (I - Intr->VAddrStart < NumPacked) 4383 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4384 else 4385 SrcOp.setReg(AMDGPU::NoRegister); 4386 } 4387 } 4388 } else { 4389 // If the register allocator cannot place the address registers contiguously 4390 // without introducing moves, then using the non-sequential address encoding 4391 // is always preferable, since it saves VALU instructions and is usually a 4392 // wash in terms of code size or even better. 4393 // 4394 // However, we currently have no way of hinting to the register allocator 4395 // that MIMG addresses should be placed contiguously when it is possible to 4396 // do so, so force non-NSA for the common 2-address case as a heuristic. 4397 // 4398 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4399 // allocation when possible. 
4400     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4401
4402     if (!UseNSA && Intr->NumVAddrs > 1)
4403       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4404                                Intr->NumVAddrs);
4405   }
4406
4407   int Flags = 0;
4408   if (IsA16)
4409     Flags |= 1;
4410   if (IsG16)
4411     Flags |= 2;
4412   MI.addOperand(MachineOperand::CreateImm(Flags));
4413
4414   if (BaseOpcode->Store) { // No TFE for stores?
4415     // TODO: Handle dmask trim
4416     Register VData = MI.getOperand(1).getReg();
4417     LLT Ty = MRI->getType(VData);
4418     if (!Ty.isVector() || Ty.getElementType() != S16)
4419       return true;
4420
4421     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4422     if (RepackedReg != VData) {
4423       MI.getOperand(1).setReg(RepackedReg);
4424     }
4425
4426     return true;
4427   }
4428
4429   Register DstReg = MI.getOperand(0).getReg();
4430   LLT Ty = MRI->getType(DstReg);
4431   const LLT EltTy = Ty.getScalarType();
4432   const bool IsD16 = Ty.getScalarType() == S16;
4433   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4434
4435   // Confirm that the return type is large enough for the dmask specified
4436   if (NumElts < DMaskLanes)
4437     return false;
4438
4439   if (NumElts > 4 || DMaskLanes > 4)
4440     return false;
4441
4442   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4443   const LLT AdjustedTy =
4444       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4445
4446   // The raw dword aligned data component of the load. The only legal cases
4447   // where this matters should be when using the packed D16 format, for
4448   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4449   LLT RoundedTy;
4450
4451   // S32 vector to cover all data, plus TFE result element.
4452   LLT TFETy;
4453
4454   // Register type to use for each loaded component. Will be S32 or V2S16.
4455   LLT RegTy;
4456
4457   if (IsD16 && ST.hasUnpackedD16VMem()) {
4458     RoundedTy =
4459         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4460     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4461     RegTy = S32;
4462   } else {
4463     unsigned EltSize = EltTy.getSizeInBits();
4464     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4465     unsigned RoundedSize = 32 * RoundedElts;
4466     RoundedTy = LLT::scalarOrVector(
4467         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4468     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4469     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4470   }
4471
4472   // The return type does not need adjustment.
4473   // TODO: Should we change s16 case to s32 or <2 x s16>?
4474   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4475     return true;
4476
4477   Register Dst1Reg;
4478
4479   // Insert after the instruction.
4480   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4481
4482   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4483   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4484   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4485   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4486
4487   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4488
4489   MI.getOperand(0).setReg(NewResultReg);
4490
4491   // In the IR, TFE is supposed to be used with a 2 element struct return
4492   // type. The instruction really returns these two values in one contiguous
4493   // register, with one additional dword beyond the loaded data. Rewrite the
4494   // return type to use a single register result.
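  // e.g. a dmask-of-1 TFE load of s32 is rewritten to produce <2 x s32>;
  // a G_UNMERGE_VALUES below then splits it back into the data dword and
  // the error/status dword the IR-level struct return expects.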
4495 4496 if (IsTFE) { 4497 Dst1Reg = MI.getOperand(1).getReg(); 4498 if (MRI->getType(Dst1Reg) != S32) 4499 return false; 4500 4501 // TODO: Make sure the TFE operand bit is set. 4502 MI.RemoveOperand(1); 4503 4504 // Handle the easy case that requires no repack instructions. 4505 if (Ty == S32) { 4506 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4507 return true; 4508 } 4509 } 4510 4511 // Now figure out how to copy the new result register back into the old 4512 // result. 4513 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4514 4515 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4516 4517 if (ResultNumRegs == 1) { 4518 assert(!IsTFE); 4519 ResultRegs[0] = NewResultReg; 4520 } else { 4521 // We have to repack into a new vector of some kind. 4522 for (int I = 0; I != NumDataRegs; ++I) 4523 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4524 B.buildUnmerge(ResultRegs, NewResultReg); 4525 4526 // Drop the final TFE element to get the data part. The TFE result is 4527 // directly written to the right place already. 4528 if (IsTFE) 4529 ResultRegs.resize(NumDataRegs); 4530 } 4531 4532 // For an s16 scalar result, we form an s32 result with a truncate regardless 4533 // of packed vs. unpacked. 4534 if (IsD16 && !Ty.isVector()) { 4535 B.buildTrunc(DstReg, ResultRegs[0]); 4536 return true; 4537 } 4538 4539 // Avoid a build/concat_vector of 1 entry. 4540 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4541 B.buildBitcast(DstReg, ResultRegs[0]); 4542 return true; 4543 } 4544 4545 assert(Ty.isVector()); 4546 4547 if (IsD16) { 4548 // For packed D16 results with TFE enabled, all the data components are 4549 // S32. Cast back to the expected type. 4550 // 4551 // TODO: We don't really need to use load s32 elements. We would only need one 4552 // cast for the TFE result if a multiple of v2s16 was used. 4553 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4554 for (Register &Reg : ResultRegs) 4555 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4556 } else if (ST.hasUnpackedD16VMem()) { 4557 for (Register &Reg : ResultRegs) 4558 Reg = B.buildTrunc(S16, Reg).getReg(0); 4559 } 4560 } 4561 4562 auto padWithUndef = [&](LLT Ty, int NumElts) { 4563 if (NumElts == 0) 4564 return; 4565 Register Undef = B.buildUndef(Ty).getReg(0); 4566 for (int I = 0; I != NumElts; ++I) 4567 ResultRegs.push_back(Undef); 4568 }; 4569 4570 // Pad out any elements eliminated due to the dmask. 4571 LLT ResTy = MRI->getType(ResultRegs[0]); 4572 if (!ResTy.isVector()) { 4573 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4574 B.buildBuildVector(DstReg, ResultRegs); 4575 return true; 4576 } 4577 4578 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4579 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4580 4581 // Deal with the one annoying legal case. 
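  // i.e. a <3 x s16> result: pad the v2s16 pieces (plus one undef) out to
  // six halves, concatenate to <6 x s16>, and unmerge into the v3s16
  // destination plus a dead v3s16 leftover.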
4582 const LLT V3S16 = LLT::fixed_vector(3, 16); 4583 if (Ty == V3S16) { 4584 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4585 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4586 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4587 return true; 4588 } 4589 4590 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4591 B.buildConcatVectors(DstReg, ResultRegs); 4592 return true; 4593 } 4594 4595 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4596 LegalizerHelper &Helper, MachineInstr &MI) const { 4597 MachineIRBuilder &B = Helper.MIRBuilder; 4598 GISelChangeObserver &Observer = Helper.Observer; 4599 4600 Register Dst = MI.getOperand(0).getReg(); 4601 LLT Ty = B.getMRI()->getType(Dst); 4602 unsigned Size = Ty.getSizeInBits(); 4603 MachineFunction &MF = B.getMF(); 4604 4605 Observer.changingInstr(MI); 4606 4607 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4608 Ty = getBitcastRegisterType(Ty); 4609 Helper.bitcastDst(MI, Ty, 0); 4610 Dst = MI.getOperand(0).getReg(); 4611 B.setInsertPt(B.getMBB(), MI); 4612 } 4613 4614 // FIXME: We don't really need this intermediate instruction. The intrinsic 4615 // should be fixed to have a memory operand. Since it's readnone, we're not 4616 // allowed to add one. 4617 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4618 MI.RemoveOperand(1); // Remove intrinsic ID 4619 4620 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4621 // TODO: Should this use datalayout alignment? 4622 const unsigned MemSize = (Size + 7) / 8; 4623 const Align MemAlign(4); 4624 MachineMemOperand *MMO = MF.getMachineMemOperand( 4625 MachinePointerInfo(), 4626 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4627 MachineMemOperand::MOInvariant, 4628 MemSize, MemAlign); 4629 MI.addMemOperand(MF, MMO); 4630 4631 // There are no 96-bit result scalar loads, but widening to 128-bit should 4632 // always be legal. We may need to restore this to a 96-bit result if it turns 4633 // out this needs to be converted to a vector load during RegBankSelect. 4634 if (!isPowerOf2_32(Size)) { 4635 if (Ty.isVector()) 4636 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4637 else 4638 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4639 } 4640 4641 Observer.changedInstr(MI); 4642 return true; 4643 } 4644 4645 // TODO: Move to selection 4646 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4647 MachineRegisterInfo &MRI, 4648 MachineIRBuilder &B) const { 4649 if (!ST.isTrapHandlerEnabled() || 4650 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4651 return legalizeTrapEndpgm(MI, MRI, B); 4652 4653 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4654 switch (*HsaAbiVer) { 4655 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4656 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4657 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4658 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4659 return ST.supportsGetDoorbellID() ? 
4660 legalizeTrapHsa(MI, MRI, B) : 4661 legalizeTrapHsaQueuePtr(MI, MRI, B); 4662 } 4663 } 4664 4665 llvm_unreachable("Unknown trap handler"); 4666 } 4667 4668 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4669 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4670 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4671 MI.eraseFromParent(); 4672 return true; 4673 } 4674 4675 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4676 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4677 // Pass queue pointer to trap handler as input, and insert trap instruction 4678 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4679 Register LiveIn = 4680 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4681 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4682 return false; 4683 4684 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4685 B.buildCopy(SGPR01, LiveIn); 4686 B.buildInstr(AMDGPU::S_TRAP) 4687 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4688 .addReg(SGPR01, RegState::Implicit); 4689 4690 MI.eraseFromParent(); 4691 return true; 4692 } 4693 4694 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4695 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4696 B.buildInstr(AMDGPU::S_TRAP) 4697 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4698 MI.eraseFromParent(); 4699 return true; 4700 } 4701 4702 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4703 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4704 // Is non-HSA path or trap-handler disabled? then, report a warning 4705 // accordingly 4706 if (!ST.isTrapHandlerEnabled() || 4707 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4708 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4709 "debugtrap handler not supported", 4710 MI.getDebugLoc(), DS_Warning); 4711 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4712 Ctx.diagnose(NoTrap); 4713 } else { 4714 // Insert debug-trap instruction 4715 B.buildInstr(AMDGPU::S_TRAP) 4716 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4717 } 4718 4719 MI.eraseFromParent(); 4720 return true; 4721 } 4722 4723 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4724 MachineIRBuilder &B) const { 4725 MachineRegisterInfo &MRI = *B.getMRI(); 4726 const LLT S16 = LLT::scalar(16); 4727 const LLT S32 = LLT::scalar(32); 4728 4729 Register DstReg = MI.getOperand(0).getReg(); 4730 Register NodePtr = MI.getOperand(2).getReg(); 4731 Register RayExtent = MI.getOperand(3).getReg(); 4732 Register RayOrigin = MI.getOperand(4).getReg(); 4733 Register RayDir = MI.getOperand(5).getReg(); 4734 Register RayInvDir = MI.getOperand(6).getReg(); 4735 Register TDescr = MI.getOperand(7).getReg(); 4736 4737 if (!ST.hasGFX10_AEncoding()) { 4738 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4739 "intrinsic not supported on subtarget", 4740 MI.getDebugLoc()); 4741 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4742 return false; 4743 } 4744 4745 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4746 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4747 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4748 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4749 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4750 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4751 4752 SmallVector<Register, 12> Ops; 4753 if (Is64) { 4754 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4755 Ops.push_back(Unmerge.getReg(0)); 4756 Ops.push_back(Unmerge.getReg(1)); 4757 } else { 4758 Ops.push_back(NodePtr); 4759 } 4760 Ops.push_back(RayExtent); 4761 4762 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4763 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4764 Ops.push_back(Unmerge.getReg(0)); 4765 Ops.push_back(Unmerge.getReg(1)); 4766 Ops.push_back(Unmerge.getReg(2)); 4767 }; 4768 4769 packLanes(RayOrigin); 4770 if (IsA16) { 4771 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4772 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4773 Register R1 = MRI.createGenericVirtualRegister(S32); 4774 Register R2 = MRI.createGenericVirtualRegister(S32); 4775 Register R3 = MRI.createGenericVirtualRegister(S32); 4776 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4777 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4778 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4779 Ops.push_back(R1); 4780 Ops.push_back(R2); 4781 Ops.push_back(R3); 4782 } else { 4783 packLanes(RayDir); 4784 packLanes(RayInvDir); 4785 } 4786 4787 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4788 .addDef(DstReg) 4789 .addImm(Opcode); 4790 4791 for (Register R : Ops) { 4792 MIB.addUse(R); 4793 } 4794 4795 MIB.addUse(TDescr) 4796 .addImm(IsA16 ? 1 : 0) 4797 .cloneMemRefs(MI); 4798 4799 MI.eraseFromParent(); 4800 return true; 4801 } 4802 4803 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4804 MachineInstr &MI) const { 4805 MachineIRBuilder &B = Helper.MIRBuilder; 4806 MachineRegisterInfo &MRI = *B.getMRI(); 4807 4808 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4809 auto IntrID = MI.getIntrinsicID(); 4810 switch (IntrID) { 4811 case Intrinsic::amdgcn_if: 4812 case Intrinsic::amdgcn_else: { 4813 MachineInstr *Br = nullptr; 4814 MachineBasicBlock *UncondBrTarget = nullptr; 4815 bool Negated = false; 4816 if (MachineInstr *BrCond = 4817 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4818 const SIRegisterInfo *TRI 4819 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4820 4821 Register Def = MI.getOperand(1).getReg(); 4822 Register Use = MI.getOperand(3).getReg(); 4823 4824 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4825 4826 if (Negated) 4827 std::swap(CondBrTarget, UncondBrTarget); 4828 4829 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4830 if (IntrID == Intrinsic::amdgcn_if) { 4831 B.buildInstr(AMDGPU::SI_IF) 4832 .addDef(Def) 4833 .addUse(Use) 4834 .addMBB(UncondBrTarget); 4835 } else { 4836 B.buildInstr(AMDGPU::SI_ELSE) 4837 .addDef(Def) 4838 .addUse(Use) 4839 .addMBB(UncondBrTarget); 4840 } 4841 4842 if (Br) { 4843 Br->getOperand(0).setMBB(CondBrTarget); 4844 } else { 4845 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4846 // since we're swapping branch targets it needs to be reinserted. 
4847 // FIXME: IRTranslator should probably not do this 4848 B.buildBr(*CondBrTarget); 4849 } 4850 4851 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4852 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4853 MI.eraseFromParent(); 4854 BrCond->eraseFromParent(); 4855 return true; 4856 } 4857 4858 return false; 4859 } 4860 case Intrinsic::amdgcn_loop: { 4861 MachineInstr *Br = nullptr; 4862 MachineBasicBlock *UncondBrTarget = nullptr; 4863 bool Negated = false; 4864 if (MachineInstr *BrCond = 4865 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4866 const SIRegisterInfo *TRI 4867 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4868 4869 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4870 Register Reg = MI.getOperand(2).getReg(); 4871 4872 if (Negated) 4873 std::swap(CondBrTarget, UncondBrTarget); 4874 4875 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4876 B.buildInstr(AMDGPU::SI_LOOP) 4877 .addUse(Reg) 4878 .addMBB(UncondBrTarget); 4879 4880 if (Br) 4881 Br->getOperand(0).setMBB(CondBrTarget); 4882 else 4883 B.buildBr(*CondBrTarget); 4884 4885 MI.eraseFromParent(); 4886 BrCond->eraseFromParent(); 4887 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4888 return true; 4889 } 4890 4891 return false; 4892 } 4893 case Intrinsic::amdgcn_kernarg_segment_ptr: 4894 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4895 // This only makes sense to call in a kernel, so just lower to null. 4896 B.buildConstant(MI.getOperand(0).getReg(), 0); 4897 MI.eraseFromParent(); 4898 return true; 4899 } 4900 4901 return legalizePreloadedArgIntrin( 4902 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4903 case Intrinsic::amdgcn_implicitarg_ptr: 4904 return legalizeImplicitArgPtr(MI, MRI, B); 4905 case Intrinsic::amdgcn_workitem_id_x: 4906 return legalizePreloadedArgIntrin(MI, MRI, B, 4907 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4908 case Intrinsic::amdgcn_workitem_id_y: 4909 return legalizePreloadedArgIntrin(MI, MRI, B, 4910 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4911 case Intrinsic::amdgcn_workitem_id_z: 4912 return legalizePreloadedArgIntrin(MI, MRI, B, 4913 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4914 case Intrinsic::amdgcn_workgroup_id_x: 4915 return legalizePreloadedArgIntrin(MI, MRI, B, 4916 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4917 case Intrinsic::amdgcn_workgroup_id_y: 4918 return legalizePreloadedArgIntrin(MI, MRI, B, 4919 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4920 case Intrinsic::amdgcn_workgroup_id_z: 4921 return legalizePreloadedArgIntrin(MI, MRI, B, 4922 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4923 case Intrinsic::amdgcn_dispatch_ptr: 4924 return legalizePreloadedArgIntrin(MI, MRI, B, 4925 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4926 case Intrinsic::amdgcn_queue_ptr: 4927 return legalizePreloadedArgIntrin(MI, MRI, B, 4928 AMDGPUFunctionArgInfo::QUEUE_PTR); 4929 case Intrinsic::amdgcn_implicit_buffer_ptr: 4930 return legalizePreloadedArgIntrin( 4931 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4932 case Intrinsic::amdgcn_dispatch_id: 4933 return legalizePreloadedArgIntrin(MI, MRI, B, 4934 AMDGPUFunctionArgInfo::DISPATCH_ID); 4935 case Intrinsic::amdgcn_fdiv_fast: 4936 return legalizeFDIVFastIntrin(MI, MRI, B); 4937 case Intrinsic::amdgcn_is_shared: 4938 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4939 case Intrinsic::amdgcn_is_private: 4940 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4941 case Intrinsic::amdgcn_wavefrontsize: { 4942 
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
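  // The BVH intersect-ray intrinsic (expanded in legalizeBVHIntrinsic above)
  // flattens the node pointer, ray extent, origin and (inverse) direction into
  // the dword operand list G_AMDGPU_INTRIN_BVH_INTERSECT_RAY expects; in the
  // A16 form the half-precision direction components are packed pairwise into
  // 32-bit registers first.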
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}