//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
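/// Illustrative behavior (derived from the predicate below):
///   <3 x s16> (48 bits)  -> widened to <4 x s16>
///   <2 x s16>, <4 x s16> -> left alone (even number of elements)
///   <3 x s1>, <3 x s32>  -> left alone (s1 vectors are scalarized; 32-bit
///                           elements are already register sized)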
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 99 }; 100 } 101 102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 const LLT EltTy = Ty.getElementType(); 106 unsigned Size = Ty.getSizeInBits(); 107 unsigned Pieces = (Size + 63) / 64; 108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 110 }; 111 } 112 113 // Increase the number of vector elements to reach the next multiple of 32-bit 114 // type. 115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 116 return [=](const LegalityQuery &Query) { 117 const LLT Ty = Query.Types[TypeIdx]; 118 119 const LLT EltTy = Ty.getElementType(); 120 const int Size = Ty.getSizeInBits(); 121 const int EltSize = EltTy.getSizeInBits(); 122 const int NextMul32 = (Size + 31) / 32; 123 124 assert(EltSize < 32); 125 126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 128 }; 129 } 130 131 static LLT getBitcastRegisterType(const LLT Ty) { 132 const unsigned Size = Ty.getSizeInBits(); 133 134 LLT CoercedTy; 135 if (Size <= 32) { 136 // <2 x s8> -> s16 137 // <4 x s8> -> s32 138 return LLT::scalar(Size); 139 } 140 141 return LLT::scalarOrVector(Size / 32, 32); 142 } 143 144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT Ty = Query.Types[TypeIdx]; 147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 148 }; 149 } 150 151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 152 return [=](const LegalityQuery &Query) { 153 const LLT Ty = Query.Types[TypeIdx]; 154 unsigned Size = Ty.getSizeInBits(); 155 assert(Size % 32 == 0); 156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 157 }; 158 } 159 160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 161 return [=](const LegalityQuery &Query) { 162 const LLT QueryTy = Query.Types[TypeIdx]; 163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 164 }; 165 } 166 167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 171 }; 
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
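/// Illustrative example (assuming 96-bit loads are not already legal on the
/// subtarget): a 96-bit global load with 128-bit alignment is widened to a
/// 128-bit load, since the memory is known dereferenceable up to the
/// alignment.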
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 375 unsigned AlignInBits, unsigned AddrSpace, 376 unsigned Opcode) { 377 // We don't want to widen cases that are naturally legal. 378 if (isPowerOf2_32(SizeInBits)) 379 return false; 380 381 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 382 // end up widening these for a scalar load during RegBankSelect, since there 383 // aren't 96-bit scalar loads. 384 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 385 return false; 386 387 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 388 return false; 389 390 // A load is known dereferenceable up to the alignment, so it's legal to widen 391 // to it. 392 // 393 // TODO: Could check dereferenceable for less aligned cases. 394 unsigned RoundedSize = NextPowerOf2(SizeInBits); 395 if (AlignInBits < RoundedSize) 396 return false; 397 398 // Do not widen if it would introduce a slow unaligned load. 399 const SITargetLowering *TLI = ST.getTargetLowering(); 400 bool Fast = false; 401 return TLI->allowsMisalignedMemoryAccessesImpl( 402 RoundedSize, AddrSpace, Align(AlignInBits / 8), 403 MachineMemOperand::MOLoad, &Fast) && 404 Fast; 405 } 406 407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 408 unsigned Opcode) { 409 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 410 return false; 411 412 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 413 Query.MMODescrs[0].AlignInBits, 414 Query.Types[1].getAddressSpace(), Opcode); 415 } 416 417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 418 const GCNTargetMachine &TM) 419 : ST(ST_) { 420 using namespace TargetOpcode; 421 422 auto GetAddrSpacePtr = [&TM](unsigned AS) { 423 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 424 }; 425 426 const LLT S1 = LLT::scalar(1); 427 const LLT S8 = LLT::scalar(8); 428 const LLT S16 = LLT::scalar(16); 429 const LLT S32 = LLT::scalar(32); 430 const LLT S64 = LLT::scalar(64); 431 const LLT S128 = LLT::scalar(128); 432 const LLT S256 = LLT::scalar(256); 433 const LLT S512 = LLT::scalar(512); 434 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 435 436 const LLT V2S8 = LLT::vector(2, 8); 437 const LLT V2S16 = LLT::vector(2, 16); 438 const LLT V4S16 = LLT::vector(4, 16); 439 440 const LLT V2S32 = LLT::vector(2, 32); 441 const LLT V3S32 = LLT::vector(3, 32); 442 const LLT V4S32 = LLT::vector(4, 32); 443 const LLT V5S32 = LLT::vector(5, 32); 444 const LLT V6S32 = LLT::vector(6, 32); 445 const LLT V7S32 = LLT::vector(7, 32); 446 const LLT V8S32 = LLT::vector(8, 32); 447 const LLT V9S32 = LLT::vector(9, 32); 448 const LLT V10S32 = LLT::vector(10, 32); 449 const LLT V11S32 = LLT::vector(11, 32); 450 const LLT V12S32 = LLT::vector(12, 32); 451 const LLT V13S32 = LLT::vector(13, 32); 452 const LLT V14S32 = LLT::vector(14, 32); 453 const LLT V15S32 = LLT::vector(15, 32); 454 const LLT V16S32 = LLT::vector(16, 32); 455 const LLT V32S32 = LLT::vector(32, 32); 456 457 const LLT V2S64 = LLT::vector(2, 64); 458 const LLT V3S64 = LLT::vector(3, 64); 459 const LLT V4S64 = LLT::vector(4, 64); 460 const LLT V5S64 = LLT::vector(5, 64); 461 const LLT V6S64 = LLT::vector(6, 64); 462 const LLT V7S64 = LLT::vector(7, 64); 463 const LLT V8S64 = LLT::vector(8, 64); 464 const LLT V16S64 = LLT::vector(16, 64); 465 466 std::initializer_list<LLT> AllS32Vectors = 467 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 468 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 469 
std::initializer_list<LLT> AllS64Vectors = 470 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 471 472 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 473 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 474 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 475 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 476 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 477 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 478 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 479 480 const LLT CodePtr = FlatPtr; 481 482 const std::initializer_list<LLT> AddrSpaces64 = { 483 GlobalPtr, ConstantPtr, FlatPtr 484 }; 485 486 const std::initializer_list<LLT> AddrSpaces32 = { 487 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 488 }; 489 490 const std::initializer_list<LLT> FPTypesBase = { 491 S32, S64 492 }; 493 494 const std::initializer_list<LLT> FPTypes16 = { 495 S32, S64, S16 496 }; 497 498 const std::initializer_list<LLT> FPTypesPK16 = { 499 S32, S64, S16, V2S16 500 }; 501 502 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 503 504 auto &LegacyInfo = getLegacyLegalizerInfo(); 505 LegacyInfo.setAction({G_BRCOND, S1}, 506 LegacyLegalizeActions::Legal); // VCC branches 507 LegacyInfo.setAction({G_BRCOND, S32}, 508 LegacyLegalizeActions::Legal); // SCC branches 509 510 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 511 // elements for v3s16 512 getActionDefinitionsBuilder(G_PHI) 513 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 514 .legalFor(AllS32Vectors) 515 .legalFor(AllS64Vectors) 516 .legalFor(AddrSpaces64) 517 .legalFor(AddrSpaces32) 518 .legalIf(isPointer(0)) 519 .clampScalar(0, S16, S256) 520 .widenScalarToNextPow2(0, 32) 521 .clampMaxNumElements(0, S32, 16) 522 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 523 .scalarize(0); 524 525 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 526 // Full set of gfx9 features. 527 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 528 .legalFor({S32, S16, V2S16}) 529 .clampScalar(0, S16, S32) 530 .clampMaxNumElements(0, S16, 2) 531 .scalarize(0) 532 .widenScalarToNextPow2(0, 32); 533 534 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 535 .legalFor({S32, S16, V2S16}) // Clamp modifier 536 .minScalarOrElt(0, S16) 537 .clampMaxNumElements(0, S16, 2) 538 .scalarize(0) 539 .widenScalarToNextPow2(0, 32) 540 .lower(); 541 } else if (ST.has16BitInsts()) { 542 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 543 .legalFor({S32, S16}) 544 .clampScalar(0, S16, S32) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 547 548 // Technically the saturating operations require clamp bit support, but this 549 // was introduced at the same time as 16-bit operations. 550 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 551 .legalFor({S32, S16}) // Clamp modifier 552 .minScalar(0, S16) 553 .scalarize(0) 554 .widenScalarToNextPow2(0, 16) 555 .lower(); 556 557 // We're just lowering this, but it helps get a better result to try to 558 // coerce to the desired type first. 
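    // For example, an s8 G_SADDSAT is first widened to s16 here, so the
    // generic lowering below operates on a type with native 16-bit
    // instructions.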
559 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 560 .minScalar(0, S16) 561 .scalarize(0) 562 .lower(); 563 } else { 564 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 565 .legalFor({S32}) 566 .clampScalar(0, S32, S32) 567 .scalarize(0); 568 569 if (ST.hasIntClamp()) { 570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 571 .legalFor({S32}) // Clamp modifier. 572 .scalarize(0) 573 .minScalarOrElt(0, S32) 574 .lower(); 575 } else { 576 // Clamp bit support was added in VI, along with 16-bit operations. 577 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 578 .minScalar(0, S32) 579 .scalarize(0) 580 .lower(); 581 } 582 583 // FIXME: DAG expansion gets better results. The widening uses the smaller 584 // range values and goes for the min/max lowering directly. 585 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 586 .minScalar(0, S32) 587 .scalarize(0) 588 .lower(); 589 } 590 591 getActionDefinitionsBuilder( 592 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 593 .customFor({S32, S64}) 594 .clampScalar(0, S32, S64) 595 .widenScalarToNextPow2(0, 32) 596 .scalarize(0); 597 598 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 599 .legalFor({S32}) 600 .maxScalarOrElt(0, S32); 601 602 if (ST.hasVOP3PInsts()) { 603 Mulh 604 .clampMaxNumElements(0, S8, 2) 605 .lowerFor({V2S8}); 606 } 607 608 Mulh 609 .scalarize(0) 610 .lower(); 611 612 // Report legal for any types we can handle anywhere. For the cases only legal 613 // on the SALU, RegBankSelect will be able to re-legalize. 614 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 615 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 616 .clampScalar(0, S32, S64) 617 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 618 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 619 .widenScalarToNextPow2(0) 620 .scalarize(0); 621 622 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 623 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 624 .legalFor({{S32, S1}, {S32, S32}}) 625 .minScalar(0, S32) 626 // TODO: .scalarize(0) 627 .lower(); 628 629 getActionDefinitionsBuilder(G_BITCAST) 630 // Don't worry about the size constraint. 631 .legalIf(all(isRegisterType(0), isRegisterType(1))) 632 .lower(); 633 634 635 getActionDefinitionsBuilder(G_CONSTANT) 636 .legalFor({S1, S32, S64, S16, GlobalPtr, 637 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 638 .legalIf(isPointer(0)) 639 .clampScalar(0, S32, S64) 640 .widenScalarToNextPow2(0); 641 642 getActionDefinitionsBuilder(G_FCONSTANT) 643 .legalFor({S32, S64, S16}) 644 .clampScalar(0, S16, S64); 645 646 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 647 .legalIf(isRegisterType(0)) 648 // s1 and s16 are special cases because they have legal operations on 649 // them, but don't really occupy registers in the normal way. 650 .legalFor({S1, S16}) 651 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 652 .clampScalarOrElt(0, S32, MaxScalar) 653 .widenScalarToNextPow2(0, 32) 654 .clampMaxNumElements(0, S32, 16); 655 656 LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr}, 657 LegacyLegalizeActions::Legal); 658 659 // If the amount is divergent, we have to do a wave reduction to get the 660 // maximum value, so this is expanded during RegBankSelect. 
661 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 662 .legalFor({{PrivatePtr, S32}}); 663 664 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 665 .customIf(typeIsNot(0, PrivatePtr)); 666 667 LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal); 668 669 auto &FPOpActions = getActionDefinitionsBuilder( 670 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 671 .legalFor({S32, S64}); 672 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 673 .customFor({S32, S64}); 674 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 675 .customFor({S32, S64}); 676 677 if (ST.has16BitInsts()) { 678 if (ST.hasVOP3PInsts()) 679 FPOpActions.legalFor({S16, V2S16}); 680 else 681 FPOpActions.legalFor({S16}); 682 683 TrigActions.customFor({S16}); 684 FDIVActions.customFor({S16}); 685 } 686 687 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 688 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 689 690 if (ST.hasVOP3PInsts()) { 691 MinNumMaxNum.customFor(FPTypesPK16) 692 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 693 .clampMaxNumElements(0, S16, 2) 694 .clampScalar(0, S16, S64) 695 .scalarize(0); 696 } else if (ST.has16BitInsts()) { 697 MinNumMaxNum.customFor(FPTypes16) 698 .clampScalar(0, S16, S64) 699 .scalarize(0); 700 } else { 701 MinNumMaxNum.customFor(FPTypesBase) 702 .clampScalar(0, S32, S64) 703 .scalarize(0); 704 } 705 706 if (ST.hasVOP3PInsts()) 707 FPOpActions.clampMaxNumElements(0, S16, 2); 708 709 FPOpActions 710 .scalarize(0) 711 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 712 713 TrigActions 714 .scalarize(0) 715 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 716 717 FDIVActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 722 .legalFor(FPTypesPK16) 723 .clampMaxNumElements(0, S16, 2) 724 .scalarize(0) 725 .clampScalar(0, S16, S64); 726 727 if (ST.has16BitInsts()) { 728 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 729 .legalFor({S32, S64, S16}) 730 .scalarize(0) 731 .clampScalar(0, S16, S64); 732 } else { 733 getActionDefinitionsBuilder(G_FSQRT) 734 .legalFor({S32, S64}) 735 .scalarize(0) 736 .clampScalar(0, S32, S64); 737 738 if (ST.hasFractBug()) { 739 getActionDefinitionsBuilder(G_FFLOOR) 740 .customFor({S64}) 741 .legalFor({S32, S64}) 742 .scalarize(0) 743 .clampScalar(0, S32, S64); 744 } else { 745 getActionDefinitionsBuilder(G_FFLOOR) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } 750 } 751 752 getActionDefinitionsBuilder(G_FPTRUNC) 753 .legalFor({{S32, S64}, {S16, S32}}) 754 .scalarize(0) 755 .lower(); 756 757 getActionDefinitionsBuilder(G_FPEXT) 758 .legalFor({{S64, S32}, {S32, S16}}) 759 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 760 .scalarize(0); 761 762 getActionDefinitionsBuilder(G_FSUB) 763 // Use actual fsub instruction 764 .legalFor({S32}) 765 // Must use fadd + fneg 766 .lowerFor({S64, S16, V2S16}) 767 .scalarize(0) 768 .clampScalar(0, S32, S64); 769 770 // Whether this is legal depends on the floating point mode for the function. 
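  // (Not shown here: legalizeFMad keeps G_FMAD only when the function flushes
  // the relevant denormals, and otherwise expands it.)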
771 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 772 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 773 FMad.customFor({S32, S16}); 774 else if (ST.hasMadMacF32Insts()) 775 FMad.customFor({S32}); 776 else if (ST.hasMadF16()) 777 FMad.customFor({S16}); 778 FMad.scalarize(0) 779 .lower(); 780 781 auto &FRem = getActionDefinitionsBuilder(G_FREM); 782 if (ST.has16BitInsts()) { 783 FRem.customFor({S16, S32, S64}); 784 } else { 785 FRem.minScalar(0, S32) 786 .customFor({S32, S64}); 787 } 788 FRem.scalarize(0); 789 790 // TODO: Do we need to clamp maximum bitwidth? 791 getActionDefinitionsBuilder(G_TRUNC) 792 .legalIf(isScalar(0)) 793 .legalFor({{V2S16, V2S32}}) 794 .clampMaxNumElements(0, S16, 2) 795 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 796 // situations (like an invalid implicit use), we don't want to infinite loop 797 // in the legalizer. 798 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 799 .alwaysLegal(); 800 801 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 802 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 803 {S32, S1}, {S64, S1}, {S16, S1}}) 804 .scalarize(0) 805 .clampScalar(0, S32, S64) 806 .widenScalarToNextPow2(1, 32); 807 808 // TODO: Split s1->s64 during regbankselect for VALU. 809 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 810 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 811 .lowerFor({{S32, S64}}) 812 .lowerIf(typeIs(1, S1)) 813 .customFor({{S64, S64}}); 814 if (ST.has16BitInsts()) 815 IToFP.legalFor({{S16, S16}}); 816 IToFP.clampScalar(1, S32, S64) 817 .minScalar(0, S32) 818 .scalarize(0) 819 .widenScalarToNextPow2(1); 820 821 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 822 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 823 .customFor({{S64, S64}}) 824 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 825 if (ST.has16BitInsts()) 826 FPToI.legalFor({{S16, S16}}); 827 else 828 FPToI.minScalar(1, S32); 829 830 FPToI.minScalar(0, S32) 831 .widenScalarToNextPow2(0, 32) 832 .scalarize(0) 833 .lower(); 834 835 // Lower roundeven into G_FRINT 836 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 837 .scalarize(0) 838 .lower(); 839 840 if (ST.has16BitInsts()) { 841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 842 .legalFor({S16, S32, S64}) 843 .clampScalar(0, S16, S64) 844 .scalarize(0); 845 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 847 .legalFor({S32, S64}) 848 .clampScalar(0, S32, S64) 849 .scalarize(0); 850 } else { 851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 852 .legalFor({S32}) 853 .customFor({S64}) 854 .clampScalar(0, S32, S64) 855 .scalarize(0); 856 } 857 858 getActionDefinitionsBuilder(G_PTR_ADD) 859 .legalIf(all(isPointer(0), sameSize(0, 1))) 860 .scalarize(0) 861 .scalarSameSizeAs(1, 0); 862 863 getActionDefinitionsBuilder(G_PTRMASK) 864 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 865 .scalarSameSizeAs(1, 0) 866 .scalarize(0); 867 868 auto &CmpBuilder = 869 getActionDefinitionsBuilder(G_ICMP) 870 // The compare output type differs based on the register bank of the output, 871 // so make both s1 and s32 legal. 872 // 873 // Scalar compares producing output in scc will be promoted to s32, as that 874 // is the allocatable register type that will be needed for the copy from 875 // scc. 
This will be promoted during RegBankSelect, and we assume something 876 // before that won't try to use s32 result types. 877 // 878 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 879 // bank. 880 .legalForCartesianProduct( 881 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 882 .legalForCartesianProduct( 883 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 884 if (ST.has16BitInsts()) { 885 CmpBuilder.legalFor({{S1, S16}}); 886 } 887 888 CmpBuilder 889 .widenScalarToNextPow2(1) 890 .clampScalar(1, S32, S64) 891 .scalarize(0) 892 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 893 894 getActionDefinitionsBuilder(G_FCMP) 895 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 896 .widenScalarToNextPow2(1) 897 .clampScalar(1, S32, S64) 898 .scalarize(0); 899 900 // FIXME: fpow has a selection pattern that should move to custom lowering. 901 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 902 if (ST.has16BitInsts()) 903 Exp2Ops.legalFor({S32, S16}); 904 else 905 Exp2Ops.legalFor({S32}); 906 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 907 Exp2Ops.scalarize(0); 908 909 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 910 if (ST.has16BitInsts()) 911 ExpOps.customFor({{S32}, {S16}}); 912 else 913 ExpOps.customFor({S32}); 914 ExpOps.clampScalar(0, MinScalarFPTy, S32) 915 .scalarize(0); 916 917 getActionDefinitionsBuilder(G_FPOWI) 918 .clampScalar(0, MinScalarFPTy, S32) 919 .lower(); 920 921 // The 64-bit versions produce 32-bit results, but only on the SALU. 922 getActionDefinitionsBuilder(G_CTPOP) 923 .legalFor({{S32, S32}, {S32, S64}}) 924 .clampScalar(0, S32, S32) 925 .clampScalar(1, S32, S64) 926 .scalarize(0) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32); 929 930 // The hardware instructions return a different result on 0 than the generic 931 // instructions expect. The hardware produces -1, but these produce the 932 // bitwidth. 933 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 934 .scalarize(0) 935 .clampScalar(0, S32, S32) 936 .clampScalar(1, S32, S64) 937 .widenScalarToNextPow2(0, 32) 938 .widenScalarToNextPow2(1, 32) 939 .lower(); 940 941 // The 64-bit versions produce 32-bit results, but only on the SALU. 942 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 943 .legalFor({{S32, S32}, {S32, S64}}) 944 .clampScalar(0, S32, S32) 945 .clampScalar(1, S32, S64) 946 .scalarize(0) 947 .widenScalarToNextPow2(0, 32) 948 .widenScalarToNextPow2(1, 32); 949 950 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 951 // RegBankSelect. 952 getActionDefinitionsBuilder(G_BITREVERSE) 953 .legalFor({S32, S64}) 954 .clampScalar(0, S32, S64) 955 .scalarize(0) 956 .widenScalarToNextPow2(0); 957 958 if (ST.has16BitInsts()) { 959 getActionDefinitionsBuilder(G_BSWAP) 960 .legalFor({S16, S32, V2S16}) 961 .clampMaxNumElements(0, S16, 2) 962 // FIXME: Fixing non-power-of-2 before clamp is workaround for 963 // narrowScalar limitation. 
964 .widenScalarToNextPow2(0) 965 .clampScalar(0, S16, S32) 966 .scalarize(0); 967 968 if (ST.hasVOP3PInsts()) { 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 970 .legalFor({S32, S16, V2S16}) 971 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 972 .clampMaxNumElements(0, S16, 2) 973 .minScalar(0, S16) 974 .widenScalarToNextPow2(0) 975 .scalarize(0) 976 .lower(); 977 } else { 978 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 979 .legalFor({S32, S16}) 980 .widenScalarToNextPow2(0) 981 .minScalar(0, S16) 982 .scalarize(0) 983 .lower(); 984 } 985 } else { 986 // TODO: Should have same legality without v_perm_b32 987 getActionDefinitionsBuilder(G_BSWAP) 988 .legalFor({S32}) 989 .lowerIf(scalarNarrowerThan(0, 32)) 990 // FIXME: Fixing non-power-of-2 before clamp is workaround for 991 // narrowScalar limitation. 992 .widenScalarToNextPow2(0) 993 .maxScalar(0, S32) 994 .scalarize(0) 995 .lower(); 996 997 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 998 .legalFor({S32}) 999 .minScalar(0, S32) 1000 .widenScalarToNextPow2(0) 1001 .scalarize(0) 1002 .lower(); 1003 } 1004 1005 getActionDefinitionsBuilder(G_INTTOPTR) 1006 // List the common cases 1007 .legalForCartesianProduct(AddrSpaces64, {S64}) 1008 .legalForCartesianProduct(AddrSpaces32, {S32}) 1009 .scalarize(0) 1010 // Accept any address space as long as the size matches 1011 .legalIf(sameSize(0, 1)) 1012 .widenScalarIf(smallerThan(1, 0), 1013 [](const LegalityQuery &Query) { 1014 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1015 }) 1016 .narrowScalarIf(largerThan(1, 0), 1017 [](const LegalityQuery &Query) { 1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1019 }); 1020 1021 getActionDefinitionsBuilder(G_PTRTOINT) 1022 // List the common cases 1023 .legalForCartesianProduct(AddrSpaces64, {S64}) 1024 .legalForCartesianProduct(AddrSpaces32, {S32}) 1025 .scalarize(0) 1026 // Accept any address space as long as the size matches 1027 .legalIf(sameSize(0, 1)) 1028 .widenScalarIf(smallerThan(0, 1), 1029 [](const LegalityQuery &Query) { 1030 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1031 }) 1032 .narrowScalarIf( 1033 largerThan(0, 1), 1034 [](const LegalityQuery &Query) { 1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1036 }); 1037 1038 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1039 .scalarize(0) 1040 .custom(); 1041 1042 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1043 bool IsLoad) -> bool { 1044 const LLT DstTy = Query.Types[0]; 1045 1046 // Split vector extloads. 1047 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1048 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1049 1050 if (MemSize < DstTy.getSizeInBits()) 1051 MemSize = std::max(MemSize, AlignBits); 1052 1053 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1054 return true; 1055 1056 const LLT PtrTy = Query.Types[1]; 1057 unsigned AS = PtrTy.getAddressSpace(); 1058 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1059 return true; 1060 1061 // Catch weird sized loads that don't evenly divide into the access sizes 1062 // TODO: May be able to widen depending on alignment etc. 1063 unsigned NumRegs = (MemSize + 31) / 32; 1064 if (NumRegs == 3) { 1065 if (!ST.hasDwordx3LoadStores()) 1066 return true; 1067 } else { 1068 // If the alignment allows, these should have been widened. 
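      // e.g. a 5-, 6- or 7-dword access is still split, since it does not
      // cover a power-of-2 number of 32-bit registers.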
1069 if (!isPowerOf2_32(NumRegs)) 1070 return true; 1071 } 1072 1073 if (AlignBits < MemSize) { 1074 const SITargetLowering *TLI = ST.getTargetLowering(); 1075 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1076 Align(AlignBits / 8)); 1077 } 1078 1079 return false; 1080 }; 1081 1082 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1083 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1084 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1085 1086 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1087 // LDS 1088 // TODO: Unsupported flat for SI. 1089 1090 for (unsigned Op : {G_LOAD, G_STORE}) { 1091 const bool IsStore = Op == G_STORE; 1092 1093 auto &Actions = getActionDefinitionsBuilder(Op); 1094 // Explicitly list some common cases. 1095 // TODO: Does this help compile time at all? 1096 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1097 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1098 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1099 {S64, GlobalPtr, 64, GlobalAlign32}, 1100 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1101 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1102 {S32, GlobalPtr, 8, GlobalAlign8}, 1103 {S32, GlobalPtr, 16, GlobalAlign16}, 1104 1105 {S32, LocalPtr, 32, 32}, 1106 {S64, LocalPtr, 64, 32}, 1107 {V2S32, LocalPtr, 64, 32}, 1108 {S32, LocalPtr, 8, 8}, 1109 {S32, LocalPtr, 16, 16}, 1110 {V2S16, LocalPtr, 32, 32}, 1111 1112 {S32, PrivatePtr, 32, 32}, 1113 {S32, PrivatePtr, 8, 8}, 1114 {S32, PrivatePtr, 16, 16}, 1115 {V2S16, PrivatePtr, 32, 32}, 1116 1117 {S32, ConstantPtr, 32, GlobalAlign32}, 1118 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1119 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1120 {S64, ConstantPtr, 64, GlobalAlign32}, 1121 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1122 Actions.legalIf( 1123 [=](const LegalityQuery &Query) -> bool { 1124 return isLoadStoreLegal(ST, Query); 1125 }); 1126 1127 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1128 // 64-bits. 1129 // 1130 // TODO: Should generalize bitcast action into coerce, which will also cover 1131 // inserting addrspacecasts. 1132 Actions.customIf(typeIs(1, Constant32Ptr)); 1133 1134 // Turn any illegal element vectors into something easier to deal 1135 // with. These will ultimately produce 32-bit scalar shifts to extract the 1136 // parts anyway. 1137 // 1138 // For odd 16-bit element vectors, prefer to split those into pieces with 1139 // 16-bit vector parts. 1140 Actions.bitcastIf( 1141 [=](const LegalityQuery &Query) -> bool { 1142 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1143 Query.MMODescrs[0].SizeInBits); 1144 }, bitcastToRegisterType(0)); 1145 1146 if (!IsStore) { 1147 // Widen suitably aligned loads by loading extra bytes. The standard 1148 // legalization actions can't properly express widening memory operands. 
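      // For example (illustrative), a sufficiently aligned 96-bit global load
      // on a subtarget without dwordx3 loads may be emitted as a 128-bit load
      // whose extra bits are discarded; see shouldWidenLoad for the exact
      // conditions.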
1149 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1150 return shouldWidenLoad(ST, Query, G_LOAD); 1151 }); 1152 } 1153 1154 // FIXME: load/store narrowing should be moved to lower action 1155 Actions 1156 .narrowScalarIf( 1157 [=](const LegalityQuery &Query) -> bool { 1158 return !Query.Types[0].isVector() && 1159 needToSplitMemOp(Query, Op == G_LOAD); 1160 }, 1161 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1162 const LLT DstTy = Query.Types[0]; 1163 const LLT PtrTy = Query.Types[1]; 1164 1165 const unsigned DstSize = DstTy.getSizeInBits(); 1166 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1167 1168 // Split extloads. 1169 if (DstSize > MemSize) 1170 return std::make_pair(0, LLT::scalar(MemSize)); 1171 1172 if (!isPowerOf2_32(DstSize)) { 1173 // We're probably decomposing an odd sized store. Try to split 1174 // to the widest type. TODO: Account for alignment. As-is it 1175 // should be OK, since the new parts will be further legalized. 1176 unsigned FloorSize = PowerOf2Floor(DstSize); 1177 return std::make_pair(0, LLT::scalar(FloorSize)); 1178 } 1179 1180 if (DstSize > 32 && (DstSize % 32 != 0)) { 1181 // FIXME: Need a way to specify non-extload of larger size if 1182 // suitably aligned. 1183 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1184 } 1185 1186 unsigned MaxSize = maxSizeForAddrSpace(ST, 1187 PtrTy.getAddressSpace(), 1188 Op == G_LOAD); 1189 if (MemSize > MaxSize) 1190 return std::make_pair(0, LLT::scalar(MaxSize)); 1191 1192 unsigned Align = Query.MMODescrs[0].AlignInBits; 1193 return std::make_pair(0, LLT::scalar(Align)); 1194 }) 1195 .fewerElementsIf( 1196 [=](const LegalityQuery &Query) -> bool { 1197 return Query.Types[0].isVector() && 1198 needToSplitMemOp(Query, Op == G_LOAD); 1199 }, 1200 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1201 const LLT DstTy = Query.Types[0]; 1202 const LLT PtrTy = Query.Types[1]; 1203 1204 LLT EltTy = DstTy.getElementType(); 1205 unsigned MaxSize = maxSizeForAddrSpace(ST, 1206 PtrTy.getAddressSpace(), 1207 Op == G_LOAD); 1208 1209 // FIXME: Handle widened to power of 2 results better. This ends 1210 // up scalarizing. 1211 // FIXME: 3 element stores scalarized on SI 1212 1213 // Split if it's too large for the address space. 1214 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1215 unsigned NumElts = DstTy.getNumElements(); 1216 unsigned EltSize = EltTy.getSizeInBits(); 1217 1218 if (MaxSize % EltSize == 0) { 1219 return std::make_pair( 1220 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1221 } 1222 1223 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1224 1225 // FIXME: Refine when odd breakdowns handled 1226 // The scalars will need to be re-legalized. 1227 if (NumPieces == 1 || NumPieces >= NumElts || 1228 NumElts % NumPieces != 0) 1229 return std::make_pair(0, EltTy); 1230 1231 return std::make_pair(0, 1232 LLT::vector(NumElts / NumPieces, EltTy)); 1233 } 1234 1235 // FIXME: We could probably handle weird extending loads better. 1236 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1237 if (DstTy.getSizeInBits() > MemSize) 1238 return std::make_pair(0, EltTy); 1239 1240 unsigned EltSize = EltTy.getSizeInBits(); 1241 unsigned DstSize = DstTy.getSizeInBits(); 1242 if (!isPowerOf2_32(DstSize)) { 1243 // We're probably decomposing an odd sized store. Try to split 1244 // to the widest type. TODO: Account for alignment. As-is it 1245 // should be OK, since the new parts will be further legalized. 
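          // e.g. a <3 x s32> store is first reduced to <2 x s32>; the
          // remaining element is picked up when the pieces are re-legalized.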
1246 unsigned FloorSize = PowerOf2Floor(DstSize); 1247 return std::make_pair( 1248 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1249 } 1250 1251 // Need to split because of alignment. 1252 unsigned Align = Query.MMODescrs[0].AlignInBits; 1253 if (EltSize > Align && 1254 (EltSize / Align < DstTy.getNumElements())) { 1255 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1256 } 1257 1258 // May need relegalization for the scalars. 1259 return std::make_pair(0, EltTy); 1260 }) 1261 .lowerIfMemSizeNotPow2() 1262 .minScalar(0, S32) 1263 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1264 .widenScalarToNextPow2(0) 1265 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1266 .lower(); 1267 } 1268 1269 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1270 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1271 {S32, GlobalPtr, 16, 2 * 8}, 1272 {S32, LocalPtr, 8, 8}, 1273 {S32, LocalPtr, 16, 16}, 1274 {S32, PrivatePtr, 8, 8}, 1275 {S32, PrivatePtr, 16, 16}, 1276 {S32, ConstantPtr, 8, 8}, 1277 {S32, ConstantPtr, 16, 2 * 8}}) 1278 .legalIf( 1279 [=](const LegalityQuery &Query) -> bool { 1280 return isLoadStoreLegal(ST, Query); 1281 }); 1282 1283 if (ST.hasFlatAddressSpace()) { 1284 ExtLoads.legalForTypesWithMemDesc( 1285 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1286 } 1287 1288 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1289 // 64-bits. 1290 // 1291 // TODO: Should generalize bitcast action into coerce, which will also cover 1292 // inserting addrspacecasts. 1293 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1294 1295 ExtLoads.clampScalar(0, S32, S32) 1296 .widenScalarToNextPow2(0) 1297 .unsupportedIfMemSizeNotPow2() 1298 .lower(); 1299 1300 auto &Atomics = getActionDefinitionsBuilder( 1301 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1302 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1303 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1304 G_ATOMICRMW_UMIN}) 1305 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1306 {S64, GlobalPtr}, {S64, LocalPtr}, 1307 {S32, RegionPtr}, {S64, RegionPtr}}); 1308 if (ST.hasFlatAddressSpace()) { 1309 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1310 } 1311 1312 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1313 if (ST.hasLDSFPAtomics()) { 1314 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1315 if (ST.hasGFX90AInsts()) 1316 Atomic.legalFor({{S64, LocalPtr}}); 1317 } 1318 if (ST.hasAtomicFaddInsts()) 1319 Atomic.legalFor({{S32, GlobalPtr}}); 1320 1321 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1322 // demarshalling 1323 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1324 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1325 {S32, FlatPtr}, {S64, FlatPtr}}) 1326 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1327 {S32, RegionPtr}, {S64, RegionPtr}}); 1328 // TODO: Pointer types, any 32-bit or 64-bit vector 1329 1330 // Condition should be s32 for scalar, s1 for vector. 
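  // (As with G_ICMP above, an s32 condition corresponds to a value copied
  // from SCC, while divergent conditions use an s1 in the VCC bank.)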
1331 getActionDefinitionsBuilder(G_SELECT) 1332 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1333 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1334 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1335 .clampScalar(0, S16, S64) 1336 .scalarize(1) 1337 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1338 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1339 .clampMaxNumElements(0, S32, 2) 1340 .clampMaxNumElements(0, LocalPtr, 2) 1341 .clampMaxNumElements(0, PrivatePtr, 2) 1342 .scalarize(0) 1343 .widenScalarToNextPow2(0) 1344 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1345 1346 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1347 // be more flexible with the shift amount type. 1348 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1349 .legalFor({{S32, S32}, {S64, S32}}); 1350 if (ST.has16BitInsts()) { 1351 if (ST.hasVOP3PInsts()) { 1352 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1353 .clampMaxNumElements(0, S16, 2); 1354 } else 1355 Shifts.legalFor({{S16, S16}}); 1356 1357 // TODO: Support 16-bit shift amounts for all types 1358 Shifts.widenScalarIf( 1359 [=](const LegalityQuery &Query) { 1360 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1361 // 32-bit amount. 1362 const LLT ValTy = Query.Types[0]; 1363 const LLT AmountTy = Query.Types[1]; 1364 return ValTy.getSizeInBits() <= 16 && 1365 AmountTy.getSizeInBits() < 16; 1366 }, changeTo(1, S16)); 1367 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1368 Shifts.clampScalar(1, S32, S32); 1369 Shifts.clampScalar(0, S16, S64); 1370 Shifts.widenScalarToNextPow2(0, 16); 1371 1372 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1373 .minScalar(0, S16) 1374 .scalarize(0) 1375 .lower(); 1376 } else { 1377 // Make sure we legalize the shift amount type first, as the general 1378 // expansion for the shifted type will produce much worse code if it hasn't 1379 // been truncated already. 1380 Shifts.clampScalar(1, S32, S32); 1381 Shifts.clampScalar(0, S32, S64); 1382 Shifts.widenScalarToNextPow2(0, 32); 1383 1384 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1385 .minScalar(0, S32) 1386 .scalarize(0) 1387 .lower(); 1388 } 1389 Shifts.scalarize(0); 1390 1391 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1392 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1393 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1394 unsigned IdxTypeIdx = 2; 1395 1396 getActionDefinitionsBuilder(Op) 1397 .customIf([=](const LegalityQuery &Query) { 1398 const LLT EltTy = Query.Types[EltTypeIdx]; 1399 const LLT VecTy = Query.Types[VecTypeIdx]; 1400 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1401 const unsigned EltSize = EltTy.getSizeInBits(); 1402 return (EltSize == 32 || EltSize == 64) && 1403 VecTy.getSizeInBits() % 32 == 0 && 1404 VecTy.getSizeInBits() <= MaxRegisterSize && 1405 IdxTy.getSizeInBits() == 32; 1406 }) 1407 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1408 bitcastToVectorElement32(VecTypeIdx)) 1409 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1410 .bitcastIf( 1411 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1412 [=](const LegalityQuery &Query) { 1413 // For > 64-bit element types, try to turn this into a 64-bit 1414 // element vector since we may be able to do better indexing 1415 // if this is scalar. If not, fall back to 32. 
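          // e.g. <2 x s128> is recast as <4 x s64> for indexing purposes,
          // while <3 x s96> falls back to <9 x s32>.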
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1510 if (ST.hasVOP3PInsts()) { 1511 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1512 .customFor({V2S16, V2S16}) 1513 .lower(); 1514 } else 1515 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1516 1517 // Merge/Unmerge 1518 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1519 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1520 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1521 1522 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1523 const LLT Ty = Query.Types[TypeIdx]; 1524 if (Ty.isVector()) { 1525 const LLT &EltTy = Ty.getElementType(); 1526 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1527 return true; 1528 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1529 return true; 1530 } 1531 return false; 1532 }; 1533 1534 auto &Builder = getActionDefinitionsBuilder(Op) 1535 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1536 .lowerFor({{S16, V2S16}}) 1537 .lowerIf([=](const LegalityQuery &Query) { 1538 const LLT BigTy = Query.Types[BigTyIdx]; 1539 return BigTy.getSizeInBits() == 32; 1540 }) 1541 // Try to widen to s16 first for small types. 1542 // TODO: Only do this on targets with legal s16 shifts 1543 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1544 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1545 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1546 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1547 elementTypeIs(1, S16)), 1548 changeTo(1, V2S16)) 1549 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1550 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1551 // valid. 1552 .clampScalar(LitTyIdx, S32, S512) 1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1554 // Break up vectors with weird elements into scalars 1555 .fewerElementsIf( 1556 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1557 scalarize(0)) 1558 .fewerElementsIf( 1559 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1560 scalarize(1)) 1561 .clampScalar(BigTyIdx, S32, MaxScalar); 1562 1563 if (Op == G_MERGE_VALUES) { 1564 Builder.widenScalarIf( 1565 // TODO: Use 16-bit shifts if legal for 8-bit values? 1566 [=](const LegalityQuery &Query) { 1567 const LLT Ty = Query.Types[LitTyIdx]; 1568 return Ty.getSizeInBits() < 32; 1569 }, 1570 changeTo(LitTyIdx, S32)); 1571 } 1572 1573 Builder.widenScalarIf( 1574 [=](const LegalityQuery &Query) { 1575 const LLT Ty = Query.Types[BigTyIdx]; 1576 return !isPowerOf2_32(Ty.getSizeInBits()) && 1577 Ty.getSizeInBits() % 16 != 0; 1578 }, 1579 [=](const LegalityQuery &Query) { 1580 // Pick the next power of 2, or a multiple of 64 over 128. 1581 // Whichever is smaller. 1582 const LLT &Ty = Query.Types[BigTyIdx]; 1583 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1584 if (NewSizeInBits >= 256) { 1585 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1586 if (RoundedTo < NewSizeInBits) 1587 NewSizeInBits = RoundedTo; 1588 } 1589 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1590 }) 1591 // Any vectors left are the wrong size. Scalarize them. 1592 .scalarize(0) 1593 .scalarize(1); 1594 } 1595 1596 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1597 // RegBankSelect. 
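  // (For 16-bit element vectors, the fewerElementsIf rule below prefers to
  // split to v2s16 pieces before lowering, so the expansion can use packed
  // 16-bit shifts.)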
1598 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1599 .legalFor({{S32}, {S64}}); 1600 1601 if (ST.hasVOP3PInsts()) { 1602 SextInReg.lowerFor({{V2S16}}) 1603 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1604 // get more vector shift opportunities, since we'll get those when 1605 // expanded. 1606 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1607 } else if (ST.has16BitInsts()) { 1608 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1609 } else { 1610 // Prefer to promote to s32 before lowering if we don't have 16-bit 1611 // shifts. This avoid a lot of intermediate truncate and extend operations. 1612 SextInReg.lowerFor({{S32}, {S64}}); 1613 } 1614 1615 SextInReg 1616 .scalarize(0) 1617 .clampScalar(0, S32, S64) 1618 .lower(); 1619 1620 // TODO: Only Try to form v2s16 with legal packed instructions. 1621 getActionDefinitionsBuilder(G_FSHR) 1622 .legalFor({{S32, S32}}) 1623 .lowerFor({{V2S16, V2S16}}) 1624 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1625 .scalarize(0) 1626 .lower(); 1627 1628 if (ST.hasVOP3PInsts()) { 1629 getActionDefinitionsBuilder(G_FSHL) 1630 .lowerFor({{V2S16, V2S16}}) 1631 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1632 .scalarize(0) 1633 .lower(); 1634 } else { 1635 getActionDefinitionsBuilder(G_FSHL) 1636 .scalarize(0) 1637 .lower(); 1638 } 1639 1640 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1641 .legalFor({S64}); 1642 1643 getActionDefinitionsBuilder(G_FENCE) 1644 .alwaysLegal(); 1645 1646 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1647 .scalarize(0) 1648 .minScalar(0, S32) 1649 .lower(); 1650 1651 getActionDefinitionsBuilder({ 1652 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1653 G_FCOPYSIGN, 1654 1655 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1656 G_ATOMICRMW_NAND, 1657 G_ATOMICRMW_FSUB, 1658 G_READ_REGISTER, 1659 G_WRITE_REGISTER, 1660 1661 G_SADDO, G_SSUBO, 1662 1663 // TODO: Implement 1664 G_FMINIMUM, G_FMAXIMUM}).lower(); 1665 1666 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1667 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1668 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1669 .unsupported(); 1670 1671 getLegacyLegalizerInfo().computeTables(); 1672 verify(*ST.getInstrInfo()); 1673 } 1674 1675 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1676 MachineInstr &MI) const { 1677 MachineIRBuilder &B = Helper.MIRBuilder; 1678 MachineRegisterInfo &MRI = *B.getMRI(); 1679 1680 switch (MI.getOpcode()) { 1681 case TargetOpcode::G_ADDRSPACE_CAST: 1682 return legalizeAddrSpaceCast(MI, MRI, B); 1683 case TargetOpcode::G_FRINT: 1684 return legalizeFrint(MI, MRI, B); 1685 case TargetOpcode::G_FCEIL: 1686 return legalizeFceil(MI, MRI, B); 1687 case TargetOpcode::G_FREM: 1688 return legalizeFrem(MI, MRI, B); 1689 case TargetOpcode::G_INTRINSIC_TRUNC: 1690 return legalizeIntrinsicTrunc(MI, MRI, B); 1691 case TargetOpcode::G_SITOFP: 1692 return legalizeITOFP(MI, MRI, B, true); 1693 case TargetOpcode::G_UITOFP: 1694 return legalizeITOFP(MI, MRI, B, false); 1695 case TargetOpcode::G_FPTOSI: 1696 return legalizeFPTOI(MI, MRI, B, true); 1697 case TargetOpcode::G_FPTOUI: 1698 return legalizeFPTOI(MI, MRI, B, false); 1699 case TargetOpcode::G_FMINNUM: 1700 case TargetOpcode::G_FMAXNUM: 1701 case TargetOpcode::G_FMINNUM_IEEE: 1702 case TargetOpcode::G_FMAXNUM_IEEE: 1703 return legalizeMinNumMaxNum(Helper, MI); 1704 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1705 return legalizeExtractVectorElt(MI, MRI, B); 1706 case TargetOpcode::G_INSERT_VECTOR_ELT: 1707 
return legalizeInsertVectorElt(MI, MRI, B); 1708 case TargetOpcode::G_SHUFFLE_VECTOR: 1709 return legalizeShuffleVector(MI, MRI, B); 1710 case TargetOpcode::G_FSIN: 1711 case TargetOpcode::G_FCOS: 1712 return legalizeSinCos(MI, MRI, B); 1713 case TargetOpcode::G_GLOBAL_VALUE: 1714 return legalizeGlobalValue(MI, MRI, B); 1715 case TargetOpcode::G_LOAD: 1716 case TargetOpcode::G_SEXTLOAD: 1717 case TargetOpcode::G_ZEXTLOAD: 1718 return legalizeLoad(Helper, MI); 1719 case TargetOpcode::G_FMAD: 1720 return legalizeFMad(MI, MRI, B); 1721 case TargetOpcode::G_FDIV: 1722 return legalizeFDIV(MI, MRI, B); 1723 case TargetOpcode::G_UDIV: 1724 case TargetOpcode::G_UREM: 1725 case TargetOpcode::G_UDIVREM: 1726 return legalizeUnsignedDIV_REM(MI, MRI, B); 1727 case TargetOpcode::G_SDIV: 1728 case TargetOpcode::G_SREM: 1729 case TargetOpcode::G_SDIVREM: 1730 return legalizeSignedDIV_REM(MI, MRI, B); 1731 case TargetOpcode::G_ATOMIC_CMPXCHG: 1732 return legalizeAtomicCmpXChg(MI, MRI, B); 1733 case TargetOpcode::G_FLOG: 1734 return legalizeFlog(MI, B, numbers::ln2f); 1735 case TargetOpcode::G_FLOG10: 1736 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1737 case TargetOpcode::G_FEXP: 1738 return legalizeFExp(MI, B); 1739 case TargetOpcode::G_FPOW: 1740 return legalizeFPow(MI, B); 1741 case TargetOpcode::G_FFLOOR: 1742 return legalizeFFloor(MI, MRI, B); 1743 case TargetOpcode::G_BUILD_VECTOR: 1744 return legalizeBuildVector(MI, MRI, B); 1745 default: 1746 return false; 1747 } 1748 1749 llvm_unreachable("expected switch to return"); 1750 } 1751 1752 Register AMDGPULegalizerInfo::getSegmentAperture( 1753 unsigned AS, 1754 MachineRegisterInfo &MRI, 1755 MachineIRBuilder &B) const { 1756 MachineFunction &MF = B.getMF(); 1757 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1758 const LLT S32 = LLT::scalar(32); 1759 1760 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1761 1762 if (ST.hasApertureRegs()) { 1763 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1764 // getreg. 1765 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1766 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1767 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1768 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1769 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1770 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1771 unsigned Encoding = 1772 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1773 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1774 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1775 1776 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1777 1778 B.buildInstr(AMDGPU::S_GETREG_B32) 1779 .addDef(GetReg) 1780 .addImm(Encoding); 1781 MRI.setType(GetReg, S32); 1782 1783 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1784 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1785 } 1786 1787 Register QueuePtr = MRI.createGenericVirtualRegister( 1788 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1789 1790 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1791 return Register(); 1792 1793 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1794 // private_segment_aperture_base_hi. 1795 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1796 1797 // TODO: can we be smarter about machine pointer info? 
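// Without aperture registers, the aperture base is instead read from the
// queue descriptor: the 4-byte field at the offset computed above is loaded
// below as a dereferenceable, invariant constant-address access.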
1798 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1799 MachineMemOperand *MMO = MF.getMachineMemOperand( 1800 PtrInfo, 1801 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1802 MachineMemOperand::MOInvariant, 1803 4, commonAlignment(Align(64), StructOffset)); 1804 1805 Register LoadAddr; 1806 1807 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1808 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1809 } 1810 1811 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1812 MachineInstr &MI, MachineRegisterInfo &MRI, 1813 MachineIRBuilder &B) const { 1814 MachineFunction &MF = B.getMF(); 1815 1816 const LLT S32 = LLT::scalar(32); 1817 Register Dst = MI.getOperand(0).getReg(); 1818 Register Src = MI.getOperand(1).getReg(); 1819 1820 LLT DstTy = MRI.getType(Dst); 1821 LLT SrcTy = MRI.getType(Src); 1822 unsigned DestAS = DstTy.getAddressSpace(); 1823 unsigned SrcAS = SrcTy.getAddressSpace(); 1824 1825 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1826 // vector element. 1827 assert(!DstTy.isVector()); 1828 1829 const AMDGPUTargetMachine &TM 1830 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1831 1832 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1833 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1834 return true; 1835 } 1836 1837 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1838 // Truncate. 1839 B.buildExtract(Dst, Src, 0); 1840 MI.eraseFromParent(); 1841 return true; 1842 } 1843 1844 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1845 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1846 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1847 1848 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1849 // another. Merge operands are required to be the same type, but creating an 1850 // extra ptrtoint would be kind of pointless. 1851 auto HighAddr = B.buildConstant( 1852 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1853 B.buildMerge(Dst, {Src, HighAddr}); 1854 MI.eraseFromParent(); 1855 return true; 1856 } 1857 1858 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1859 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1860 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1861 unsigned NullVal = TM.getNullPointerValue(DestAS); 1862 1863 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1864 auto FlatNull = B.buildConstant(SrcTy, 0); 1865 1866 // Extract low 32-bits of the pointer. 1867 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1868 1869 auto CmpRes = 1870 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1871 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1872 1873 MI.eraseFromParent(); 1874 return true; 1875 } 1876 1877 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1878 return false; 1879 1880 if (!ST.hasFlatAddressSpace()) 1881 return false; 1882 1883 auto SegmentNull = 1884 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1885 auto FlatNull = 1886 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1887 1888 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1889 if (!ApertureReg.isValid()) 1890 return false; 1891 1892 auto CmpRes = 1893 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1894 1895 // Coerce the type of the low half of the result so we can use merge_values. 
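// The resulting flat pointer is assembled from two 32-bit halves: the low
// half is the original segment offset and the high half is the aperture base
// obtained above (i.e. roughly flat = (aperture << 32) | offset).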
1896 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1897 1898 // TODO: Should we allow mismatched types but matching sizes in merges to 1899 // avoid the ptrtoint? 1900 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1901 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1902 1903 MI.eraseFromParent(); 1904 return true; 1905 } 1906 1907 bool AMDGPULegalizerInfo::legalizeFrint( 1908 MachineInstr &MI, MachineRegisterInfo &MRI, 1909 MachineIRBuilder &B) const { 1910 Register Src = MI.getOperand(1).getReg(); 1911 LLT Ty = MRI.getType(Src); 1912 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1913 1914 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1915 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1916 1917 auto C1 = B.buildFConstant(Ty, C1Val); 1918 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1919 1920 // TODO: Should this propagate fast-math-flags? 1921 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1922 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1923 1924 auto C2 = B.buildFConstant(Ty, C2Val); 1925 auto Fabs = B.buildFAbs(Ty, Src); 1926 1927 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1928 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1929 MI.eraseFromParent(); 1930 return true; 1931 } 1932 1933 bool AMDGPULegalizerInfo::legalizeFceil( 1934 MachineInstr &MI, MachineRegisterInfo &MRI, 1935 MachineIRBuilder &B) const { 1936 1937 const LLT S1 = LLT::scalar(1); 1938 const LLT S64 = LLT::scalar(64); 1939 1940 Register Src = MI.getOperand(1).getReg(); 1941 assert(MRI.getType(Src) == S64); 1942 1943 // result = trunc(src) 1944 // if (src > 0.0 && src != result) 1945 // result += 1.0 1946 1947 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1948 1949 const auto Zero = B.buildFConstant(S64, 0.0); 1950 const auto One = B.buildFConstant(S64, 1.0); 1951 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1952 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1953 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1954 auto Add = B.buildSelect(S64, And, One, Zero); 1955 1956 // TODO: Should this propagate fast-math-flags? 
1957 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); MI.eraseFromParent(); 1958 return true; 1959 } 1960 1961 bool AMDGPULegalizerInfo::legalizeFrem( 1962 MachineInstr &MI, MachineRegisterInfo &MRI, 1963 MachineIRBuilder &B) const { 1964 Register DstReg = MI.getOperand(0).getReg(); 1965 Register Src0Reg = MI.getOperand(1).getReg(); 1966 Register Src1Reg = MI.getOperand(2).getReg(); 1967 auto Flags = MI.getFlags(); 1968 LLT Ty = MRI.getType(DstReg); 1969 1970 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1971 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1972 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1973 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1974 MI.eraseFromParent(); 1975 return true; 1976 } 1977 1978 static MachineInstrBuilder extractF64Exponent(Register Hi, 1979 MachineIRBuilder &B) { 1980 const unsigned FractBits = 52; 1981 const unsigned ExpBits = 11; 1982 LLT S32 = LLT::scalar(32); 1983 1984 auto Const0 = B.buildConstant(S32, FractBits - 32); 1985 auto Const1 = B.buildConstant(S32, ExpBits); 1986 1987 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1988 .addUse(Hi) 1989 .addUse(Const0.getReg(0)) 1990 .addUse(Const1.getReg(0)); 1991 1992 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1993 } 1994 1995 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1996 MachineInstr &MI, MachineRegisterInfo &MRI, 1997 MachineIRBuilder &B) const { 1998 const LLT S1 = LLT::scalar(1); 1999 const LLT S32 = LLT::scalar(32); 2000 const LLT S64 = LLT::scalar(64); 2001 2002 Register Src = MI.getOperand(1).getReg(); 2003 assert(MRI.getType(Src) == S64); 2004 2005 // TODO: Should this use extract since the low half is unused? 2006 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2007 Register Hi = Unmerge.getReg(1); 2008 2009 // Extract the upper half, since this is where we will find the sign and 2010 // exponent. 2011 auto Exp = extractF64Exponent(Hi, B); 2012 2013 const unsigned FractBits = 52; 2014 2015 // Extract the sign bit. 2016 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2017 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2018 2019 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2020 2021 const auto Zero32 = B.buildConstant(S32, 0); 2022 2023 // Extend back to 64-bits. 2024 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2025 2026 auto Shr = B.buildAShr(S64, FractMask, Exp); 2027 auto Not = B.buildNot(S64, Shr); 2028 auto Tmp0 = B.buildAnd(S64, Src, Not); 2029 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2030 2031 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2032 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2033 2034 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2035 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2036 MI.eraseFromParent(); 2037 return true; 2038 } 2039 2040 bool AMDGPULegalizerInfo::legalizeITOFP( 2041 MachineInstr &MI, MachineRegisterInfo &MRI, 2042 MachineIRBuilder &B, bool Signed) const { 2043 2044 Register Dst = MI.getOperand(0).getReg(); 2045 Register Src = MI.getOperand(1).getReg(); 2046 2047 const LLT S64 = LLT::scalar(64); 2048 const LLT S32 = LLT::scalar(32); 2049 2050 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2051 2052 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2053 2054 auto CvtHi = Signed ?
2055 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2056 B.buildUITOFP(S64, Unmerge.getReg(1)); 2057 2058 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2059 2060 auto ThirtyTwo = B.buildConstant(S32, 32); 2061 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2062 .addUse(CvtHi.getReg(0)) 2063 .addUse(ThirtyTwo.getReg(0)); 2064 2065 // TODO: Should this propagate fast-math-flags? 2066 B.buildFAdd(Dst, LdExp, CvtLo); 2067 MI.eraseFromParent(); 2068 return true; 2069 } 2070 2071 // TODO: Copied from DAG implementation. Verify logic and document how this 2072 // actually works. 2073 bool AMDGPULegalizerInfo::legalizeFPTOI( 2074 MachineInstr &MI, MachineRegisterInfo &MRI, 2075 MachineIRBuilder &B, bool Signed) const { 2076 2077 Register Dst = MI.getOperand(0).getReg(); 2078 Register Src = MI.getOperand(1).getReg(); 2079 2080 const LLT S64 = LLT::scalar(64); 2081 const LLT S32 = LLT::scalar(32); 2082 2083 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2084 2085 unsigned Flags = MI.getFlags(); 2086 2087 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2088 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2089 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2090 2091 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2092 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2093 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2094 2095 auto Hi = Signed ? 2096 B.buildFPTOSI(S32, FloorMul) : 2097 B.buildFPTOUI(S32, FloorMul); 2098 auto Lo = B.buildFPTOUI(S32, Fma); 2099 2100 B.buildMerge(Dst, { Lo, Hi }); 2101 MI.eraseFromParent(); 2102 2103 return true; 2104 } 2105 2106 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2107 MachineInstr &MI) const { 2108 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2109 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2110 2111 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2112 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2113 2114 // With ieee_mode disabled, the instructions have the correct behavior 2115 // already for G_FMINNUM/G_FMAXNUM 2116 if (!MFI->getMode().IEEE) 2117 return !IsIEEEOp; 2118 2119 if (IsIEEEOp) 2120 return true; 2121 2122 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2123 } 2124 2125 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2126 MachineInstr &MI, MachineRegisterInfo &MRI, 2127 MachineIRBuilder &B) const { 2128 // TODO: Should move some of this into LegalizerHelper. 2129 2130 // TODO: Promote dynamic indexing of s16 to s32 2131 2132 // FIXME: Artifact combiner probably should have replaced the truncated 2133 // constant before this, so we shouldn't need 2134 // getConstantVRegValWithLookThrough. 2135 Optional<ValueAndVReg> MaybeIdxVal = 2136 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2137 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2138 return true; 2139 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2140 2141 Register Dst = MI.getOperand(0).getReg(); 2142 Register Vec = MI.getOperand(1).getReg(); 2143 2144 LLT VecTy = MRI.getType(Vec); 2145 LLT EltTy = VecTy.getElementType(); 2146 assert(EltTy == MRI.getType(Dst)); 2147 2148 if (IdxVal < VecTy.getNumElements()) 2149 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2150 else 2151 B.buildUndef(Dst); 2152 2153 MI.eraseFromParent(); 2154 return true; 2155 } 2156 2157 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2158 MachineInstr &MI, MachineRegisterInfo &MRI, 2159 MachineIRBuilder &B) const { 2160 // TODO: Should move some of this into LegalizerHelper. 2161 2162 // TODO: Promote dynamic indexing of s16 to s32 2163 2164 // FIXME: Artifact combiner probably should have replaced the truncated 2165 // constant before this, so we shouldn't need 2166 // getConstantVRegValWithLookThrough. 2167 Optional<ValueAndVReg> MaybeIdxVal = 2168 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2169 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2170 return true; 2171 2172 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2173 Register Dst = MI.getOperand(0).getReg(); 2174 Register Vec = MI.getOperand(1).getReg(); 2175 Register Ins = MI.getOperand(2).getReg(); 2176 2177 LLT VecTy = MRI.getType(Vec); 2178 LLT EltTy = VecTy.getElementType(); 2179 assert(EltTy == MRI.getType(Ins)); 2180 2181 if (IdxVal < VecTy.getNumElements()) 2182 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2183 else 2184 B.buildUndef(Dst); 2185 2186 MI.eraseFromParent(); 2187 return true; 2188 } 2189 2190 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2191 MachineInstr &MI, MachineRegisterInfo &MRI, 2192 MachineIRBuilder &B) const { 2193 const LLT V2S16 = LLT::vector(2, 16); 2194 2195 Register Dst = MI.getOperand(0).getReg(); 2196 Register Src0 = MI.getOperand(1).getReg(); 2197 LLT DstTy = MRI.getType(Dst); 2198 LLT SrcTy = MRI.getType(Src0); 2199 2200 if (SrcTy == V2S16 && DstTy == V2S16 && 2201 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2202 return true; 2203 2204 MachineIRBuilder HelperBuilder(MI); 2205 GISelObserverWrapper DummyObserver; 2206 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2207 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2208 } 2209 2210 bool AMDGPULegalizerInfo::legalizeSinCos( 2211 MachineInstr &MI, MachineRegisterInfo &MRI, 2212 MachineIRBuilder &B) const { 2213 2214 Register DstReg = MI.getOperand(0).getReg(); 2215 Register SrcReg = MI.getOperand(1).getReg(); 2216 LLT Ty = MRI.getType(DstReg); 2217 unsigned Flags = MI.getFlags(); 2218 2219 Register TrigVal; 2220 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2221 if (ST.hasTrigReducedRange()) { 2222 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2223 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2224 .addUse(MulVal.getReg(0)) 2225 .setMIFlags(Flags).getReg(0); 2226 } else 2227 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2228 2229 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2230 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2231 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2232 .addUse(TrigVal) 2233 .setMIFlags(Flags); 2234 MI.eraseFromParent(); 2235 return true; 2236 } 2237 2238 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2239 MachineIRBuilder &B, 2240 const GlobalValue *GV, 2241 int64_t Offset, 2242 unsigned GAFlags) const { 2243 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2244 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2245 // to the following code sequence: 2246 // 2247 // For constant address space: 2248 // s_getpc_b64 s[0:1] 2249 // s_add_u32 s0, s0, $symbol 2250 // s_addc_u32 s1, s1, 0 2251 // 2252 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2253 // a fixup or relocation is emitted to replace $symbol with a literal 2254 // constant, which is a pc-relative offset from the encoding of the $symbol 2255 // operand to the global variable. 2256 // 2257 // For global address space: 2258 // s_getpc_b64 s[0:1] 2259 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2260 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2261 // 2262 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2263 // fixups or relocations are emitted to replace $symbol@*@lo and 2264 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2265 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2266 // operand to the global variable. 2267 // 2268 // What we want here is an offset from the value returned by s_getpc 2269 // (which is the address of the s_add_u32 instruction) to the global 2270 // variable, but since the encoding of $symbol starts 4 bytes after the start 2271 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2272 // small. This requires us to add 4 to the global variable offset in order to 2273 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2274 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2275 // instruction. 2276 2277 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2278 2279 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2280 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2281 2282 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2283 .addDef(PCReg); 2284 2285 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2286 if (GAFlags == SIInstrInfo::MO_NONE) 2287 MIB.addImm(0); 2288 else 2289 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2290 2291 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2292 2293 if (PtrTy.getSizeInBits() == 32) 2294 B.buildExtract(DstReg, PCReg, 0); 2295 return true; 2296 } 2297 2298 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2299 MachineInstr &MI, MachineRegisterInfo &MRI, 2300 MachineIRBuilder &B) const { 2301 Register DstReg = MI.getOperand(0).getReg(); 2302 LLT Ty = MRI.getType(DstReg); 2303 unsigned AS = Ty.getAddressSpace(); 2304 2305 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2306 MachineFunction &MF = B.getMF(); 2307 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2308 2309 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2310 if (!MFI->isModuleEntryFunction() && 2311 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2312 const Function &Fn = MF.getFunction(); 2313 DiagnosticInfoUnsupported BadLDSDecl( 2314 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2315 DS_Warning); 2316 Fn.getContext().diagnose(BadLDSDecl); 2317 2318 // We currently don't have a way to correctly allocate LDS objects that 2319 // aren't directly associated with a kernel. We do force inlining of 2320 // functions that use local objects. However, if these dead functions are 2321 // not eliminated, we don't want a compile time error. Just emit a warning 2322 // and a trap, since there should be no callable path here. 2323 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2324 B.buildUndef(DstReg); 2325 MI.eraseFromParent(); 2326 return true; 2327 } 2328 2329 // TODO: We could emit code to handle the initialization somewhere. 2330 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2331 const SITargetLowering *TLI = ST.getTargetLowering(); 2332 if (!TLI->shouldUseLDSConstAddress(GV)) { 2333 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2334 return true; // Leave in place; 2335 } 2336 2337 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2338 Type *Ty = GV->getValueType(); 2339 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2340 // zero-sized type in other languages to declare the dynamic shared 2341 // memory which size is not known at the compile time. They will be 2342 // allocated by the runtime and placed directly after the static 2343 // allocated ones. They all share the same offset. 2344 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2345 // Adjust alignment for that dynamic shared memory array. 
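// The dynamically sized LDS block lives immediately after all statically
// allocated LDS, so its address is simply the total static LDS size reported
// by the groupstaticsize intrinsic emitted below.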
2346 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2347 LLT S32 = LLT::scalar(32); 2348 auto Sz = 2349 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2350 B.buildIntToPtr(DstReg, Sz); 2351 MI.eraseFromParent(); 2352 return true; 2353 } 2354 } 2355 2356 B.buildConstant( 2357 DstReg, 2358 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2359 MI.eraseFromParent(); 2360 return true; 2361 } 2362 2363 const Function &Fn = MF.getFunction(); 2364 DiagnosticInfoUnsupported BadInit( 2365 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2366 Fn.getContext().diagnose(BadInit); 2367 return true; 2368 } 2369 2370 const SITargetLowering *TLI = ST.getTargetLowering(); 2371 2372 if (TLI->shouldEmitFixup(GV)) { 2373 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2374 MI.eraseFromParent(); 2375 return true; 2376 } 2377 2378 if (TLI->shouldEmitPCReloc(GV)) { 2379 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2380 MI.eraseFromParent(); 2381 return true; 2382 } 2383 2384 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2385 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2386 2387 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2388 MachinePointerInfo::getGOT(MF), 2389 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2390 MachineMemOperand::MOInvariant, 2391 8 /*Size*/, Align(8)); 2392 2393 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2394 2395 if (Ty.getSizeInBits() == 32) { 2396 // Truncate if this is a 32-bit constant address. 2397 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2398 B.buildExtract(DstReg, Load, 0); 2399 } else 2400 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2401 2402 MI.eraseFromParent(); 2403 return true; 2404 } 2405 2406 static LLT widenToNextPowerOf2(LLT Ty) { 2407 if (Ty.isVector()) 2408 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2409 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2410 } 2411 2412 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2413 MachineInstr &MI) const { 2414 MachineIRBuilder &B = Helper.MIRBuilder; 2415 MachineRegisterInfo &MRI = *B.getMRI(); 2416 GISelChangeObserver &Observer = Helper.Observer; 2417 2418 Register PtrReg = MI.getOperand(1).getReg(); 2419 LLT PtrTy = MRI.getType(PtrReg); 2420 unsigned AddrSpace = PtrTy.getAddressSpace(); 2421 2422 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2423 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2424 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2425 Observer.changingInstr(MI); 2426 MI.getOperand(1).setReg(Cast.getReg(0)); 2427 Observer.changedInstr(MI); 2428 return true; 2429 } 2430 2431 if (MI.getOpcode() != AMDGPU::G_LOAD) 2432 return false; 2433 2434 Register ValReg = MI.getOperand(0).getReg(); 2435 LLT ValTy = MRI.getType(ValReg); 2436 2437 MachineMemOperand *MMO = *MI.memoperands_begin(); 2438 const unsigned ValSize = ValTy.getSizeInBits(); 2439 const unsigned MemSize = 8 * MMO->getSize(); 2440 const Align MemAlign = MMO->getAlign(); 2441 const unsigned AlignInBits = 8 * MemAlign.value(); 2442 2443 // Widen non-power-of-2 loads to the alignment if needed 2444 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2445 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2446 2447 // This was already the correct extending load result type, so just adjust 2448 // the memory type.
2449 if (WideMemSize == ValSize) { 2450 MachineFunction &MF = B.getMF(); 2451 2452 MachineMemOperand *WideMMO = 2453 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2454 Observer.changingInstr(MI); 2455 MI.setMemRefs(MF, {WideMMO}); 2456 Observer.changedInstr(MI); 2457 return true; 2458 } 2459 2460 // Don't bother handling edge case that should probably never be produced. 2461 if (ValSize > WideMemSize) 2462 return false; 2463 2464 LLT WideTy = widenToNextPowerOf2(ValTy); 2465 2466 Register WideLoad; 2467 if (!WideTy.isVector()) { 2468 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2469 B.buildTrunc(ValReg, WideLoad).getReg(0); 2470 } else { 2471 // Extract the subvector. 2472 2473 if (isRegisterType(ValTy)) { 2474 // If this a case where G_EXTRACT is legal, use it. 2475 // (e.g. <3 x s32> -> <4 x s32>) 2476 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2477 B.buildExtract(ValReg, WideLoad, 0); 2478 } else { 2479 // For cases where the widened type isn't a nice register value, unmerge 2480 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2481 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2482 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2483 B.setInsertPt(B.getMBB(), MI.getIterator()); 2484 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2485 } 2486 } 2487 2488 MI.eraseFromParent(); 2489 return true; 2490 } 2491 2492 return false; 2493 } 2494 2495 bool AMDGPULegalizerInfo::legalizeFMad( 2496 MachineInstr &MI, MachineRegisterInfo &MRI, 2497 MachineIRBuilder &B) const { 2498 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2499 assert(Ty.isScalar()); 2500 2501 MachineFunction &MF = B.getMF(); 2502 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2503 2504 // TODO: Always legal with future ftz flag. 2505 // FIXME: Do we need just output? 
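// G_FMAD is only kept as-is when the corresponding denormals are flushed;
// with denormals enabled the hardware mad/mac forms are not suitable
// (roughly, they flush denormal results), so fall back to LegalizerHelper's
// expansion instead.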
2506 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2507 return true; 2508 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2509 return true; 2510 2511 MachineIRBuilder HelperBuilder(MI); 2512 GISelObserverWrapper DummyObserver; 2513 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2514 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2515 } 2516 2517 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2518 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2519 Register DstReg = MI.getOperand(0).getReg(); 2520 Register PtrReg = MI.getOperand(1).getReg(); 2521 Register CmpVal = MI.getOperand(2).getReg(); 2522 Register NewVal = MI.getOperand(3).getReg(); 2523 2524 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2525 "this should not have been custom lowered"); 2526 2527 LLT ValTy = MRI.getType(CmpVal); 2528 LLT VecTy = LLT::vector(2, ValTy); 2529 2530 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2531 2532 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2533 .addDef(DstReg) 2534 .addUse(PtrReg) 2535 .addUse(PackedVal) 2536 .setMemRefs(MI.memoperands()); 2537 2538 MI.eraseFromParent(); 2539 return true; 2540 } 2541 2542 bool AMDGPULegalizerInfo::legalizeFlog( 2543 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2544 Register Dst = MI.getOperand(0).getReg(); 2545 Register Src = MI.getOperand(1).getReg(); 2546 LLT Ty = B.getMRI()->getType(Dst); 2547 unsigned Flags = MI.getFlags(); 2548 2549 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2550 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2551 2552 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2553 MI.eraseFromParent(); 2554 return true; 2555 } 2556 2557 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2558 MachineIRBuilder &B) const { 2559 Register Dst = MI.getOperand(0).getReg(); 2560 Register Src = MI.getOperand(1).getReg(); 2561 unsigned Flags = MI.getFlags(); 2562 LLT Ty = B.getMRI()->getType(Dst); 2563 2564 auto K = B.buildFConstant(Ty, numbers::log2e); 2565 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2566 B.buildFExp2(Dst, Mul, Flags); 2567 MI.eraseFromParent(); 2568 return true; 2569 } 2570 2571 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2572 MachineIRBuilder &B) const { 2573 Register Dst = MI.getOperand(0).getReg(); 2574 Register Src0 = MI.getOperand(1).getReg(); 2575 Register Src1 = MI.getOperand(2).getReg(); 2576 unsigned Flags = MI.getFlags(); 2577 LLT Ty = B.getMRI()->getType(Dst); 2578 const LLT S16 = LLT::scalar(16); 2579 const LLT S32 = LLT::scalar(32); 2580 2581 if (Ty == S32) { 2582 auto Log = B.buildFLog2(S32, Src0, Flags); 2583 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2584 .addUse(Log.getReg(0)) 2585 .addUse(Src1) 2586 .setMIFlags(Flags); 2587 B.buildFExp2(Dst, Mul, Flags); 2588 } else if (Ty == S16) { 2589 // There's no f16 fmul_legacy, so we need to convert for it. 
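// Same expansion as the f32 path above, pow(x, y) == exp2(y * log2(x)),
// except the legacy multiply is done at f32 and the result is truncated back
// to f16.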
2590 auto Log = B.buildFLog2(S16, Src0, Flags); 2591 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2592 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2593 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2594 .addUse(Ext0.getReg(0)) 2595 .addUse(Ext1.getReg(0)) 2596 .setMIFlags(Flags); 2597 2598 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2599 } else 2600 return false; 2601 2602 MI.eraseFromParent(); 2603 return true; 2604 } 2605 2606 // Find a source register, ignoring any possible source modifiers. 2607 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2608 Register ModSrc = OrigSrc; 2609 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2610 ModSrc = SrcFNeg->getOperand(1).getReg(); 2611 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2612 ModSrc = SrcFAbs->getOperand(1).getReg(); 2613 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2614 ModSrc = SrcFAbs->getOperand(1).getReg(); 2615 return ModSrc; 2616 } 2617 2618 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2619 MachineRegisterInfo &MRI, 2620 MachineIRBuilder &B) const { 2621 2622 const LLT S1 = LLT::scalar(1); 2623 const LLT S64 = LLT::scalar(64); 2624 Register Dst = MI.getOperand(0).getReg(); 2625 Register OrigSrc = MI.getOperand(1).getReg(); 2626 unsigned Flags = MI.getFlags(); 2627 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2628 "this should not have been custom lowered"); 2629 2630 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2631 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2632 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2633 // V_FRACT bug is: 2634 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2635 // 2636 // Convert floor(x) to (x - fract(x)) 2637 2638 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2639 .addUse(OrigSrc) 2640 .setMIFlags(Flags); 2641 2642 // Give source modifier matching some assistance before obscuring a foldable 2643 // pattern. 2644 2645 // TODO: We can avoid the neg on the fract? The input sign to fract 2646 // shouldn't matter? 2647 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2648 2649 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2650 2651 Register Min = MRI.createGenericVirtualRegister(S64); 2652 2653 // We don't need to concern ourselves with the snan handling difference, so 2654 // use the one which will directly select. 2655 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2656 if (MFI->getMode().IEEE) 2657 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2658 else 2659 B.buildFMinNum(Min, Fract, Const, Flags); 2660 2661 Register CorrectedFract = Min; 2662 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2663 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2664 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2665 } 2666 2667 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2668 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2669 2670 MI.eraseFromParent(); 2671 return true; 2672 } 2673 2674 // Turn an illegal packed v2s16 build vector into bit operations. 2675 // TODO: This should probably be a bitcast action in LegalizerHelper. 
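// The two s16 sources are combined by merging them into an s32 (first source
// in the low half, second in the high half) and bitcasting the result to
// <2 x s16>.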
2676 bool AMDGPULegalizerInfo::legalizeBuildVector( 2677 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2678 Register Dst = MI.getOperand(0).getReg(); 2679 const LLT S32 = LLT::scalar(32); 2680 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2681 2682 Register Src0 = MI.getOperand(1).getReg(); 2683 Register Src1 = MI.getOperand(2).getReg(); 2684 assert(MRI.getType(Src0) == LLT::scalar(16)); 2685 2686 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2687 B.buildBitcast(Dst, Merge); 2688 2689 MI.eraseFromParent(); 2690 return true; 2691 } 2692 2693 // Check that this is a G_XOR x, -1 2694 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2695 if (MI.getOpcode() != TargetOpcode::G_XOR) 2696 return false; 2697 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2698 return ConstVal && *ConstVal == -1; 2699 } 2700 2701 // Return the use branch instruction, otherwise null if the usage is invalid. 2702 static MachineInstr * 2703 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2704 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2705 Register CondDef = MI.getOperand(0).getReg(); 2706 if (!MRI.hasOneNonDBGUse(CondDef)) 2707 return nullptr; 2708 2709 MachineBasicBlock *Parent = MI.getParent(); 2710 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2711 2712 if (isNot(MRI, *UseMI)) { 2713 Register NegatedCond = UseMI->getOperand(0).getReg(); 2714 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2715 return nullptr; 2716 2717 // We're deleting the def of this value, so we need to remove it. 2718 UseMI->eraseFromParent(); 2719 2720 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2721 Negated = true; 2722 } 2723 2724 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2725 return nullptr; 2726 2727 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2728 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2729 if (Next == Parent->end()) { 2730 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2731 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2732 return nullptr; 2733 UncondBrTarget = &*NextMBB; 2734 } else { 2735 if (Next->getOpcode() != AMDGPU::G_BR) 2736 return nullptr; 2737 Br = &*Next; 2738 UncondBrTarget = Br->getOperand(0).getMBB(); 2739 } 2740 2741 return UseMI; 2742 } 2743 2744 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2745 const ArgDescriptor *Arg, 2746 const TargetRegisterClass *ArgRC, 2747 LLT ArgTy) const { 2748 MCRegister SrcReg = Arg->getRegister(); 2749 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2750 assert(DstReg.isVirtual() && "Virtual register expected"); 2751 2752 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2753 ArgTy); 2754 if (Arg->isMasked()) { 2755 // TODO: Should we try to emit this once in the entry block? 
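// A masked argument shares its register with other packed values (e.g. the
// workitem IDs packed into a single register). Illustratively, for a
// hypothetical mask of 0x3ff0 the value is recovered as (LiveIn >> 4) & 0x3ff,
// which is the shift-and-AND sequence emitted below.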
2756 const LLT S32 = LLT::scalar(32); 2757 const unsigned Mask = Arg->getMask(); 2758 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2759 2760 Register AndMaskSrc = LiveIn; 2761 2762 if (Shift != 0) { 2763 auto ShiftAmt = B.buildConstant(S32, Shift); 2764 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2765 } 2766 2767 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2768 } else { 2769 B.buildCopy(DstReg, LiveIn); 2770 } 2771 2772 return true; 2773 } 2774 2775 bool AMDGPULegalizerInfo::loadInputValue( 2776 Register DstReg, MachineIRBuilder &B, 2777 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2778 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2779 const ArgDescriptor *Arg; 2780 const TargetRegisterClass *ArgRC; 2781 LLT ArgTy; 2782 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2783 2784 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2785 return false; // TODO: Handle these 2786 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2787 } 2788 2789 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2790 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2791 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2792 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2793 return false; 2794 2795 MI.eraseFromParent(); 2796 return true; 2797 } 2798 2799 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2800 MachineRegisterInfo &MRI, 2801 MachineIRBuilder &B) const { 2802 Register Dst = MI.getOperand(0).getReg(); 2803 LLT DstTy = MRI.getType(Dst); 2804 LLT S16 = LLT::scalar(16); 2805 LLT S32 = LLT::scalar(32); 2806 LLT S64 = LLT::scalar(64); 2807 2808 if (DstTy == S16) 2809 return legalizeFDIV16(MI, MRI, B); 2810 if (DstTy == S32) 2811 return legalizeFDIV32(MI, MRI, B); 2812 if (DstTy == S64) 2813 return legalizeFDIV64(MI, MRI, B); 2814 2815 return false; 2816 } 2817 2818 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2819 Register DstDivReg, 2820 Register DstRemReg, 2821 Register X, 2822 Register Y) const { 2823 const LLT S1 = LLT::scalar(1); 2824 const LLT S32 = LLT::scalar(32); 2825 2826 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2827 // algorithm used here. 2828 2829 // Initial estimate of inv(y). 2830 auto FloatY = B.buildUITOFP(S32, Y); 2831 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2832 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2833 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2834 auto Z = B.buildFPTOUI(S32, ScaledY); 2835 2836 // One round of UNR. 2837 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2838 auto NegYZ = B.buildMul(S32, NegY, Z); 2839 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2840 2841 // Quotient/remainder estimate. 2842 auto Q = B.buildUMulH(S32, X, Z); 2843 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2844 2845 // First quotient/remainder refinement. 2846 auto One = B.buildConstant(S32, 1); 2847 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2848 if (DstDivReg) 2849 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2850 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2851 2852 // Second quotient/remainder refinement. 
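// The quotient/remainder estimate may still be short by one multiple of Y
// after the first refinement, so one more conditional correction is applied.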
2853 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2854 if (DstDivReg) 2855 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2856 2857 if (DstRemReg) 2858 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2859 } 2860 2861 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2862 // 2863 // Return lo, hi of result 2864 // 2865 // %cvt.lo = G_UITOFP Val.lo 2866 // %cvt.hi = G_UITOFP Val.hi 2867 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2868 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2869 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2870 // %mul2 = G_FMUL %mul1, 2**(-32) 2871 // %trunc = G_INTRINSIC_TRUNC %mul2 2872 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2873 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2874 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2875 Register Val) { 2876 const LLT S32 = LLT::scalar(32); 2877 auto Unmerge = B.buildUnmerge(S32, Val); 2878 2879 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2880 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2881 2882 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2883 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2884 2885 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2886 auto Mul1 = 2887 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2888 2889 // 2**(-32) 2890 auto Mul2 = 2891 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2892 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2893 2894 // -(2**32) 2895 auto Mad2 = B.buildFMAD(S32, Trunc, 2896 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2897 2898 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2899 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2900 2901 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2902 } 2903 2904 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2905 Register DstDivReg, 2906 Register DstRemReg, 2907 Register Numer, 2908 Register Denom) const { 2909 const LLT S32 = LLT::scalar(32); 2910 const LLT S64 = LLT::scalar(64); 2911 const LLT S1 = LLT::scalar(1); 2912 Register RcpLo, RcpHi; 2913 2914 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2915 2916 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2917 2918 auto Zero64 = B.buildConstant(S64, 0); 2919 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2920 2921 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2922 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2923 2924 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2925 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2926 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2927 2928 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2929 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2930 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2931 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2932 2933 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2934 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2935 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2936 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2937 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2938 2939 auto Zero32 = B.buildConstant(S32, 0); 2940 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2941 auto Add2_HiC = 2942 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2943 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2944 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2945 2946 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2947 Register NumerLo =
UnmergeNumer.getReg(0); 2948 Register NumerHi = UnmergeNumer.getReg(1); 2949 2950 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2951 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2952 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2953 Register Mul3_Lo = UnmergeMul3.getReg(0); 2954 Register Mul3_Hi = UnmergeMul3.getReg(1); 2955 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2956 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2957 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2958 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2959 2960 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2961 Register DenomLo = UnmergeDenom.getReg(0); 2962 Register DenomHi = UnmergeDenom.getReg(1); 2963 2964 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2965 auto C1 = B.buildSExt(S32, CmpHi); 2966 2967 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2968 auto C2 = B.buildSExt(S32, CmpLo); 2969 2970 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2971 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2972 2973 // TODO: Here and below portions of the code can be enclosed into if/endif. 2974 // Currently control flow is unconditional and we have 4 selects after 2975 // potential endif to substitute PHIs. 2976 2977 // if C3 != 0 ... 2978 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2979 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2980 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2981 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2982 2983 auto One64 = B.buildConstant(S64, 1); 2984 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2985 2986 auto C4 = 2987 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2988 auto C5 = 2989 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2990 auto C6 = B.buildSelect( 2991 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2992 2993 // if (C6 != 0) 2994 auto Add4 = B.buildAdd(S64, Add3, One64); 2995 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2996 2997 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2998 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2999 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3000 3001 // endif C6 3002 // endif C3 3003 3004 if (DstDivReg) { 3005 auto Sel1 = B.buildSelect( 3006 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3007 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3008 Sel1, MulHi3); 3009 } 3010 3011 if (DstRemReg) { 3012 auto Sel2 = B.buildSelect( 3013 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3014 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3015 Sel2, Sub1); 3016 } 3017 } 3018 3019 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3020 MachineRegisterInfo &MRI, 3021 MachineIRBuilder &B) const { 3022 Register DstDivReg, DstRemReg; 3023 switch (MI.getOpcode()) { 3024 default: 3025 llvm_unreachable("Unexpected opcode!"); 3026 case AMDGPU::G_UDIV: { 3027 DstDivReg = MI.getOperand(0).getReg(); 3028 break; 3029 } 3030 case AMDGPU::G_UREM: { 3031 DstRemReg = MI.getOperand(0).getReg(); 3032 break; 3033 } 3034 case AMDGPU::G_UDIVREM: { 3035 DstDivReg = MI.getOperand(0).getReg(); 3036 DstRemReg = MI.getOperand(1).getReg(); 3037 break; 3038 } 3039 } 3040 3041 const LLT S64 = LLT::scalar(64); 3042 const LLT S32 = LLT::scalar(32); 3043 const unsigned 
FirstSrcOpIdx = MI.getNumExplicitDefs(); 3044 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3045 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3046 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3047 3048 if (Ty == S32) 3049 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3050 else if (Ty == S64) 3051 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3052 else 3053 return false; 3054 3055 MI.eraseFromParent(); 3056 return true; 3057 } 3058 3059 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3060 MachineRegisterInfo &MRI, 3061 MachineIRBuilder &B) const { 3062 const LLT S64 = LLT::scalar(64); 3063 const LLT S32 = LLT::scalar(32); 3064 3065 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3066 if (Ty != S32 && Ty != S64) 3067 return false; 3068 3069 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3070 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3071 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3072 3073 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3074 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3075 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3076 3077 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3078 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3079 3080 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3081 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3082 3083 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3084 switch (MI.getOpcode()) { 3085 default: 3086 llvm_unreachable("Unexpected opcode!"); 3087 case AMDGPU::G_SDIV: { 3088 DstDivReg = MI.getOperand(0).getReg(); 3089 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3090 break; 3091 } 3092 case AMDGPU::G_SREM: { 3093 DstRemReg = MI.getOperand(0).getReg(); 3094 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3095 break; 3096 } 3097 case AMDGPU::G_SDIVREM: { 3098 DstDivReg = MI.getOperand(0).getReg(); 3099 DstRemReg = MI.getOperand(1).getReg(); 3100 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3101 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3102 break; 3103 } 3104 } 3105 3106 if (Ty == S32) 3107 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3108 else 3109 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3110 3111 if (DstDivReg) { 3112 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3113 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3114 B.buildSub(DstDivReg, SignXor, Sign); 3115 } 3116 3117 if (DstRemReg) { 3118 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3119 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3120 B.buildSub(DstRemReg, SignXor, Sign); 3121 } 3122 3123 MI.eraseFromParent(); 3124 return true; 3125 } 3126 3127 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3128 MachineRegisterInfo &MRI, 3129 MachineIRBuilder &B) const { 3130 Register Res = MI.getOperand(0).getReg(); 3131 Register LHS = MI.getOperand(1).getReg(); 3132 Register RHS = MI.getOperand(2).getReg(); 3133 uint16_t Flags = MI.getFlags(); 3134 LLT ResTy = MRI.getType(Res); 3135 3136 const MachineFunction &MF = B.getMF(); 3137 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3138 MI.getFlag(MachineInstr::FmAfn); 3139 3140 if (!AllowInaccurateRcp) 3141 return false; 3142 3143 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3144 // 1 / x -> RCP(x) 3145 if (CLHS->isExactlyValue(1.0)) { 3146 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3147 .addUse(RHS) 3148 .setMIFlags(Flags); 3149 3150 
MI.eraseFromParent(); 3151 return true; 3152 } 3153 3154 // -1 / x -> RCP( FNEG(x) ) 3155 if (CLHS->isExactlyValue(-1.0)) { 3156 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3157 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3158 .addUse(FNeg.getReg(0)) 3159 .setMIFlags(Flags); 3160 3161 MI.eraseFromParent(); 3162 return true; 3163 } 3164 } 3165 3166 // x / y -> x * (1.0 / y) 3167 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3168 .addUse(RHS) 3169 .setMIFlags(Flags); 3170 B.buildFMul(Res, LHS, RCP, Flags); 3171 3172 MI.eraseFromParent(); 3173 return true; 3174 } 3175 3176 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3177 MachineRegisterInfo &MRI, 3178 MachineIRBuilder &B) const { 3179 Register Res = MI.getOperand(0).getReg(); 3180 Register X = MI.getOperand(1).getReg(); 3181 Register Y = MI.getOperand(2).getReg(); 3182 uint16_t Flags = MI.getFlags(); 3183 LLT ResTy = MRI.getType(Res); 3184 3185 const MachineFunction &MF = B.getMF(); 3186 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3187 MI.getFlag(MachineInstr::FmAfn); 3188 3189 if (!AllowInaccurateRcp) 3190 return false; 3191 3192 auto NegY = B.buildFNeg(ResTy, Y); 3193 auto One = B.buildFConstant(ResTy, 1.0); 3194 3195 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3196 .addUse(Y) 3197 .setMIFlags(Flags); 3198 3199 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3200 R = B.buildFMA(ResTy, Tmp0, R, R); 3201 3202 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3203 R = B.buildFMA(ResTy, Tmp1, R, R); 3204 3205 auto Ret = B.buildFMul(ResTy, X, R); 3206 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3207 3208 B.buildFMA(Res, Tmp2, R, Ret); 3209 MI.eraseFromParent(); 3210 return true; 3211 } 3212 3213 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3214 MachineRegisterInfo &MRI, 3215 MachineIRBuilder &B) const { 3216 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3217 return true; 3218 3219 Register Res = MI.getOperand(0).getReg(); 3220 Register LHS = MI.getOperand(1).getReg(); 3221 Register RHS = MI.getOperand(2).getReg(); 3222 3223 uint16_t Flags = MI.getFlags(); 3224 3225 LLT S16 = LLT::scalar(16); 3226 LLT S32 = LLT::scalar(32); 3227 3228 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3229 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3230 3231 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3232 .addUse(RHSExt.getReg(0)) 3233 .setMIFlags(Flags); 3234 3235 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3236 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3237 3238 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3239 .addUse(RDst.getReg(0)) 3240 .addUse(RHS) 3241 .addUse(LHS) 3242 .setMIFlags(Flags); 3243 3244 MI.eraseFromParent(); 3245 return true; 3246 } 3247 3248 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3249 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3250 static void toggleSPDenormMode(bool Enable, 3251 MachineIRBuilder &B, 3252 const GCNSubtarget &ST, 3253 AMDGPU::SIModeRegisterDefaults Mode) { 3254 // Set SP denorm mode to this value. 3255 unsigned SPDenormMode = 3256 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3257 3258 if (ST.hasDenormModeInst()) { 3259 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
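// The S_DENORM_MODE immediate packs the FP32 denorm control in bits [1:0] and
// the FP64/FP16 denorm control in bits [3:2], hence the shift by 2 below.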
3260 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3261 3262 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3263 B.buildInstr(AMDGPU::S_DENORM_MODE) 3264 .addImm(NewDenormModeValue); 3265 3266 } else { 3267 // Select FP32 bit field in mode register. 3268 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3269 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3270 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3271 3272 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3273 .addImm(SPDenormMode) 3274 .addImm(SPDenormModeBitField); 3275 } 3276 } 3277 3278 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3279 MachineRegisterInfo &MRI, 3280 MachineIRBuilder &B) const { 3281 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3282 return true; 3283 3284 Register Res = MI.getOperand(0).getReg(); 3285 Register LHS = MI.getOperand(1).getReg(); 3286 Register RHS = MI.getOperand(2).getReg(); 3287 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3288 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3289 3290 uint16_t Flags = MI.getFlags(); 3291 3292 LLT S32 = LLT::scalar(32); 3293 LLT S1 = LLT::scalar(1); 3294 3295 auto One = B.buildFConstant(S32, 1.0f); 3296 3297 auto DenominatorScaled = 3298 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3299 .addUse(LHS) 3300 .addUse(RHS) 3301 .addImm(0) 3302 .setMIFlags(Flags); 3303 auto NumeratorScaled = 3304 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3305 .addUse(LHS) 3306 .addUse(RHS) 3307 .addImm(1) 3308 .setMIFlags(Flags); 3309 3310 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3311 .addUse(DenominatorScaled.getReg(0)) 3312 .setMIFlags(Flags); 3313 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3314 3315 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3316 // aren't modeled as reading it. 
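// If the function runs with FP32 denormals flushed, temporarily enable them
// around the FMA chain below (and restore afterwards) so the scaled
// intermediate values are not flushed; this mirrors the DAG lowering of f32
// fdiv.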
3317 if (!Mode.allFP32Denormals()) 3318 toggleSPDenormMode(true, B, ST, Mode); 3319 3320 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3321 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3322 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3323 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3324 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3325 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3326 3327 if (!Mode.allFP32Denormals()) 3328 toggleSPDenormMode(false, B, ST, Mode); 3329 3330 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3331 .addUse(Fma4.getReg(0)) 3332 .addUse(Fma1.getReg(0)) 3333 .addUse(Fma3.getReg(0)) 3334 .addUse(NumeratorScaled.getReg(1)) 3335 .setMIFlags(Flags); 3336 3337 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3338 .addUse(Fmas.getReg(0)) 3339 .addUse(RHS) 3340 .addUse(LHS) 3341 .setMIFlags(Flags); 3342 3343 MI.eraseFromParent(); 3344 return true; 3345 } 3346 3347 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3348 MachineRegisterInfo &MRI, 3349 MachineIRBuilder &B) const { 3350 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3351 return true; 3352 3353 Register Res = MI.getOperand(0).getReg(); 3354 Register LHS = MI.getOperand(1).getReg(); 3355 Register RHS = MI.getOperand(2).getReg(); 3356 3357 uint16_t Flags = MI.getFlags(); 3358 3359 LLT S64 = LLT::scalar(64); 3360 LLT S1 = LLT::scalar(1); 3361 3362 auto One = B.buildFConstant(S64, 1.0); 3363 3364 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3365 .addUse(LHS) 3366 .addUse(RHS) 3367 .addImm(0) 3368 .setMIFlags(Flags); 3369 3370 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3371 3372 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3373 .addUse(DivScale0.getReg(0)) 3374 .setMIFlags(Flags); 3375 3376 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3377 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3378 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3379 3380 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3381 .addUse(LHS) 3382 .addUse(RHS) 3383 .addImm(1) 3384 .setMIFlags(Flags); 3385 3386 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3387 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3388 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3389 3390 Register Scale; 3391 if (!ST.hasUsableDivScaleConditionOutput()) { 3392 // Workaround a hardware bug on SI where the condition output from div_scale 3393 // is not usable. 
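    // Recompute the flag by hand: compare the high dwords of the original
    // operands against the high dwords of the div_scale results to see
    // whether each value was scaled, then XOR the two checks to reconstruct
    // the condition div_fmas expects.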
3394 3395 LLT S32 = LLT::scalar(32); 3396 3397 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3398 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3399 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3400 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3401 3402 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3403 Scale1Unmerge.getReg(1)); 3404 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3405 Scale0Unmerge.getReg(1)); 3406 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3407 } else { 3408 Scale = DivScale1.getReg(1); 3409 } 3410 3411 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3412 .addUse(Fma4.getReg(0)) 3413 .addUse(Fma3.getReg(0)) 3414 .addUse(Mul.getReg(0)) 3415 .addUse(Scale) 3416 .setMIFlags(Flags); 3417 3418 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3419 .addUse(Fmas.getReg(0)) 3420 .addUse(RHS) 3421 .addUse(LHS) 3422 .setMIFlags(Flags); 3423 3424 MI.eraseFromParent(); 3425 return true; 3426 } 3427 3428 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3429 MachineRegisterInfo &MRI, 3430 MachineIRBuilder &B) const { 3431 Register Res = MI.getOperand(0).getReg(); 3432 Register LHS = MI.getOperand(2).getReg(); 3433 Register RHS = MI.getOperand(3).getReg(); 3434 uint16_t Flags = MI.getFlags(); 3435 3436 LLT S32 = LLT::scalar(32); 3437 LLT S1 = LLT::scalar(1); 3438 3439 auto Abs = B.buildFAbs(S32, RHS, Flags); 3440 const APFloat C0Val(1.0f); 3441 3442 auto C0 = B.buildConstant(S32, 0x6f800000); 3443 auto C1 = B.buildConstant(S32, 0x2f800000); 3444 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3445 3446 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3447 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3448 3449 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3450 3451 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3452 .addUse(Mul0.getReg(0)) 3453 .setMIFlags(Flags); 3454 3455 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3456 3457 B.buildFMul(Res, Sel, Mul1, Flags); 3458 3459 MI.eraseFromParent(); 3460 return true; 3461 } 3462 3463 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3464 // FIXME: Why do we handle this one but not other removed instructions? 3465 // 3466 // Reciprocal square root. The clamp prevents infinite results, clamping 3467 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3468 // +-max_float. 3469 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3470 MachineRegisterInfo &MRI, 3471 MachineIRBuilder &B) const { 3472 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3473 return true; 3474 3475 Register Dst = MI.getOperand(0).getReg(); 3476 Register Src = MI.getOperand(2).getReg(); 3477 auto Flags = MI.getFlags(); 3478 3479 LLT Ty = MRI.getType(Dst); 3480 3481 const fltSemantics *FltSemantics; 3482 if (Ty == LLT::scalar(32)) 3483 FltSemantics = &APFloat::IEEEsingle(); 3484 else if (Ty == LLT::scalar(64)) 3485 FltSemantics = &APFloat::IEEEdouble(); 3486 else 3487 return false; 3488 3489 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3490 .addUse(Src) 3491 .setMIFlags(Flags); 3492 3493 // We don't need to concern ourselves with the snan handling difference, since 3494 // the rsq quieted (or not) so use the one which will directly select. 
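  // Clamp the rsq result to +/-max_float, picking the IEEE variants of
  // fminnum/fmaxnum when the function runs with the IEEE bit set so the clamp
  // matches the min/max flavor that will actually be selected.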
3495 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3496 const bool UseIEEE = MFI->getMode().IEEE; 3497 3498 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3499 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3500 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3501 3502 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3503 3504 if (UseIEEE) 3505 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3506 else 3507 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3508 MI.eraseFromParent(); 3509 return true; 3510 } 3511 3512 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3513 switch (IID) { 3514 case Intrinsic::amdgcn_ds_fadd: 3515 return AMDGPU::G_ATOMICRMW_FADD; 3516 case Intrinsic::amdgcn_ds_fmin: 3517 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3518 case Intrinsic::amdgcn_ds_fmax: 3519 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3520 default: 3521 llvm_unreachable("not a DS FP intrinsic"); 3522 } 3523 } 3524 3525 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3526 MachineInstr &MI, 3527 Intrinsic::ID IID) const { 3528 GISelChangeObserver &Observer = Helper.Observer; 3529 Observer.changingInstr(MI); 3530 3531 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3532 3533 // The remaining operands were used to set fields in the MemOperand on 3534 // construction. 3535 for (int I = 6; I > 3; --I) 3536 MI.RemoveOperand(I); 3537 3538 MI.RemoveOperand(1); // Remove the intrinsic ID. 3539 Observer.changedInstr(MI); 3540 return true; 3541 } 3542 3543 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3544 MachineRegisterInfo &MRI, 3545 MachineIRBuilder &B) const { 3546 uint64_t Offset = 3547 ST.getTargetLowering()->getImplicitParameterOffset( 3548 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3549 LLT DstTy = MRI.getType(DstReg); 3550 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3551 3552 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3553 if (!loadInputValue(KernargPtrReg, B, 3554 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3555 return false; 3556 3557 // FIXME: This should be nuw 3558 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3559 return true; 3560 } 3561 3562 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3563 MachineRegisterInfo &MRI, 3564 MachineIRBuilder &B) const { 3565 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3566 if (!MFI->isEntryFunction()) { 3567 return legalizePreloadedArgIntrin(MI, MRI, B, 3568 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3569 } 3570 3571 Register DstReg = MI.getOperand(0).getReg(); 3572 if (!getImplicitArgPtr(DstReg, MRI, B)) 3573 return false; 3574 3575 MI.eraseFromParent(); 3576 return true; 3577 } 3578 3579 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3580 MachineRegisterInfo &MRI, 3581 MachineIRBuilder &B, 3582 unsigned AddrSpace) const { 3583 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3584 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3585 Register Hi32 = Unmerge.getReg(1); 3586 3587 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3588 MI.eraseFromParent(); 3589 return true; 3590 } 3591 3592 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3593 // offset (the offset that is included in bounds checking and swizzling, to be 3594 // split between the instruction's voffset and immoffset fields) 
and soffset 3595 // (the offset that is excluded from bounds checking and swizzling, to go in 3596 // the instruction's soffset field). This function takes the first kind of 3597 // offset and figures out how to split it between voffset and immoffset. 3598 std::tuple<Register, unsigned, unsigned> 3599 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3600 Register OrigOffset) const { 3601 const unsigned MaxImm = 4095; 3602 Register BaseReg; 3603 unsigned TotalConstOffset; 3604 const LLT S32 = LLT::scalar(32); 3605 MachineRegisterInfo &MRI = *B.getMRI(); 3606 3607 std::tie(BaseReg, TotalConstOffset) = 3608 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3609 3610 unsigned ImmOffset = TotalConstOffset; 3611 3612 // If BaseReg is a pointer, convert it to int. 3613 if (MRI.getType(BaseReg).isPointer()) 3614 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3615 3616 // If the immediate value is too big for the immoffset field, put the value 3617 // and -4096 into the immoffset field so that the value that is copied/added 3618 // for the voffset field is a multiple of 4096, and it stands more chance 3619 // of being CSEd with the copy/add for another similar load/store. 3620 // However, do not do that rounding down to a multiple of 4096 if that is a 3621 // negative number, as it appears to be illegal to have a negative offset 3622 // in the vgpr, even if adding the immediate offset makes it positive. 3623 unsigned Overflow = ImmOffset & ~MaxImm; 3624 ImmOffset -= Overflow; 3625 if ((int32_t)Overflow < 0) { 3626 Overflow += ImmOffset; 3627 ImmOffset = 0; 3628 } 3629 3630 if (Overflow != 0) { 3631 if (!BaseReg) { 3632 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3633 } else { 3634 auto OverflowVal = B.buildConstant(S32, Overflow); 3635 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3636 } 3637 } 3638 3639 if (!BaseReg) 3640 BaseReg = B.buildConstant(S32, 0).getReg(0); 3641 3642 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3643 } 3644 3645 /// Handle register layout difference for f16 images for some subtargets. 
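/// With unpacked D16 memory instructions each 16-bit element must occupy the
/// low half of its own 32-bit register, so the value is widened element by
/// element. Subtargets with the image-store D16 bug instead need the packed
/// data padded out and bitcast to a dword-sized vector. Otherwise the packed
/// layout already matches what the hardware expects and the register is
/// returned unchanged.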
3646 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3647 MachineRegisterInfo &MRI, 3648 Register Reg, 3649 bool ImageStore) const { 3650 const LLT S16 = LLT::scalar(16); 3651 const LLT S32 = LLT::scalar(32); 3652 LLT StoreVT = MRI.getType(Reg); 3653 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3654 3655 if (ST.hasUnpackedD16VMem()) { 3656 auto Unmerge = B.buildUnmerge(S16, Reg); 3657 3658 SmallVector<Register, 4> WideRegs; 3659 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3660 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3661 3662 int NumElts = StoreVT.getNumElements(); 3663 3664 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3665 } 3666 3667 if (ImageStore && ST.hasImageStoreD16Bug()) { 3668 if (StoreVT.getNumElements() == 2) { 3669 SmallVector<Register, 4> PackedRegs; 3670 Reg = B.buildBitcast(S32, Reg).getReg(0); 3671 PackedRegs.push_back(Reg); 3672 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3673 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3674 } 3675 3676 if (StoreVT.getNumElements() == 3) { 3677 SmallVector<Register, 4> PackedRegs; 3678 auto Unmerge = B.buildUnmerge(S16, Reg); 3679 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3680 PackedRegs.push_back(Unmerge.getReg(I)); 3681 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3682 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3683 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3684 } 3685 3686 if (StoreVT.getNumElements() == 4) { 3687 SmallVector<Register, 4> PackedRegs; 3688 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3689 auto Unmerge = B.buildUnmerge(S32, Reg); 3690 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3691 PackedRegs.push_back(Unmerge.getReg(I)); 3692 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3693 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3694 } 3695 3696 llvm_unreachable("invalid data type"); 3697 } 3698 3699 return Reg; 3700 } 3701 3702 Register AMDGPULegalizerInfo::fixStoreSourceType( 3703 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3704 MachineRegisterInfo *MRI = B.getMRI(); 3705 LLT Ty = MRI->getType(VData); 3706 3707 const LLT S16 = LLT::scalar(16); 3708 3709 // Fixup illegal register types for i8 stores. 3710 if (Ty == LLT::scalar(8) || Ty == S16) { 3711 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3712 return AnyExt; 3713 } 3714 3715 if (Ty.isVector()) { 3716 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3717 if (IsFormat) 3718 return handleD16VData(B, *MRI, VData); 3719 } 3720 } 3721 3722 return VData; 3723 } 3724 3725 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3726 MachineRegisterInfo &MRI, 3727 MachineIRBuilder &B, 3728 bool IsTyped, 3729 bool IsFormat) const { 3730 Register VData = MI.getOperand(1).getReg(); 3731 LLT Ty = MRI.getType(VData); 3732 LLT EltTy = Ty.getScalarType(); 3733 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3734 const LLT S32 = LLT::scalar(32); 3735 3736 VData = fixStoreSourceType(B, VData, IsFormat); 3737 Register RSrc = MI.getOperand(2).getReg(); 3738 3739 MachineMemOperand *MMO = *MI.memoperands_begin(); 3740 const int MemSize = MMO->getSize(); 3741 3742 unsigned ImmOffset; 3743 unsigned TotalOffset; 3744 3745 // The typed intrinsics add an immediate after the registers. 3746 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3747 3748 // The struct intrinsic variants add one additional operand over raw. 3749 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3750 Register VIndex; 3751 int OpOffset = 0; 3752 if (HasVIndex) { 3753 VIndex = MI.getOperand(3).getReg(); 3754 OpOffset = 1; 3755 } 3756 3757 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3758 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3759 3760 unsigned Format = 0; 3761 if (IsTyped) { 3762 Format = MI.getOperand(5 + OpOffset).getImm(); 3763 ++OpOffset; 3764 } 3765 3766 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3767 3768 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3769 if (TotalOffset != 0) 3770 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3771 3772 unsigned Opc; 3773 if (IsTyped) { 3774 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3775 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3776 } else if (IsFormat) { 3777 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3778 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3779 } else { 3780 switch (MemSize) { 3781 case 1: 3782 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3783 break; 3784 case 2: 3785 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3786 break; 3787 default: 3788 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3789 break; 3790 } 3791 } 3792 3793 if (!VIndex) 3794 VIndex = B.buildConstant(S32, 0).getReg(0); 3795 3796 auto MIB = B.buildInstr(Opc) 3797 .addUse(VData) // vdata 3798 .addUse(RSrc) // rsrc 3799 .addUse(VIndex) // vindex 3800 .addUse(VOffset) // voffset 3801 .addUse(SOffset) // soffset 3802 .addImm(ImmOffset); // offset(imm) 3803 3804 if (IsTyped) 3805 MIB.addImm(Format); 3806 3807 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3808 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3809 .addMemOperand(MMO); 3810 3811 MI.eraseFromParent(); 3812 return true; 3813 } 3814 3815 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3816 MachineRegisterInfo &MRI, 3817 MachineIRBuilder &B, 3818 bool IsFormat, 3819 bool IsTyped) const { 3820 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3821 MachineMemOperand *MMO = *MI.memoperands_begin(); 3822 const int MemSize = MMO->getSize(); 3823 const LLT S32 = LLT::scalar(32); 3824 3825 Register Dst = MI.getOperand(0).getReg(); 3826 Register RSrc = MI.getOperand(2).getReg(); 3827 3828 // The typed intrinsics add an immediate after the registers. 3829 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3830 3831 // The struct intrinsic variants add one additional operand over raw. 
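  // For reference: the raw (untyped) form has operands
  // (dst, intrinsic ID, rsrc, voffset, soffset, aux), the struct form inserts
  // a vindex after rsrc, and the typed forms carry an extra format immediate,
  // which gives the 7/8 operand counts checked against below.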
3832 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3833 Register VIndex; 3834 int OpOffset = 0; 3835 if (HasVIndex) { 3836 VIndex = MI.getOperand(3).getReg(); 3837 OpOffset = 1; 3838 } 3839 3840 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3841 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3842 3843 unsigned Format = 0; 3844 if (IsTyped) { 3845 Format = MI.getOperand(5 + OpOffset).getImm(); 3846 ++OpOffset; 3847 } 3848 3849 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3850 unsigned ImmOffset; 3851 unsigned TotalOffset; 3852 3853 LLT Ty = MRI.getType(Dst); 3854 LLT EltTy = Ty.getScalarType(); 3855 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3856 const bool Unpacked = ST.hasUnpackedD16VMem(); 3857 3858 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3859 if (TotalOffset != 0) 3860 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3861 3862 unsigned Opc; 3863 3864 if (IsTyped) { 3865 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3866 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3867 } else if (IsFormat) { 3868 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3869 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3870 } else { 3871 switch (MemSize) { 3872 case 1: 3873 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3874 break; 3875 case 2: 3876 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3877 break; 3878 default: 3879 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3880 break; 3881 } 3882 } 3883 3884 Register LoadDstReg; 3885 3886 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3887 LLT UnpackedTy = Ty.changeElementSize(32); 3888 3889 if (IsExtLoad) 3890 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3891 else if (Unpacked && IsD16 && Ty.isVector()) 3892 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3893 else 3894 LoadDstReg = Dst; 3895 3896 if (!VIndex) 3897 VIndex = B.buildConstant(S32, 0).getReg(0); 3898 3899 auto MIB = B.buildInstr(Opc) 3900 .addDef(LoadDstReg) // vdata 3901 .addUse(RSrc) // rsrc 3902 .addUse(VIndex) // vindex 3903 .addUse(VOffset) // voffset 3904 .addUse(SOffset) // soffset 3905 .addImm(ImmOffset); // offset(imm) 3906 3907 if (IsTyped) 3908 MIB.addImm(Format); 3909 3910 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3911 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3912 .addMemOperand(MMO); 3913 3914 if (LoadDstReg != Dst) { 3915 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3916 3917 // Widen result for extending loads was widened. 3918 if (IsExtLoad) 3919 B.buildTrunc(Dst, LoadDstReg); 3920 else { 3921 // Repack to original 16-bit vector result 3922 // FIXME: G_TRUNC should work, but legalization currently fails 3923 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3924 SmallVector<Register, 4> Repack; 3925 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3926 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3927 B.buildMerge(Dst, Repack); 3928 } 3929 } 3930 3931 MI.eraseFromParent(); 3932 return true; 3933 } 3934 3935 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3936 MachineIRBuilder &B, 3937 bool IsInc) const { 3938 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 3939 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3940 B.buildInstr(Opc) 3941 .addDef(MI.getOperand(0).getReg()) 3942 .addUse(MI.getOperand(2).getReg()) 3943 .addUse(MI.getOperand(3).getReg()) 3944 .cloneMemRefs(MI); 3945 MI.eraseFromParent(); 3946 return true; 3947 } 3948 3949 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3950 switch (IntrID) { 3951 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3952 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3953 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3954 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3955 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3956 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3957 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3958 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3959 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3960 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3961 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3962 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3963 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3964 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3966 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3967 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3968 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3969 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3970 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3971 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3972 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3973 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3974 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3975 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3976 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3977 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3978 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3979 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3980 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3981 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3982 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3984 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3985 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3986 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3987 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3988 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3990 case Intrinsic::amdgcn_buffer_atomic_fadd: 3991 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3992 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3993 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3994 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3995 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3996 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3997 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3998 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3999 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4000 default: 4001 llvm_unreachable("unhandled atomic opcode"); 4002 } 4003 } 4004 4005 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4006 MachineIRBuilder &B, 4007 Intrinsic::ID IID) const { 4008 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4009 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4010 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4011 4012 Register Dst; 4013 4014 int OpOffset = 0; 4015 if (HasReturn) { 4016 // A few FP atomics do not support return values. 
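    // When a result is returned, operand 0 is the def and the source operands
    // start at index 2; the void variants drop the def, which the -1 OpOffset
    // in the else branch compensates for.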
4017 Dst = MI.getOperand(0).getReg(); 4018 } else { 4019 OpOffset = -1; 4020 } 4021 4022 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4023 Register CmpVal; 4024 4025 if (IsCmpSwap) { 4026 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4027 ++OpOffset; 4028 } 4029 4030 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4031 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4032 4033 // The struct intrinsic variants add one additional operand over raw. 4034 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4035 Register VIndex; 4036 if (HasVIndex) { 4037 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4038 ++OpOffset; 4039 } 4040 4041 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4042 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4043 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4044 4045 MachineMemOperand *MMO = *MI.memoperands_begin(); 4046 4047 unsigned ImmOffset; 4048 unsigned TotalOffset; 4049 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4050 if (TotalOffset != 0) 4051 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4052 4053 if (!VIndex) 4054 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4055 4056 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4057 4058 if (HasReturn) 4059 MIB.addDef(Dst); 4060 4061 MIB.addUse(VData); // vdata 4062 4063 if (IsCmpSwap) 4064 MIB.addReg(CmpVal); 4065 4066 MIB.addUse(RSrc) // rsrc 4067 .addUse(VIndex) // vindex 4068 .addUse(VOffset) // voffset 4069 .addUse(SOffset) // soffset 4070 .addImm(ImmOffset) // offset(imm) 4071 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4072 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4073 .addMemOperand(MMO); 4074 4075 MI.eraseFromParent(); 4076 return true; 4077 } 4078 4079 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4080 /// vector with s16 typed elements. 4081 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4082 SmallVectorImpl<Register> &PackedAddrs, 4083 unsigned ArgOffset, 4084 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4085 bool IsA16, bool IsG16) { 4086 const LLT S16 = LLT::scalar(16); 4087 const LLT V2S16 = LLT::vector(2, 16); 4088 auto EndIdx = Intr->VAddrEnd; 4089 4090 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4091 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4092 if (!SrcOp.isReg()) 4093 continue; // _L to _LZ may have eliminated this. 4094 4095 Register AddrReg = SrcOp.getReg(); 4096 4097 if (I < Intr->GradientStart) { 4098 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4099 PackedAddrs.push_back(AddrReg); 4100 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4101 (I >= Intr->CoordStart && !IsA16)) { 4102 // Handle any gradient or coordinate operands that should not be packed 4103 PackedAddrs.push_back(AddrReg); 4104 } else { 4105 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4106 // derivatives dx/dh and dx/dv are packed with undef. 
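        // Each 16-bit operand is paired with the following one into a v2s16.
        // An undef fills the high half instead when this is the last address
        // operand, when it is the final component of an odd-sized dh/dv
        // gradient set, or when the next operand was already dropped by the
        // _L to _LZ optimization.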
4107 if (((I + 1) >= EndIdx) || 4108 ((Intr->NumGradients / 2) % 2 == 1 && 4109 (I == static_cast<unsigned>(Intr->GradientStart + 4110 (Intr->NumGradients / 2) - 1) || 4111 I == static_cast<unsigned>(Intr->GradientStart + 4112 Intr->NumGradients - 1))) || 4113 // Check for _L to _LZ optimization 4114 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4115 PackedAddrs.push_back( 4116 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4117 .getReg(0)); 4118 } else { 4119 PackedAddrs.push_back( 4120 B.buildBuildVector( 4121 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4122 .getReg(0)); 4123 ++I; 4124 } 4125 } 4126 } 4127 } 4128 4129 /// Convert from separate vaddr components to a single vector address register, 4130 /// and replace the remaining operands with $noreg. 4131 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4132 int DimIdx, int NumVAddrs) { 4133 const LLT S32 = LLT::scalar(32); 4134 4135 SmallVector<Register, 8> AddrRegs; 4136 for (int I = 0; I != NumVAddrs; ++I) { 4137 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4138 if (SrcOp.isReg()) { 4139 AddrRegs.push_back(SrcOp.getReg()); 4140 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4141 } 4142 } 4143 4144 int NumAddrRegs = AddrRegs.size(); 4145 if (NumAddrRegs != 1) { 4146 // Round up to 8 elements for v5-v7 4147 // FIXME: Missing intermediate sized register classes and instructions. 4148 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4149 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4150 auto Undef = B.buildUndef(S32); 4151 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4152 NumAddrRegs = RoundedNumRegs; 4153 } 4154 4155 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4156 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4157 } 4158 4159 for (int I = 1; I != NumVAddrs; ++I) { 4160 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4161 if (SrcOp.isReg()) 4162 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4163 } 4164 } 4165 4166 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4167 /// 4168 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 4169 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4170 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4171 /// registers. 4172 /// 4173 /// We don't want to directly select image instructions just yet, but also want 4174 /// to expose all register repacking to the legalizer/combiners. We also don't 4175 /// want a selected instruction entering RegBankSelect. In order to avoid 4176 /// defining a multitude of intermediate image instructions, directly hack on 4177 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 4178 /// now unnecessary arguments with $noreg. 4179 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4180 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4181 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4182 4183 const unsigned NumDefs = MI.getNumExplicitDefs(); 4184 const unsigned ArgOffset = NumDefs + 1; 4185 bool IsTFE = NumDefs == 2; 4186 // We are only processing the operands of d16 image operations on subtargets 4187 // that use the unpacked register layout, or need to repack the TFE result. 4188 4189 // TODO: Do we need to guard against already legalized intrinsics?
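  // The rewrite proceeds in stages: fold away constant-zero lod/mip
  // arguments, repack 16-bit address and gradient operands into dwords (or a
  // single vector when NSA is not used), fix up d16 data registers, and
  // finally widen or repack the result to the dword-aligned layout the
  // hardware returns, including the extra TFE dword when present.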
4190 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4191 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4192 4193 MachineRegisterInfo *MRI = B.getMRI(); 4194 const LLT S32 = LLT::scalar(32); 4195 const LLT S16 = LLT::scalar(16); 4196 const LLT V2S16 = LLT::vector(2, 16); 4197 4198 unsigned DMask = 0; 4199 4200 // Check for 16 bit addresses and pack if true. 4201 LLT GradTy = 4202 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4203 LLT AddrTy = 4204 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4205 const bool IsG16 = GradTy == S16; 4206 const bool IsA16 = AddrTy == S16; 4207 4208 int DMaskLanes = 0; 4209 if (!BaseOpcode->Atomic) { 4210 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4211 if (BaseOpcode->Gather4) { 4212 DMaskLanes = 4; 4213 } else if (DMask != 0) { 4214 DMaskLanes = countPopulation(DMask); 4215 } else if (!IsTFE && !BaseOpcode->Store) { 4216 // If dmask is 0, this is a no-op load. This can be eliminated. 4217 B.buildUndef(MI.getOperand(0)); 4218 MI.eraseFromParent(); 4219 return true; 4220 } 4221 } 4222 4223 Observer.changingInstr(MI); 4224 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4225 4226 unsigned NewOpcode = NumDefs == 0 ? 4227 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4228 4229 // Track that we legalized this 4230 MI.setDesc(B.getTII().get(NewOpcode)); 4231 4232 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4233 // dmask to be at least 1 otherwise the instruction will fail 4234 if (IsTFE && DMask == 0) { 4235 DMask = 0x1; 4236 DMaskLanes = 1; 4237 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4238 } 4239 4240 if (BaseOpcode->Atomic) { 4241 Register VData0 = MI.getOperand(2).getReg(); 4242 LLT Ty = MRI->getType(VData0); 4243 4244 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4245 if (Ty.isVector()) 4246 return false; 4247 4248 if (BaseOpcode->AtomicX2) { 4249 Register VData1 = MI.getOperand(3).getReg(); 4250 // The two values are packed in one register. 4251 LLT PackedTy = LLT::vector(2, Ty); 4252 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4253 MI.getOperand(2).setReg(Concat.getReg(0)); 4254 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4255 } 4256 } 4257 4258 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4259 4260 // Optimize _L to _LZ when _L is zero 4261 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4262 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4263 const ConstantFP *ConstantLod; 4264 4265 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4266 m_GFCst(ConstantLod))) { 4267 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4268 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4269 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4270 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4271 Intr->Dim); 4272 4273 // The starting indexes should remain in the same place. 
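            // A constant LOD of zero (or a negative one, which behaves like
            // zero here) makes the _l form equivalent to _lz, so the opcode
            // is switched and the now-redundant LOD operand is removed,
            // shrinking the vaddr list by one.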
4274 --CorrectedNumVAddrs; 4275 4276 MI.getOperand(MI.getNumExplicitDefs()) 4277 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4278 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4279 Intr = NewImageDimIntr; 4280 } 4281 } 4282 } 4283 4284 // Optimize _mip away, when 'lod' is zero 4285 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4286 int64_t ConstantLod; 4287 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4288 m_ICst(ConstantLod))) { 4289 if (ConstantLod == 0) { 4290 // TODO: Change intrinsic opcode and remove operand instead or replacing 4291 // it with 0, as the _L to _LZ handling is done above. 4292 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4293 --CorrectedNumVAddrs; 4294 } 4295 } 4296 } 4297 4298 // Rewrite the addressing register layout before doing anything else. 4299 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4300 // 16 bit gradients are supported, but are tied to the A16 control 4301 // so both gradients and addresses must be 16 bit 4302 return false; 4303 } 4304 4305 if (IsA16 && !ST.hasA16()) { 4306 // A16 not supported 4307 return false; 4308 } 4309 4310 if (IsA16 || IsG16) { 4311 if (Intr->NumVAddrs > 1) { 4312 SmallVector<Register, 4> PackedRegs; 4313 4314 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4315 IsG16); 4316 4317 // See also below in the non-a16 branch 4318 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4319 4320 if (!UseNSA && PackedRegs.size() > 1) { 4321 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4322 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4323 PackedRegs[0] = Concat.getReg(0); 4324 PackedRegs.resize(1); 4325 } 4326 4327 const unsigned NumPacked = PackedRegs.size(); 4328 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4329 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4330 if (!SrcOp.isReg()) { 4331 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4332 continue; 4333 } 4334 4335 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4336 4337 if (I - Intr->VAddrStart < NumPacked) 4338 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4339 else 4340 SrcOp.setReg(AMDGPU::NoRegister); 4341 } 4342 } 4343 } else { 4344 // If the register allocator cannot place the address registers contiguously 4345 // without introducing moves, then using the non-sequential address encoding 4346 // is always preferable, since it saves VALU instructions and is usually a 4347 // wash in terms of code size or even better. 4348 // 4349 // However, we currently have no way of hinting to the register allocator 4350 // that MIMG addresses should be placed contiguously when it is possible to 4351 // do so, so force non-NSA for the common 2-address case as a heuristic. 4352 // 4353 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4354 // allocation when possible. 4355 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4356 4357 if (!UseNSA && Intr->NumVAddrs > 1) 4358 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4359 Intr->NumVAddrs); 4360 } 4361 4362 int Flags = 0; 4363 if (IsA16) 4364 Flags |= 1; 4365 if (IsG16) 4366 Flags |= 2; 4367 MI.addOperand(MachineOperand::CreateImm(Flags)); 4368 4369 if (BaseOpcode->Store) { // No TFE for stores? 
// TODO: Handle dmask trim 4371 Register VData = MI.getOperand(1).getReg(); 4372 LLT Ty = MRI->getType(VData); 4373 if (!Ty.isVector() || Ty.getElementType() != S16) 4374 return true; 4375 4376 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4377 if (RepackedReg != VData) { 4378 MI.getOperand(1).setReg(RepackedReg); 4379 } 4380 4381 return true; 4382 } 4383 4384 Register DstReg = MI.getOperand(0).getReg(); 4385 LLT Ty = MRI->getType(DstReg); 4386 const LLT EltTy = Ty.getScalarType(); 4387 const bool IsD16 = Ty.getScalarType() == S16; 4388 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4389 4390 // Confirm that the return type is large enough for the dmask specified 4391 if (NumElts < DMaskLanes) 4392 return false; 4393 4394 if (NumElts > 4 || DMaskLanes > 4) 4395 return false; 4396 4397 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4398 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4399 4400 // The raw dword aligned data component of the load. The only legal cases 4401 // where this matters should be when using the packed D16 format, for 4402 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4403 LLT RoundedTy; 4404 4405 // S32 vector to cover all data, plus TFE result element. 4406 LLT TFETy; 4407 4408 // Register type to use for each loaded component. Will be S32 or V2S16. 4409 LLT RegTy; 4410 4411 if (IsD16 && ST.hasUnpackedD16VMem()) { 4412 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4413 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4414 RegTy = S32; 4415 } else { 4416 unsigned EltSize = EltTy.getSizeInBits(); 4417 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4418 unsigned RoundedSize = 32 * RoundedElts; 4419 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4420 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4421 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4422 } 4423 4424 // The return type does not need adjustment. 4425 // TODO: Should we change s16 case to s32 or <2 x s16>? 4426 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4427 return true; 4428 4429 Register Dst1Reg; 4430 4431 // Insert after the instruction. 4432 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4433 4434 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4435 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4436 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4437 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4438 4439 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4440 4441 MI.getOperand(0).setReg(NewResultReg); 4442 4443 // In the IR, TFE is supposed to be used with a 2 element struct return 4444 // type. The instruction really returns these two values in one contiguous 4445 // register, with one additional dword beyond the loaded data. Rewrite the 4446 // return type to use a single register result. 4447 4448 if (IsTFE) { 4449 Dst1Reg = MI.getOperand(1).getReg(); 4450 if (MRI->getType(Dst1Reg) != S32) 4451 return false; 4452 4453 // TODO: Make sure the TFE operand bit is set. 4454 MI.RemoveOperand(1); 4455 4456 // Handle the easy case that requires no repack instructions. 4457 if (Ty == S32) { 4458 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4459 return true; 4460 } 4461 } 4462 4463 // Now figure out how to copy the new result register back into the old 4464 // result. 4465 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4466 4467 const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4468 4469 if (ResultNumRegs == 1) { 4470 assert(!IsTFE); 4471 ResultRegs[0] = NewResultReg; 4472 } else { 4473 // We have to repack into a new vector of some kind. 4474 for (int I = 0; I != NumDataRegs; ++I) 4475 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4476 B.buildUnmerge(ResultRegs, NewResultReg); 4477 4478 // Drop the final TFE element to get the data part. The TFE result is 4479 // directly written to the right place already. 4480 if (IsTFE) 4481 ResultRegs.resize(NumDataRegs); 4482 } 4483 4484 // For an s16 scalar result, we form an s32 result with a truncate regardless 4485 // of packed vs. unpacked. 4486 if (IsD16 && !Ty.isVector()) { 4487 B.buildTrunc(DstReg, ResultRegs[0]); 4488 return true; 4489 } 4490 4491 // Avoid a build/concat_vector of 1 entry. 4492 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4493 B.buildBitcast(DstReg, ResultRegs[0]); 4494 return true; 4495 } 4496 4497 assert(Ty.isVector()); 4498 4499 if (IsD16) { 4500 // For packed D16 results with TFE enabled, all the data components are 4501 // S32. Cast back to the expected type. 4502 // 4503 // TODO: We don't really need to use load s32 elements. We would only need one 4504 // cast for the TFE result if a multiple of v2s16 was used. 4505 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4506 for (Register &Reg : ResultRegs) 4507 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4508 } else if (ST.hasUnpackedD16VMem()) { 4509 for (Register &Reg : ResultRegs) 4510 Reg = B.buildTrunc(S16, Reg).getReg(0); 4511 } 4512 } 4513 4514 auto padWithUndef = [&](LLT Ty, int NumElts) { 4515 if (NumElts == 0) 4516 return; 4517 Register Undef = B.buildUndef(Ty).getReg(0); 4518 for (int I = 0; I != NumElts; ++I) 4519 ResultRegs.push_back(Undef); 4520 }; 4521 4522 // Pad out any elements eliminated due to the dmask. 4523 LLT ResTy = MRI->getType(ResultRegs[0]); 4524 if (!ResTy.isVector()) { 4525 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4526 B.buildBuildVector(DstReg, ResultRegs); 4527 return true; 4528 } 4529 4530 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4531 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4532 4533 // Deal with the one annoying legal case. 4534 const LLT V3S16 = LLT::vector(3, 16); 4535 if (Ty == V3S16) { 4536 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4537 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4538 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4539 return true; 4540 } 4541 4542 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4543 B.buildConcatVectors(DstReg, ResultRegs); 4544 return true; 4545 } 4546 4547 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4548 LegalizerHelper &Helper, MachineInstr &MI) const { 4549 MachineIRBuilder &B = Helper.MIRBuilder; 4550 GISelChangeObserver &Observer = Helper.Observer; 4551 4552 Register Dst = MI.getOperand(0).getReg(); 4553 LLT Ty = B.getMRI()->getType(Dst); 4554 unsigned Size = Ty.getSizeInBits(); 4555 MachineFunction &MF = B.getMF(); 4556 4557 Observer.changingInstr(MI); 4558 4559 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4560 Ty = getBitcastRegisterType(Ty); 4561 Helper.bitcastDst(MI, Ty, 0); 4562 Dst = MI.getOperand(0).getReg(); 4563 B.setInsertPt(B.getMBB(), MI); 4564 } 4565 4566 // FIXME: We don't really need this intermediate instruction. The intrinsic 4567 // should be fixed to have a memory operand. Since it's readnone, we're not 4568 // allowed to add one. 
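  // Rewrite to the target pseudo and give it a synthesized memory operand:
  // even though the intrinsic is readnone, later passes still want to know
  // the access size and that this is an invariant, dereferenceable load. A
  // conservative 4-byte alignment is used for now.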
4569 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4570 MI.RemoveOperand(1); // Remove intrinsic ID 4571 4572 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4573 // TODO: Should this use datalayout alignment? 4574 const unsigned MemSize = (Size + 7) / 8; 4575 const Align MemAlign(4); 4576 MachineMemOperand *MMO = MF.getMachineMemOperand( 4577 MachinePointerInfo(), 4578 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4579 MachineMemOperand::MOInvariant, 4580 MemSize, MemAlign); 4581 MI.addMemOperand(MF, MMO); 4582 4583 // There are no 96-bit result scalar loads, but widening to 128-bit should 4584 // always be legal. We may need to restore this to a 96-bit result if it turns 4585 // out this needs to be converted to a vector load during RegBankSelect. 4586 if (!isPowerOf2_32(Size)) { 4587 if (Ty.isVector()) 4588 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4589 else 4590 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4591 } 4592 4593 Observer.changedInstr(MI); 4594 return true; 4595 } 4596 4597 // TODO: Move to selection 4598 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4599 MachineRegisterInfo &MRI, 4600 MachineIRBuilder &B) const { 4601 if (!ST.isTrapHandlerEnabled() || 4602 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4603 return legalizeTrapEndpgm(MI, MRI, B); 4604 4605 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4606 switch (*HsaAbiVer) { 4607 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4608 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4609 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4610 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4611 return ST.supportsGetDoorbellID() ? 4612 legalizeTrapHsa(MI, MRI, B) : 4613 legalizeTrapHsaQueuePtr(MI, MRI, B); 4614 } 4615 } 4616 4617 llvm_unreachable("Unknown trap handler"); 4618 } 4619 4620 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4621 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4622 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4623 MI.eraseFromParent(); 4624 return true; 4625 } 4626 4627 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4628 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4629 // Pass queue pointer to trap handler as input, and insert trap instruction 4630 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4631 Register LiveIn = 4632 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4633 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4634 return false; 4635 4636 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4637 B.buildCopy(SGPR01, LiveIn); 4638 B.buildInstr(AMDGPU::S_TRAP) 4639 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4640 .addReg(SGPR01, RegState::Implicit); 4641 4642 MI.eraseFromParent(); 4643 return true; 4644 } 4645 4646 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4647 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4648 B.buildInstr(AMDGPU::S_TRAP) 4649 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4650 MI.eraseFromParent(); 4651 return true; 4652 } 4653 4654 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4655 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4656 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4657 // accordingly 4658 if (!ST.isTrapHandlerEnabled() || 4659 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4660 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4661 "debugtrap handler not supported", 4662 MI.getDebugLoc(), DS_Warning); 4663 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4664 Ctx.diagnose(NoTrap); 4665 } else { 4666 // Insert debug-trap instruction 4667 B.buildInstr(AMDGPU::S_TRAP) 4668 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4669 } 4670 4671 MI.eraseFromParent(); 4672 return true; 4673 } 4674 4675 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4676 MachineIRBuilder &B) const { 4677 MachineRegisterInfo &MRI = *B.getMRI(); 4678 const LLT S16 = LLT::scalar(16); 4679 const LLT S32 = LLT::scalar(32); 4680 4681 Register DstReg = MI.getOperand(0).getReg(); 4682 Register NodePtr = MI.getOperand(2).getReg(); 4683 Register RayExtent = MI.getOperand(3).getReg(); 4684 Register RayOrigin = MI.getOperand(4).getReg(); 4685 Register RayDir = MI.getOperand(5).getReg(); 4686 Register RayInvDir = MI.getOperand(6).getReg(); 4687 Register TDescr = MI.getOperand(7).getReg(); 4688 4689 if (!ST.hasGFX10_AEncoding()) { 4690 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4691 "intrinsic not supported on subtarget", 4692 MI.getDebugLoc()); 4693 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4694 return false; 4695 } 4696 4697 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4698 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4699 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4700 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4701 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4702 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4703 4704 SmallVector<Register, 12> Ops; 4705 if (Is64) { 4706 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4707 Ops.push_back(Unmerge.getReg(0)); 4708 Ops.push_back(Unmerge.getReg(1)); 4709 } else { 4710 Ops.push_back(NodePtr); 4711 } 4712 Ops.push_back(RayExtent); 4713 4714 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4715 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4716 Ops.push_back(Unmerge.getReg(0)); 4717 Ops.push_back(Unmerge.getReg(1)); 4718 Ops.push_back(Unmerge.getReg(2)); 4719 }; 4720 4721 packLanes(RayOrigin); 4722 if (IsA16) { 4723 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4724 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4725 Register R1 = MRI.createGenericVirtualRegister(S32); 4726 Register R2 = MRI.createGenericVirtualRegister(S32); 4727 Register R3 = MRI.createGenericVirtualRegister(S32); 4728 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4729 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4730 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4731 Ops.push_back(R1); 4732 Ops.push_back(R2); 4733 Ops.push_back(R3); 4734 } else { 4735 packLanes(RayDir); 4736 packLanes(RayInvDir); 4737 } 4738 4739 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4740 .addDef(DstReg) 4741 .addImm(Opcode); 4742 4743 for (Register R : Ops) { 4744 MIB.addUse(R); 4745 } 4746 4747 MIB.addUse(TDescr) 4748 .addImm(IsA16 ? 
1 : 0) 4749 .cloneMemRefs(MI); 4750 4751 MI.eraseFromParent(); 4752 return true; 4753 } 4754 4755 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4756 MachineInstr &MI) const { 4757 MachineIRBuilder &B = Helper.MIRBuilder; 4758 MachineRegisterInfo &MRI = *B.getMRI(); 4759 4760 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4761 auto IntrID = MI.getIntrinsicID(); 4762 switch (IntrID) { 4763 case Intrinsic::amdgcn_if: 4764 case Intrinsic::amdgcn_else: { 4765 MachineInstr *Br = nullptr; 4766 MachineBasicBlock *UncondBrTarget = nullptr; 4767 bool Negated = false; 4768 if (MachineInstr *BrCond = 4769 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4770 const SIRegisterInfo *TRI 4771 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4772 4773 Register Def = MI.getOperand(1).getReg(); 4774 Register Use = MI.getOperand(3).getReg(); 4775 4776 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4777 4778 if (Negated) 4779 std::swap(CondBrTarget, UncondBrTarget); 4780 4781 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4782 if (IntrID == Intrinsic::amdgcn_if) { 4783 B.buildInstr(AMDGPU::SI_IF) 4784 .addDef(Def) 4785 .addUse(Use) 4786 .addMBB(UncondBrTarget); 4787 } else { 4788 B.buildInstr(AMDGPU::SI_ELSE) 4789 .addDef(Def) 4790 .addUse(Use) 4791 .addMBB(UncondBrTarget); 4792 } 4793 4794 if (Br) { 4795 Br->getOperand(0).setMBB(CondBrTarget); 4796 } else { 4797 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4798 // since we're swapping branch targets it needs to be reinserted. 4799 // FIXME: IRTranslator should probably not do this 4800 B.buildBr(*CondBrTarget); 4801 } 4802 4803 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4804 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4805 MI.eraseFromParent(); 4806 BrCond->eraseFromParent(); 4807 return true; 4808 } 4809 4810 return false; 4811 } 4812 case Intrinsic::amdgcn_loop: { 4813 MachineInstr *Br = nullptr; 4814 MachineBasicBlock *UncondBrTarget = nullptr; 4815 bool Negated = false; 4816 if (MachineInstr *BrCond = 4817 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4818 const SIRegisterInfo *TRI 4819 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4820 4821 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4822 Register Reg = MI.getOperand(2).getReg(); 4823 4824 if (Negated) 4825 std::swap(CondBrTarget, UncondBrTarget); 4826 4827 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4828 B.buildInstr(AMDGPU::SI_LOOP) 4829 .addUse(Reg) 4830 .addMBB(UncondBrTarget); 4831 4832 if (Br) 4833 Br->getOperand(0).setMBB(CondBrTarget); 4834 else 4835 B.buildBr(*CondBrTarget); 4836 4837 MI.eraseFromParent(); 4838 BrCond->eraseFromParent(); 4839 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4840 return true; 4841 } 4842 4843 return false; 4844 } 4845 case Intrinsic::amdgcn_kernarg_segment_ptr: 4846 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4847 // This only makes sense to call in a kernel, so just lower to null. 
4848 B.buildConstant(MI.getOperand(0).getReg(), 0); 4849 MI.eraseFromParent(); 4850 return true; 4851 } 4852 4853 return legalizePreloadedArgIntrin( 4854 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4855 case Intrinsic::amdgcn_implicitarg_ptr: 4856 return legalizeImplicitArgPtr(MI, MRI, B); 4857 case Intrinsic::amdgcn_workitem_id_x: 4858 return legalizePreloadedArgIntrin(MI, MRI, B, 4859 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4860 case Intrinsic::amdgcn_workitem_id_y: 4861 return legalizePreloadedArgIntrin(MI, MRI, B, 4862 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4863 case Intrinsic::amdgcn_workitem_id_z: 4864 return legalizePreloadedArgIntrin(MI, MRI, B, 4865 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4866 case Intrinsic::amdgcn_workgroup_id_x: 4867 return legalizePreloadedArgIntrin(MI, MRI, B, 4868 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4869 case Intrinsic::amdgcn_workgroup_id_y: 4870 return legalizePreloadedArgIntrin(MI, MRI, B, 4871 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4872 case Intrinsic::amdgcn_workgroup_id_z: 4873 return legalizePreloadedArgIntrin(MI, MRI, B, 4874 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4875 case Intrinsic::amdgcn_dispatch_ptr: 4876 return legalizePreloadedArgIntrin(MI, MRI, B, 4877 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4878 case Intrinsic::amdgcn_queue_ptr: 4879 return legalizePreloadedArgIntrin(MI, MRI, B, 4880 AMDGPUFunctionArgInfo::QUEUE_PTR); 4881 case Intrinsic::amdgcn_implicit_buffer_ptr: 4882 return legalizePreloadedArgIntrin( 4883 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4884 case Intrinsic::amdgcn_dispatch_id: 4885 return legalizePreloadedArgIntrin(MI, MRI, B, 4886 AMDGPUFunctionArgInfo::DISPATCH_ID); 4887 case Intrinsic::amdgcn_fdiv_fast: 4888 return legalizeFDIVFastIntrin(MI, MRI, B); 4889 case Intrinsic::amdgcn_is_shared: 4890 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4891 case Intrinsic::amdgcn_is_private: 4892 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4893 case Intrinsic::amdgcn_wavefrontsize: { 4894 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4895 MI.eraseFromParent(); 4896 return true; 4897 } 4898 case Intrinsic::amdgcn_s_buffer_load: 4899 return legalizeSBufferLoad(Helper, MI); 4900 case Intrinsic::amdgcn_raw_buffer_store: 4901 case Intrinsic::amdgcn_struct_buffer_store: 4902 return legalizeBufferStore(MI, MRI, B, false, false); 4903 case Intrinsic::amdgcn_raw_buffer_store_format: 4904 case Intrinsic::amdgcn_struct_buffer_store_format: 4905 return legalizeBufferStore(MI, MRI, B, false, true); 4906 case Intrinsic::amdgcn_raw_tbuffer_store: 4907 case Intrinsic::amdgcn_struct_tbuffer_store: 4908 return legalizeBufferStore(MI, MRI, B, true, true); 4909 case Intrinsic::amdgcn_raw_buffer_load: 4910 case Intrinsic::amdgcn_struct_buffer_load: 4911 return legalizeBufferLoad(MI, MRI, B, false, false); 4912 case Intrinsic::amdgcn_raw_buffer_load_format: 4913 case Intrinsic::amdgcn_struct_buffer_load_format: 4914 return legalizeBufferLoad(MI, MRI, B, true, false); 4915 case Intrinsic::amdgcn_raw_tbuffer_load: 4916 case Intrinsic::amdgcn_struct_tbuffer_load: 4917 return legalizeBufferLoad(MI, MRI, B, true, true); 4918 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4919 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4920 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4921 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4922 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4923 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4924 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4925 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4926 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4927 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4928 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4929 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4930 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4931 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4932 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4933 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4934 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4935 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4936 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4937 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4938 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4939 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4940 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4941 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4942 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4943 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4944 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4945 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4946 case Intrinsic::amdgcn_buffer_atomic_fadd: 4947 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4948 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4949 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4950 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4951 return legalizeBufferAtomic(MI, B, IntrID); 4952 case Intrinsic::amdgcn_atomic_inc: 4953 return legalizeAtomicIncDec(MI, B, true); 4954 case Intrinsic::amdgcn_atomic_dec: 4955 return legalizeAtomicIncDec(MI, B, false); 4956 case Intrinsic::trap: 4957 return legalizeTrapIntrinsic(MI, MRI, B); 4958 case Intrinsic::debugtrap: 4959 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4960 case Intrinsic::amdgcn_rsq_clamp: 4961 return legalizeRsqClampIntrinsic(MI, MRI, B); 4962 case Intrinsic::amdgcn_ds_fadd: 4963 case Intrinsic::amdgcn_ds_fmin: 4964 case Intrinsic::amdgcn_ds_fmax: 4965 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4966 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4967 return legalizeBVHIntrinsic(MI, B); 4968 default: { 4969 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4970 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4971 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4972 return true; 4973 } 4974 } 4975 4976 return true; 4977 } 4978