//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// workaround this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will workaround this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  auto &LegacyInfo = getLegacyLegalizerInfo();
  LegacyInfo.setAction({G_BRCOND, S1},
                       LegacyLegalizeActions::Legal); // VCC branches
  LegacyInfo.setAction({G_BRCOND, S32},
                       LegacyLegalizeActions::Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
                       LegacyLegalizeActions::Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
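  // The final decision is deferred to the custom lowering (legalizeFMad,
  // below), which can consult the function's FP/denormal mode.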
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc.
    // This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
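  // As with G_ICMP above, a scalar condition is an s32 produced via a copy
  // from SCC, while a vector condition is an s1 in the VCC bank.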
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
return legalizeInsertVectorElt(MI, MRI, B); 1708 case TargetOpcode::G_SHUFFLE_VECTOR: 1709 return legalizeShuffleVector(MI, MRI, B); 1710 case TargetOpcode::G_FSIN: 1711 case TargetOpcode::G_FCOS: 1712 return legalizeSinCos(MI, MRI, B); 1713 case TargetOpcode::G_GLOBAL_VALUE: 1714 return legalizeGlobalValue(MI, MRI, B); 1715 case TargetOpcode::G_LOAD: 1716 case TargetOpcode::G_SEXTLOAD: 1717 case TargetOpcode::G_ZEXTLOAD: 1718 return legalizeLoad(Helper, MI); 1719 case TargetOpcode::G_FMAD: 1720 return legalizeFMad(MI, MRI, B); 1721 case TargetOpcode::G_FDIV: 1722 return legalizeFDIV(MI, MRI, B); 1723 case TargetOpcode::G_UDIV: 1724 case TargetOpcode::G_UREM: 1725 case TargetOpcode::G_UDIVREM: 1726 return legalizeUnsignedDIV_REM(MI, MRI, B); 1727 case TargetOpcode::G_SDIV: 1728 case TargetOpcode::G_SREM: 1729 case TargetOpcode::G_SDIVREM: 1730 return legalizeSignedDIV_REM(MI, MRI, B); 1731 case TargetOpcode::G_ATOMIC_CMPXCHG: 1732 return legalizeAtomicCmpXChg(MI, MRI, B); 1733 case TargetOpcode::G_FLOG: 1734 return legalizeFlog(MI, B, numbers::ln2f); 1735 case TargetOpcode::G_FLOG10: 1736 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1737 case TargetOpcode::G_FEXP: 1738 return legalizeFExp(MI, B); 1739 case TargetOpcode::G_FPOW: 1740 return legalizeFPow(MI, B); 1741 case TargetOpcode::G_FFLOOR: 1742 return legalizeFFloor(MI, MRI, B); 1743 case TargetOpcode::G_BUILD_VECTOR: 1744 return legalizeBuildVector(MI, MRI, B); 1745 default: 1746 return false; 1747 } 1748 1749 llvm_unreachable("expected switch to return"); 1750 } 1751 1752 Register AMDGPULegalizerInfo::getSegmentAperture( 1753 unsigned AS, 1754 MachineRegisterInfo &MRI, 1755 MachineIRBuilder &B) const { 1756 MachineFunction &MF = B.getMF(); 1757 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1758 const LLT S32 = LLT::scalar(32); 1759 1760 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1761 1762 if (ST.hasApertureRegs()) { 1763 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1764 // getreg. 1765 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1766 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1767 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1768 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1769 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1770 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1771 unsigned Encoding = 1772 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1773 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1774 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1775 1776 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1777 1778 B.buildInstr(AMDGPU::S_GETREG_B32) 1779 .addDef(GetReg) 1780 .addImm(Encoding); 1781 MRI.setType(GetReg, S32); 1782 1783 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1784 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1785 } 1786 1787 Register QueuePtr = MRI.createGenericVirtualRegister( 1788 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1789 1790 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1791 return Register(); 1792 1793 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1794 // private_segment_aperture_base_hi. 1795 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1796 1797 // TODO: can we be smarter about machine pointer info? 
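// The aperture is read out of the queue descriptor; the emitted sequence is
// roughly:
//   %apt.addr = G_PTR_ADD %queue_ptr, 0x40   ; 0x44 for the private aperture
//   %aperture = G_LOAD %apt.addr :: (dereferenceable invariant load 4)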
1798 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1799 MachineMemOperand *MMO = MF.getMachineMemOperand( 1800 PtrInfo, 1801 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1802 MachineMemOperand::MOInvariant, 1803 4, commonAlignment(Align(64), StructOffset)); 1804 1805 Register LoadAddr; 1806 1807 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1808 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1809 } 1810 1811 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1812 MachineInstr &MI, MachineRegisterInfo &MRI, 1813 MachineIRBuilder &B) const { 1814 MachineFunction &MF = B.getMF(); 1815 1816 const LLT S32 = LLT::scalar(32); 1817 Register Dst = MI.getOperand(0).getReg(); 1818 Register Src = MI.getOperand(1).getReg(); 1819 1820 LLT DstTy = MRI.getType(Dst); 1821 LLT SrcTy = MRI.getType(Src); 1822 unsigned DestAS = DstTy.getAddressSpace(); 1823 unsigned SrcAS = SrcTy.getAddressSpace(); 1824 1825 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1826 // vector element. 1827 assert(!DstTy.isVector()); 1828 1829 const AMDGPUTargetMachine &TM 1830 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1831 1832 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1833 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1834 return true; 1835 } 1836 1837 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1838 // Truncate. 1839 B.buildExtract(Dst, Src, 0); 1840 MI.eraseFromParent(); 1841 return true; 1842 } 1843 1844 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1845 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1846 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1847 1848 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1849 // another. Merge operands are required to be the same type, but creating an 1850 // extra ptrtoint would be kind of pointless. 1851 auto HighAddr = B.buildConstant( 1852 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1853 B.buildMerge(Dst, {Src, HighAddr}); 1854 MI.eraseFromParent(); 1855 return true; 1856 } 1857 1858 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1859 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1860 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1861 unsigned NullVal = TM.getNullPointerValue(DestAS); 1862 1863 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1864 auto FlatNull = B.buildConstant(SrcTy, 0); 1865 1866 // Extract low 32-bits of the pointer. 1867 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1868 1869 auto CmpRes = 1870 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1871 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1872 1873 MI.eraseFromParent(); 1874 return true; 1875 } 1876 1877 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1878 return false; 1879 1880 if (!ST.hasFlatAddressSpace()) 1881 return false; 1882 1883 auto SegmentNull = 1884 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1885 auto FlatNull = 1886 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1887 1888 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1889 if (!ApertureReg.isValid()) 1890 return false; 1891 1892 auto CmpRes = 1893 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1894 1895 // Coerce the type of the low half of the result so we can use merge_values. 
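// The overall expansion for a local/private -> flat cast is roughly:
//   %lo  = G_PTRTOINT %src
//   %ptr = G_MERGE_VALUES %lo, %aperture
//   %dst = G_SELECT (%src != segment_null), %ptr, flat_null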
1896 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1897 1898 // TODO: Should we allow mismatched types but matching sizes in merges to 1899 // avoid the ptrtoint? 1900 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1901 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1902 1903 MI.eraseFromParent(); 1904 return true; 1905 } 1906 1907 bool AMDGPULegalizerInfo::legalizeFrint( 1908 MachineInstr &MI, MachineRegisterInfo &MRI, 1909 MachineIRBuilder &B) const { 1910 Register Src = MI.getOperand(1).getReg(); 1911 LLT Ty = MRI.getType(Src); 1912 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1913 1914 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1915 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1916 1917 auto C1 = B.buildFConstant(Ty, C1Val); 1918 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1919 1920 // TODO: Should this propagate fast-math-flags? 1921 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1922 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1923 1924 auto C2 = B.buildFConstant(Ty, C2Val); 1925 auto Fabs = B.buildFAbs(Ty, Src); 1926 1927 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1928 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1929 MI.eraseFromParent(); 1930 return true; 1931 } 1932 1933 bool AMDGPULegalizerInfo::legalizeFceil( 1934 MachineInstr &MI, MachineRegisterInfo &MRI, 1935 MachineIRBuilder &B) const { 1936 1937 const LLT S1 = LLT::scalar(1); 1938 const LLT S64 = LLT::scalar(64); 1939 1940 Register Src = MI.getOperand(1).getReg(); 1941 assert(MRI.getType(Src) == S64); 1942 1943 // result = trunc(src) 1944 // if (src > 0.0 && src != result) 1945 // result += 1.0 1946 1947 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1948 1949 const auto Zero = B.buildFConstant(S64, 0.0); 1950 const auto One = B.buildFConstant(S64, 1.0); 1951 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1952 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1953 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1954 auto Add = B.buildSelect(S64, And, One, Zero); 1955 1956 // TODO: Should this propagate fast-math-flags? 
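// E.g. src = 2.5: Trunc = 2.0 and Add = 1.0, so the final add yields 3.0;
// src = -2.5: Trunc = -2.0, the condition fails, Add = 0.0, and the result
// stays -2.0 (== ceil(-2.5)).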
1957 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); MI.eraseFromParent();
1958 return true;
1959 }
1960 
1961 bool AMDGPULegalizerInfo::legalizeFrem(
1962 MachineInstr &MI, MachineRegisterInfo &MRI,
1963 MachineIRBuilder &B) const {
1964 Register DstReg = MI.getOperand(0).getReg();
1965 Register Src0Reg = MI.getOperand(1).getReg();
1966 Register Src1Reg = MI.getOperand(2).getReg();
1967 auto Flags = MI.getFlags();
1968 LLT Ty = MRI.getType(DstReg);
1969 
1970 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1971 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1972 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1973 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1974 MI.eraseFromParent();
1975 return true;
1976 }
1977 
1978 static MachineInstrBuilder extractF64Exponent(Register Hi,
1979 MachineIRBuilder &B) {
1980 const unsigned FractBits = 52;
1981 const unsigned ExpBits = 11;
1982 LLT S32 = LLT::scalar(32);
1983 
1984 auto Const0 = B.buildConstant(S32, FractBits - 32);
1985 auto Const1 = B.buildConstant(S32, ExpBits);
1986 
1987 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1988 .addUse(Hi)
1989 .addUse(Const0.getReg(0))
1990 .addUse(Const1.getReg(0));
1991 
1992 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1993 }
1994 
1995 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1996 MachineInstr &MI, MachineRegisterInfo &MRI,
1997 MachineIRBuilder &B) const {
1998 const LLT S1 = LLT::scalar(1);
1999 const LLT S32 = LLT::scalar(32);
2000 const LLT S64 = LLT::scalar(64);
2001 
2002 Register Src = MI.getOperand(1).getReg();
2003 assert(MRI.getType(Src) == S64);
2004 
2005 // TODO: Should this use extract since the low half is unused?
2006 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2007 Register Hi = Unmerge.getReg(1);
2008 
2009 // Extract the upper half, since this is where we will find the sign and
2010 // exponent.
2011 auto Exp = extractF64Exponent(Hi, B);
2012 
2013 const unsigned FractBits = 52;
2014 
2015 // Extract the sign bit.
2016 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2017 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2018 
2019 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2020 
2021 const auto Zero32 = B.buildConstant(S32, 0);
2022 
2023 // Extend back to 64-bits.
2024 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2025 
2026 auto Shr = B.buildAShr(S64, FractMask, Exp);
2027 auto Not = B.buildNot(S64, Shr);
2028 auto Tmp0 = B.buildAnd(S64, Src, Not);
2029 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2030 
2031 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2032 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2033 
2034 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2035 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2036 MI.eraseFromParent();
2037 return true;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeITOFP(
2041 MachineInstr &MI, MachineRegisterInfo &MRI,
2042 MachineIRBuilder &B, bool Signed) const {
2043 
2044 Register Dst = MI.getOperand(0).getReg();
2045 Register Src = MI.getOperand(1).getReg();
2046 
2047 const LLT S64 = LLT::scalar(64);
2048 const LLT S32 = LLT::scalar(32);
2049 
2050 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2051 
2052 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2053 
2054 auto CvtHi = Signed ?
2055 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2056 B.buildUITOFP(S64, Unmerge.getReg(1)); 2057 2058 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2059 2060 auto ThirtyTwo = B.buildConstant(S32, 32); 2061 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2062 .addUse(CvtHi.getReg(0)) 2063 .addUse(ThirtyTwo.getReg(0)); 2064 2065 // TODO: Should this propagate fast-math-flags? 2066 B.buildFAdd(Dst, LdExp, CvtLo); 2067 MI.eraseFromParent(); 2068 return true; 2069 } 2070 2071 // TODO: Copied from DAG implementation. Verify logic and document how this 2072 // actually works. 2073 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2074 MachineRegisterInfo &MRI, 2075 MachineIRBuilder &B, 2076 bool Signed) const { 2077 2078 Register Dst = MI.getOperand(0).getReg(); 2079 Register Src = MI.getOperand(1).getReg(); 2080 2081 const LLT S64 = LLT::scalar(64); 2082 const LLT S32 = LLT::scalar(32); 2083 2084 const LLT SrcLT = MRI.getType(Src); 2085 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2086 2087 unsigned Flags = MI.getFlags(); 2088 2089 // The basic idea of converting a floating point number into a pair of 32-bit 2090 // integers is illustrated as follows: 2091 // 2092 // tf := trunc(val); 2093 // hif := floor(tf * 2^-32); 2094 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2095 // hi := fptoi(hif); 2096 // lo := fptoi(lof); 2097 // 2098 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2099 MachineInstrBuilder Sign; 2100 if (Signed && SrcLT == S32) { 2101 // However, a 32-bit floating point number has only 23 bits mantissa and 2102 // it's not enough to hold all the significant bits of `lof` if val is 2103 // negative. To avoid the loss of precision, We need to take the absolute 2104 // value after truncating and flip the result back based on the original 2105 // signedness. 2106 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2107 Trunc = B.buildFAbs(S32, Trunc, Flags); 2108 } 2109 MachineInstrBuilder K0, K1; 2110 if (SrcLT == S64) { 2111 K0 = B.buildFConstant(S64, 2112 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2113 K1 = B.buildFConstant(S64, 2114 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2115 } else { 2116 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2117 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2118 } 2119 2120 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2121 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2122 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2123 2124 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2125 : B.buildFPTOUI(S32, FloorMul); 2126 auto Lo = B.buildFPTOUI(S32, Fma); 2127 2128 if (Signed && SrcLT == S32) { 2129 // Flip the result based on the signedness, which is either all 0s or 1s. 
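// The xor/sub pair below is the standard branchless negation: (x ^ s) - s
// leaves x unchanged when s == 0 and produces -x when s == -1 (all ones).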
2130 Sign = B.buildMerge(S64, {Sign, Sign}); 2131 // r := xor({lo, hi}, sign) - sign; 2132 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2133 } else 2134 B.buildMerge(Dst, {Lo, Hi}); 2135 MI.eraseFromParent(); 2136 2137 return true; 2138 } 2139 2140 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2141 MachineInstr &MI) const { 2142 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2143 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2144 2145 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2146 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2147 2148 // With ieee_mode disabled, the instructions have the correct behavior 2149 // already for G_FMINNUM/G_FMAXNUM 2150 if (!MFI->getMode().IEEE) 2151 return !IsIEEEOp; 2152 2153 if (IsIEEEOp) 2154 return true; 2155 2156 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2157 } 2158 2159 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2160 MachineInstr &MI, MachineRegisterInfo &MRI, 2161 MachineIRBuilder &B) const { 2162 // TODO: Should move some of this into LegalizerHelper. 2163 2164 // TODO: Promote dynamic indexing of s16 to s32 2165 2166 // FIXME: Artifact combiner probably should have replaced the truncated 2167 // constant before this, so we shouldn't need 2168 // getConstantVRegValWithLookThrough. 2169 Optional<ValueAndVReg> MaybeIdxVal = 2170 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2171 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2172 return true; 2173 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2174 2175 Register Dst = MI.getOperand(0).getReg(); 2176 Register Vec = MI.getOperand(1).getReg(); 2177 2178 LLT VecTy = MRI.getType(Vec); 2179 LLT EltTy = VecTy.getElementType(); 2180 assert(EltTy == MRI.getType(Dst)); 2181 2182 if (IdxVal < VecTy.getNumElements()) 2183 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2184 else 2185 B.buildUndef(Dst); 2186 2187 MI.eraseFromParent(); 2188 return true; 2189 } 2190 2191 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2192 MachineInstr &MI, MachineRegisterInfo &MRI, 2193 MachineIRBuilder &B) const { 2194 // TODO: Should move some of this into LegalizerHelper. 2195 2196 // TODO: Promote dynamic indexing of s16 to s32 2197 2198 // FIXME: Artifact combiner probably should have replaced the truncated 2199 // constant before this, so we shouldn't need 2200 // getConstantVRegValWithLookThrough. 2201 Optional<ValueAndVReg> MaybeIdxVal = 2202 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2203 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2204 return true; 2205 2206 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2207 Register Dst = MI.getOperand(0).getReg(); 2208 Register Vec = MI.getOperand(1).getReg(); 2209 Register Ins = MI.getOperand(2).getReg(); 2210 2211 LLT VecTy = MRI.getType(Vec); 2212 LLT EltTy = VecTy.getElementType(); 2213 assert(EltTy == MRI.getType(Ins)); 2214 2215 if (IdxVal < VecTy.getNumElements()) 2216 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2217 else 2218 B.buildUndef(Dst); 2219 2220 MI.eraseFromParent(); 2221 return true; 2222 } 2223 2224 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2225 MachineInstr &MI, MachineRegisterInfo &MRI, 2226 MachineIRBuilder &B) const { 2227 const LLT V2S16 = LLT::vector(2, 16); 2228 2229 Register Dst = MI.getOperand(0).getReg(); 2230 Register Src0 = MI.getOperand(1).getReg(); 2231 LLT DstTy = MRI.getType(Dst); 2232 LLT SrcTy = MRI.getType(Src0); 2233 2234 if (SrcTy == V2S16 && DstTy == V2S16 && 2235 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2236 return true; 2237 2238 MachineIRBuilder HelperBuilder(MI); 2239 GISelObserverWrapper DummyObserver; 2240 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2241 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2242 } 2243 2244 bool AMDGPULegalizerInfo::legalizeSinCos( 2245 MachineInstr &MI, MachineRegisterInfo &MRI, 2246 MachineIRBuilder &B) const { 2247 2248 Register DstReg = MI.getOperand(0).getReg(); 2249 Register SrcReg = MI.getOperand(1).getReg(); 2250 LLT Ty = MRI.getType(DstReg); 2251 unsigned Flags = MI.getFlags(); 2252 2253 Register TrigVal; 2254 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2255 if (ST.hasTrigReducedRange()) { 2256 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2257 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2258 .addUse(MulVal.getReg(0)) 2259 .setMIFlags(Flags).getReg(0); 2260 } else 2261 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2262 2263 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2264 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2265 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2266 .addUse(TrigVal) 2267 .setMIFlags(Flags); 2268 MI.eraseFromParent(); 2269 return true; 2270 } 2271 2272 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2273 MachineIRBuilder &B, 2274 const GlobalValue *GV, 2275 int64_t Offset, 2276 unsigned GAFlags) const { 2277 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2278 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2279 // to the following code sequence: 2280 // 2281 // For constant address space: 2282 // s_getpc_b64 s[0:1] 2283 // s_add_u32 s0, s0, $symbol 2284 // s_addc_u32 s1, s1, 0 2285 // 2286 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2287 // a fixup or relocation is emitted to replace $symbol with a literal 2288 // constant, which is a pc-relative offset from the encoding of the $symbol 2289 // operand to the global variable. 
2290 // 2291 // For global address space: 2292 // s_getpc_b64 s[0:1] 2293 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2294 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2295 // 2296 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2297 // fixups or relocations are emitted to replace $symbol@*@lo and 2298 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2299 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2300 // operand to the global variable. 2301 // 2302 // What we want here is an offset from the value returned by s_getpc 2303 // (which is the address of the s_add_u32 instruction) to the global 2304 // variable, but since the encoding of $symbol starts 4 bytes after the start 2305 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2306 // small. This requires us to add 4 to the global variable offset in order to 2307 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2308 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2309 // instruction. 2310 2311 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2312 2313 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2314 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2315 2316 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2317 .addDef(PCReg); 2318 2319 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2320 if (GAFlags == SIInstrInfo::MO_NONE) 2321 MIB.addImm(0); 2322 else 2323 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2324 2325 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2326 2327 if (PtrTy.getSizeInBits() == 32) 2328 B.buildExtract(DstReg, PCReg, 0); 2329 return true; 2330 } 2331 2332 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2333 MachineInstr &MI, MachineRegisterInfo &MRI, 2334 MachineIRBuilder &B) const { 2335 Register DstReg = MI.getOperand(0).getReg(); 2336 LLT Ty = MRI.getType(DstReg); 2337 unsigned AS = Ty.getAddressSpace(); 2338 2339 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2340 MachineFunction &MF = B.getMF(); 2341 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2342 2343 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2344 if (!MFI->isModuleEntryFunction() && 2345 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2346 const Function &Fn = MF.getFunction(); 2347 DiagnosticInfoUnsupported BadLDSDecl( 2348 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2349 DS_Warning); 2350 Fn.getContext().diagnose(BadLDSDecl); 2351 2352 // We currently don't have a way to correctly allocate LDS objects that 2353 // aren't directly associated with a kernel. We do force inlining of 2354 // functions that use local objects. However, if these dead functions are 2355 // not eliminated, we don't want a compile time error. Just emit a warning 2356 // and a trap, since there should be no callable path here. 2357 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2358 B.buildUndef(DstReg); 2359 MI.eraseFromParent(); 2360 return true; 2361 } 2362 2363 // TODO: We could emit code to handle the initialization somewhere. 
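// When there is no initializer the address is just an LDS offset we can
// materialize here: either the statically allocated offset computed below,
// or, for zero-sized dynamic LDS arrays, the end of the static allocation
// via llvm.amdgcn.groupstaticsize.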
2364 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2365 const SITargetLowering *TLI = ST.getTargetLowering();
2366 if (!TLI->shouldUseLDSConstAddress(GV)) {
2367 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2368 return true; // Leave in place.
2369 }
2370 
2371 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2372 Type *Ty = GV->getValueType();
2373 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
2374 // zero-sized type in other languages) to declare dynamic shared memory
2375 // whose size is not known at compile time. It is allocated by the
2376 // runtime and placed directly after the statically allocated LDS, so
2377 // all such arrays share the same offset.
2378 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2379 // Adjust alignment for that dynamic shared memory array.
2380 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2381 LLT S32 = LLT::scalar(32);
2382 auto Sz =
2383 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2384 B.buildIntToPtr(DstReg, Sz);
2385 MI.eraseFromParent();
2386 return true;
2387 }
2388 }
2389 
2390 B.buildConstant(
2391 DstReg,
2392 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2393 MI.eraseFromParent();
2394 return true;
2395 }
2396 
2397 const Function &Fn = MF.getFunction();
2398 DiagnosticInfoUnsupported BadInit(
2399 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2400 Fn.getContext().diagnose(BadInit);
2401 return true;
2402 }
2403 
2404 const SITargetLowering *TLI = ST.getTargetLowering();
2405 
2406 if (TLI->shouldEmitFixup(GV)) {
2407 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2408 MI.eraseFromParent();
2409 return true;
2410 }
2411 
2412 if (TLI->shouldEmitPCReloc(GV)) {
2413 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2414 MI.eraseFromParent();
2415 return true;
2416 }
2417 
2418 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2419 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2420 
2421 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2422 MachinePointerInfo::getGOT(MF),
2423 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2424 MachineMemOperand::MOInvariant,
2425 8 /*Size*/, Align(8));
2426 
2427 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2428 
2429 if (Ty.getSizeInBits() == 32) {
2430 // Truncate if this is a 32-bit constant address.
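// i.e. roughly:
//   %got = G_LOAD %gotaddr :: (dereferenceable invariant load 8 from got)
//   %dst = G_EXTRACT %got, 0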
2431 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2432 B.buildExtract(DstReg, Load, 0); 2433 } else 2434 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2435 2436 MI.eraseFromParent(); 2437 return true; 2438 } 2439 2440 static LLT widenToNextPowerOf2(LLT Ty) { 2441 if (Ty.isVector()) 2442 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2443 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2444 } 2445 2446 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2447 MachineInstr &MI) const { 2448 MachineIRBuilder &B = Helper.MIRBuilder; 2449 MachineRegisterInfo &MRI = *B.getMRI(); 2450 GISelChangeObserver &Observer = Helper.Observer; 2451 2452 Register PtrReg = MI.getOperand(1).getReg(); 2453 LLT PtrTy = MRI.getType(PtrReg); 2454 unsigned AddrSpace = PtrTy.getAddressSpace(); 2455 2456 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2457 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2458 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2459 Observer.changingInstr(MI); 2460 MI.getOperand(1).setReg(Cast.getReg(0)); 2461 Observer.changedInstr(MI); 2462 return true; 2463 } 2464 2465 if (MI.getOpcode() != AMDGPU::G_LOAD) 2466 return false; 2467 2468 Register ValReg = MI.getOperand(0).getReg(); 2469 LLT ValTy = MRI.getType(ValReg); 2470 2471 MachineMemOperand *MMO = *MI.memoperands_begin(); 2472 const unsigned ValSize = ValTy.getSizeInBits(); 2473 const unsigned MemSize = 8 * MMO->getSize(); 2474 const Align MemAlign = MMO->getAlign(); 2475 const unsigned AlignInBits = 8 * MemAlign.value(); 2476 2477 // Widen non-power-of-2 loads to the alignment if needed 2478 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2479 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2480 2481 // This was already the correct extending load result type, so just adjust 2482 // the memory type. 2483 if (WideMemSize == ValSize) { 2484 MachineFunction &MF = B.getMF(); 2485 2486 MachineMemOperand *WideMMO = 2487 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2488 Observer.changingInstr(MI); 2489 MI.setMemRefs(MF, {WideMMO}); 2490 Observer.changedInstr(MI); 2491 return true; 2492 } 2493 2494 // Don't bother handling edge case that should probably never be produced. 2495 if (ValSize > WideMemSize) 2496 return false; 2497 2498 LLT WideTy = widenToNextPowerOf2(ValTy); 2499 2500 Register WideLoad; 2501 if (!WideTy.isVector()) { 2502 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2503 B.buildTrunc(ValReg, WideLoad).getReg(0); 2504 } else { 2505 // Extract the subvector. 2506 2507 if (isRegisterType(ValTy)) { 2508 // If this a case where G_EXTRACT is legal, use it. 2509 // (e.g. <3 x s32> -> <4 x s32>) 2510 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2511 B.buildExtract(ValReg, WideLoad, 0); 2512 } else { 2513 // For cases where the widened type isn't a nice register value, unmerge 2514 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2515 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2516 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2517 B.setInsertPt(B.getMBB(), MI.getIterator()); 2518 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2519 } 2520 } 2521 2522 MI.eraseFromParent(); 2523 return true; 2524 } 2525 2526 return false; 2527 } 2528 2529 bool AMDGPULegalizerInfo::legalizeFMad( 2530 MachineInstr &MI, MachineRegisterInfo &MRI, 2531 MachineIRBuilder &B) const { 2532 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2533 assert(Ty.isScalar()); 2534 2535 MachineFunction &MF = B.getMF(); 2536 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2537 2538 // TODO: Always legal with future ftz flag. 2539 // FIXME: Do we need just output? 2540 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2541 return true; 2542 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2543 return true; 2544 2545 MachineIRBuilder HelperBuilder(MI); 2546 GISelObserverWrapper DummyObserver; 2547 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2548 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2549 } 2550 2551 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2552 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2553 Register DstReg = MI.getOperand(0).getReg(); 2554 Register PtrReg = MI.getOperand(1).getReg(); 2555 Register CmpVal = MI.getOperand(2).getReg(); 2556 Register NewVal = MI.getOperand(3).getReg(); 2557 2558 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2559 "this should not have been custom lowered"); 2560 2561 LLT ValTy = MRI.getType(CmpVal); 2562 LLT VecTy = LLT::vector(2, ValTy); 2563 2564 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2565 2566 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2567 .addDef(DstReg) 2568 .addUse(PtrReg) 2569 .addUse(PackedVal) 2570 .setMemRefs(MI.memoperands()); 2571 2572 MI.eraseFromParent(); 2573 return true; 2574 } 2575 2576 bool AMDGPULegalizerInfo::legalizeFlog( 2577 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2578 Register Dst = MI.getOperand(0).getReg(); 2579 Register Src = MI.getOperand(1).getReg(); 2580 LLT Ty = B.getMRI()->getType(Dst); 2581 unsigned Flags = MI.getFlags(); 2582 2583 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2584 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2585 2586 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2587 MI.eraseFromParent(); 2588 return true; 2589 } 2590 2591 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2592 MachineIRBuilder &B) const { 2593 Register Dst = MI.getOperand(0).getReg(); 2594 Register Src = MI.getOperand(1).getReg(); 2595 unsigned Flags = MI.getFlags(); 2596 LLT Ty = B.getMRI()->getType(Dst); 2597 2598 auto K = B.buildFConstant(Ty, numbers::log2e); 2599 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2600 B.buildFExp2(Dst, Mul, Flags); 2601 MI.eraseFromParent(); 2602 return true; 2603 } 2604 2605 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2606 MachineIRBuilder &B) const { 2607 Register Dst = MI.getOperand(0).getReg(); 2608 Register Src0 = MI.getOperand(1).getReg(); 2609 Register Src1 = MI.getOperand(2).getReg(); 2610 unsigned Flags = MI.getFlags(); 2611 LLT Ty = B.getMRI()->getType(Dst); 2612 const LLT S16 = LLT::scalar(16); 2613 const LLT S32 = LLT::scalar(32); 2614 2615 if (Ty == S32) { 2616 auto Log = B.buildFLog2(S32, Src0, Flags); 2617 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2618 .addUse(Log.getReg(0)) 2619 .addUse(Src1) 2620 .setMIFlags(Flags); 2621 B.buildFExp2(Dst, Mul, Flags); 2622 } else if (Ty == S16) { 2623 // There's no f16 fmul_legacy, so we need to convert for it. 2624 auto Log = B.buildFLog2(S16, Src0, Flags); 2625 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2626 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2627 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2628 .addUse(Ext0.getReg(0)) 2629 .addUse(Ext1.getReg(0)) 2630 .setMIFlags(Flags); 2631 2632 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2633 } else 2634 return false; 2635 2636 MI.eraseFromParent(); 2637 return true; 2638 } 2639 2640 // Find a source register, ignoring any possible source modifiers. 2641 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2642 Register ModSrc = OrigSrc; 2643 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2644 ModSrc = SrcFNeg->getOperand(1).getReg(); 2645 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2646 ModSrc = SrcFAbs->getOperand(1).getReg(); 2647 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2648 ModSrc = SrcFAbs->getOperand(1).getReg(); 2649 return ModSrc; 2650 } 2651 2652 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2653 MachineRegisterInfo &MRI, 2654 MachineIRBuilder &B) const { 2655 2656 const LLT S1 = LLT::scalar(1); 2657 const LLT S64 = LLT::scalar(64); 2658 Register Dst = MI.getOperand(0).getReg(); 2659 Register OrigSrc = MI.getOperand(1).getReg(); 2660 unsigned Flags = MI.getFlags(); 2661 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2662 "this should not have been custom lowered"); 2663 2664 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2665 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2666 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2667 // V_FRACT bug is: 2668 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2669 // 2670 // Convert floor(x) to (x - fract(x)) 2671 2672 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2673 .addUse(OrigSrc) 2674 .setMIFlags(Flags); 2675 2676 // Give source modifier matching some assistance before obscuring a foldable 2677 // pattern. 2678 2679 // TODO: We can avoid the neg on the fract? The input sign to fract 2680 // shouldn't matter? 2681 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2682 2683 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2684 2685 Register Min = MRI.createGenericVirtualRegister(S64); 2686 2687 // We don't need to concern ourselves with the snan handling difference, so 2688 // use the one which will directly select. 
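// i.e. use G_FMINNUM_IEEE when the function runs in IEEE mode and plain
// G_FMINNUM otherwise; either way fract(x) is clamped to at most
// 0x1.fffffffffffffp-1 (the constant built above).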
2689 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2690 if (MFI->getMode().IEEE) 2691 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2692 else 2693 B.buildFMinNum(Min, Fract, Const, Flags); 2694 2695 Register CorrectedFract = Min; 2696 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2697 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2698 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2699 } 2700 2701 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2702 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2703 2704 MI.eraseFromParent(); 2705 return true; 2706 } 2707 2708 // Turn an illegal packed v2s16 build vector into bit operations. 2709 // TODO: This should probably be a bitcast action in LegalizerHelper. 2710 bool AMDGPULegalizerInfo::legalizeBuildVector( 2711 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2712 Register Dst = MI.getOperand(0).getReg(); 2713 const LLT S32 = LLT::scalar(32); 2714 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2715 2716 Register Src0 = MI.getOperand(1).getReg(); 2717 Register Src1 = MI.getOperand(2).getReg(); 2718 assert(MRI.getType(Src0) == LLT::scalar(16)); 2719 2720 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2721 B.buildBitcast(Dst, Merge); 2722 2723 MI.eraseFromParent(); 2724 return true; 2725 } 2726 2727 // Check that this is a G_XOR x, -1 2728 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2729 if (MI.getOpcode() != TargetOpcode::G_XOR) 2730 return false; 2731 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2732 return ConstVal && *ConstVal == -1; 2733 } 2734 2735 // Return the use branch instruction, otherwise null if the usage is invalid. 2736 static MachineInstr * 2737 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2738 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2739 Register CondDef = MI.getOperand(0).getReg(); 2740 if (!MRI.hasOneNonDBGUse(CondDef)) 2741 return nullptr; 2742 2743 MachineBasicBlock *Parent = MI.getParent(); 2744 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2745 2746 if (isNot(MRI, *UseMI)) { 2747 Register NegatedCond = UseMI->getOperand(0).getReg(); 2748 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2749 return nullptr; 2750 2751 // We're deleting the def of this value, so we need to remove it. 2752 UseMI->eraseFromParent(); 2753 2754 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2755 Negated = true; 2756 } 2757 2758 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2759 return nullptr; 2760 2761 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2762 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2763 if (Next == Parent->end()) { 2764 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2765 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2766 return nullptr; 2767 UncondBrTarget = &*NextMBB; 2768 } else { 2769 if (Next->getOpcode() != AMDGPU::G_BR) 2770 return nullptr; 2771 Br = &*Next; 2772 UncondBrTarget = Br->getOperand(0).getMBB(); 2773 } 2774 2775 return UseMI; 2776 } 2777 2778 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2779 const ArgDescriptor *Arg, 2780 const TargetRegisterClass *ArgRC, 2781 LLT ArgTy) const { 2782 MCRegister SrcReg = Arg->getRegister(); 2783 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2784 assert(DstReg.isVirtual() && "Virtual register expected"); 2785 2786 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2787 ArgTy); 2788 if (Arg->isMasked()) { 2789 // TODO: Should we try to emit this once in the entry block? 2790 const LLT S32 = LLT::scalar(32); 2791 const unsigned Mask = Arg->getMask(); 2792 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2793 2794 Register AndMaskSrc = LiveIn; 2795 2796 if (Shift != 0) { 2797 auto ShiftAmt = B.buildConstant(S32, Shift); 2798 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2799 } 2800 2801 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2802 } else { 2803 B.buildCopy(DstReg, LiveIn); 2804 } 2805 2806 return true; 2807 } 2808 2809 bool AMDGPULegalizerInfo::loadInputValue( 2810 Register DstReg, MachineIRBuilder &B, 2811 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2812 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2813 const ArgDescriptor *Arg; 2814 const TargetRegisterClass *ArgRC; 2815 LLT ArgTy; 2816 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2817 2818 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2819 return false; // TODO: Handle these 2820 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2821 } 2822 2823 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2824 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2825 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2826 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2827 return false; 2828 2829 MI.eraseFromParent(); 2830 return true; 2831 } 2832 2833 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2834 MachineRegisterInfo &MRI, 2835 MachineIRBuilder &B) const { 2836 Register Dst = MI.getOperand(0).getReg(); 2837 LLT DstTy = MRI.getType(Dst); 2838 LLT S16 = LLT::scalar(16); 2839 LLT S32 = LLT::scalar(32); 2840 LLT S64 = LLT::scalar(64); 2841 2842 if (DstTy == S16) 2843 return legalizeFDIV16(MI, MRI, B); 2844 if (DstTy == S32) 2845 return legalizeFDIV32(MI, MRI, B); 2846 if (DstTy == S64) 2847 return legalizeFDIV64(MI, MRI, B); 2848 2849 return false; 2850 } 2851 2852 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2853 Register DstDivReg, 2854 Register DstRemReg, 2855 Register X, 2856 Register Y) const { 2857 const LLT S1 = LLT::scalar(1); 2858 const LLT S32 = LLT::scalar(32); 2859 2860 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2861 // algorithm used here. 2862 2863 // Initial estimate of inv(y). 2864 auto FloatY = B.buildUITOFP(S32, Y); 2865 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2866 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2867 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2868 auto Z = B.buildFPTOUI(S32, ScaledY); 2869 2870 // One round of UNR. 
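// UNR (unsigned Newton-Raphson): Z approximates 2^32 / Y in fixed point. The
// error term 2^32 - Y*Z is computed below as -Y*Z modulo 2^32, and Z is then
// refined to Z + umulh(Z, -Y*Z).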
2871 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2872 auto NegYZ = B.buildMul(S32, NegY, Z); 2873 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2874 2875 // Quotient/remainder estimate. 2876 auto Q = B.buildUMulH(S32, X, Z); 2877 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2878 2879 // First quotient/remainder refinement. 2880 auto One = B.buildConstant(S32, 1); 2881 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2882 if (DstDivReg) 2883 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2884 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2885 2886 // Second quotient/remainder refinement. 2887 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2888 if (DstDivReg) 2889 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2890 2891 if (DstRemReg) 2892 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2893 } 2894 2895 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2896 // 2897 // Return lo, hi of result 2898 // 2899 // %cvt.lo = G_UITOFP Val.lo 2900 // %cvt.hi = G_UITOFP Val.hi 2901 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2902 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2903 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2904 // %mul2 = G_FMUL %mul1, 2**(-32) 2905 // %trunc = G_INTRINSIC_TRUNC %mul2 2906 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2907 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2908 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2909 Register Val) { 2910 const LLT S32 = LLT::scalar(32); 2911 auto Unmerge = B.buildUnmerge(S32, Val); 2912 2913 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2914 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2915 2916 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2917 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2918 2919 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2920 auto Mul1 = 2921 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2922 2923 // 2**(-32) 2924 auto Mul2 = 2925 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2926 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2927 2928 // -(2**32) 2929 auto Mad2 = B.buildFMAD(S32, Trunc, 2930 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2931 2932 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2933 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2934 2935 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2936 } 2937 2938 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2939 Register DstDivReg, 2940 Register DstRemReg, 2941 Register Numer, 2942 Register Denom) const { 2943 const LLT S32 = LLT::scalar(32); 2944 const LLT S64 = LLT::scalar(64); 2945 const LLT S1 = LLT::scalar(1); 2946 Register RcpLo, RcpHi; 2947 2948 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2949 2950 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2951 2952 auto Zero64 = B.buildConstant(S64, 0); 2953 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2954 2955 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2956 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2957 2958 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2959 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2960 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2961 2962 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2963 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2964 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2965 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2966 2967 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2968 
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2969 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2970 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2971 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2972 2973 auto Zero32 = B.buildConstant(S32, 0); 2974 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2975 auto Add2_HiC = 2976 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2977 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2978 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2979 2980 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2981 Register NumerLo = UnmergeNumer.getReg(0); 2982 Register NumerHi = UnmergeNumer.getReg(1); 2983 2984 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2985 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2986 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2987 Register Mul3_Lo = UnmergeMul3.getReg(0); 2988 Register Mul3_Hi = UnmergeMul3.getReg(1); 2989 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2990 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2991 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2992 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2993 2994 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2995 Register DenomLo = UnmergeDenom.getReg(0); 2996 Register DenomHi = UnmergeDenom.getReg(1); 2997 2998 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2999 auto C1 = B.buildSExt(S32, CmpHi); 3000 3001 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3002 auto C2 = B.buildSExt(S32, CmpLo); 3003 3004 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3005 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3006 3007 // TODO: Here and below portions of the code can be enclosed into if/endif. 3008 // Currently control flow is unconditional and we have 4 selects after 3009 // potential endif to substitute PHIs. 3010 3011 // if C3 != 0 ... 
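// i.e. if the remainder estimate Sub1 is still >= Denom, subtract Denom once
// more (Sub2) and add one to the quotient (Add3); C6 below guards a possible
// second correction (Add4 / Sub3).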
3012 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3013 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3014 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3015 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3016 3017 auto One64 = B.buildConstant(S64, 1); 3018 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3019 3020 auto C4 = 3021 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3022 auto C5 = 3023 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3024 auto C6 = B.buildSelect( 3025 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3026 3027 // if (C6 != 0) 3028 auto Add4 = B.buildAdd(S64, Add3, One64); 3029 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3030 3031 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3032 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3033 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3034 3035 // endif C6 3036 // endif C3 3037 3038 if (DstDivReg) { 3039 auto Sel1 = B.buildSelect( 3040 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3041 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3042 Sel1, MulHi3); 3043 } 3044 3045 if (DstRemReg) { 3046 auto Sel2 = B.buildSelect( 3047 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3048 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3049 Sel2, Sub1); 3050 } 3051 } 3052 3053 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3054 MachineRegisterInfo &MRI, 3055 MachineIRBuilder &B) const { 3056 Register DstDivReg, DstRemReg; 3057 switch (MI.getOpcode()) { 3058 default: 3059 llvm_unreachable("Unexpected opcode!"); 3060 case AMDGPU::G_UDIV: { 3061 DstDivReg = MI.getOperand(0).getReg(); 3062 break; 3063 } 3064 case AMDGPU::G_UREM: { 3065 DstRemReg = MI.getOperand(0).getReg(); 3066 break; 3067 } 3068 case AMDGPU::G_UDIVREM: { 3069 DstDivReg = MI.getOperand(0).getReg(); 3070 DstRemReg = MI.getOperand(1).getReg(); 3071 break; 3072 } 3073 } 3074 3075 const LLT S64 = LLT::scalar(64); 3076 const LLT S32 = LLT::scalar(32); 3077 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3078 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3079 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3080 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3081 3082 if (Ty == S32) 3083 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3084 else if (Ty == S64) 3085 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3086 else 3087 return false; 3088 3089 MI.eraseFromParent(); 3090 return true; 3091 } 3092 3093 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3094 MachineRegisterInfo &MRI, 3095 MachineIRBuilder &B) const { 3096 const LLT S64 = LLT::scalar(64); 3097 const LLT S32 = LLT::scalar(32); 3098 3099 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3100 if (Ty != S32 && Ty != S64) 3101 return false; 3102 3103 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3104 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3105 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3106 3107 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3108 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3109 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3110 3111 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3112 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3113 3114 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3115 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3116 3117 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3118 switch (MI.getOpcode()) { 3119 default: 3120 llvm_unreachable("Unexpected opcode!"); 3121 case AMDGPU::G_SDIV: { 3122 DstDivReg = MI.getOperand(0).getReg(); 3123 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3124 break; 3125 } 3126 case AMDGPU::G_SREM: { 3127 DstRemReg = MI.getOperand(0).getReg(); 3128 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3129 break; 3130 } 3131 case AMDGPU::G_SDIVREM: { 3132 DstDivReg = MI.getOperand(0).getReg(); 3133 DstRemReg = MI.getOperand(1).getReg(); 3134 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3135 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3136 break; 3137 } 3138 } 3139 3140 if (Ty == S32) 3141 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3142 else 3143 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3144 3145 if (DstDivReg) { 3146 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3147 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3148 B.buildSub(DstDivReg, SignXor, Sign); 3149 } 3150 3151 if (DstRemReg) { 3152 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3153 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3154 B.buildSub(DstRemReg, SignXor, Sign); 3155 } 3156 3157 MI.eraseFromParent(); 3158 return true; 3159 } 3160 3161 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3162 MachineRegisterInfo &MRI, 3163 MachineIRBuilder &B) const { 3164 Register Res = MI.getOperand(0).getReg(); 3165 Register LHS = MI.getOperand(1).getReg(); 3166 Register RHS = MI.getOperand(2).getReg(); 3167 uint16_t Flags = MI.getFlags(); 3168 LLT ResTy = MRI.getType(Res); 3169 3170 const MachineFunction &MF = B.getMF(); 3171 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3172 MI.getFlag(MachineInstr::FmAfn); 3173 3174 if (!AllowInaccurateRcp) 3175 return false; 3176 3177 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3178 // 1 / x -> RCP(x) 3179 if (CLHS->isExactlyValue(1.0)) { 3180 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3181 .addUse(RHS) 3182 .setMIFlags(Flags); 3183 3184 MI.eraseFromParent(); 3185 return true; 3186 } 3187 3188 // -1 / x -> RCP( FNEG(x) ) 3189 if (CLHS->isExactlyValue(-1.0)) { 3190 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3191 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3192 .addUse(FNeg.getReg(0)) 3193 .setMIFlags(Flags); 3194 3195 MI.eraseFromParent(); 3196 return true; 3197 } 3198 } 3199 3200 // x / y -> x * (1.0 / y) 3201 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3202 .addUse(RHS) 3203 .setMIFlags(Flags); 3204 B.buildFMul(Res, LHS, RCP, Flags); 3205 3206 MI.eraseFromParent(); 3207 return true; 3208 } 3209 3210 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3211 MachineRegisterInfo &MRI, 3212 MachineIRBuilder &B) const { 3213 Register Res = MI.getOperand(0).getReg(); 3214 Register X = MI.getOperand(1).getReg(); 3215 Register Y = MI.getOperand(2).getReg(); 3216 uint16_t Flags = MI.getFlags(); 3217 LLT ResTy = MRI.getType(Res); 3218 3219 const MachineFunction &MF = B.getMF(); 3220 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3221 MI.getFlag(MachineInstr::FmAfn); 3222 3223 if (!AllowInaccurateRcp) 3224 return false; 3225 3226 auto NegY = B.buildFNeg(ResTy, Y); 3227 auto One = B.buildFConstant(ResTy, 1.0); 3228 3229 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3230 .addUse(Y) 3231 .setMIFlags(Flags); 3232 3233 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3234 R = B.buildFMA(ResTy, Tmp0, R, R); 3235 3236 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3237 R = B.buildFMA(ResTy, Tmp1, R, R); 3238 3239 auto Ret = B.buildFMul(ResTy, X, R); 3240 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3241 3242 B.buildFMA(Res, Tmp2, R, Ret); 3243 MI.eraseFromParent(); 3244 return true; 3245 } 3246 3247 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3248 MachineRegisterInfo &MRI, 3249 MachineIRBuilder &B) const { 3250 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3251 return true; 3252 3253 Register Res = MI.getOperand(0).getReg(); 3254 Register LHS = MI.getOperand(1).getReg(); 3255 Register RHS = MI.getOperand(2).getReg(); 3256 3257 uint16_t Flags = MI.getFlags(); 3258 3259 LLT S16 = LLT::scalar(16); 3260 LLT S32 = LLT::scalar(32); 3261 3262 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3263 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3264 3265 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3266 .addUse(RHSExt.getReg(0)) 3267 .setMIFlags(Flags); 3268 3269 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3270 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3271 3272 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3273 .addUse(RDst.getReg(0)) 3274 .addUse(RHS) 3275 .addUse(LHS) 3276 .setMIFlags(Flags); 3277 3278 MI.eraseFromParent(); 3279 return true; 3280 } 3281 3282 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3283 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3284 static void toggleSPDenormMode(bool Enable, 3285 MachineIRBuilder &B, 3286 const GCNSubtarget &ST, 3287 AMDGPU::SIModeRegisterDefaults Mode) { 3288 // Set SP denorm mode to this value. 3289 unsigned SPDenormMode = 3290 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3291 3292 if (ST.hasDenormModeInst()) { 3293 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3294 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3295 3296 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3297 B.buildInstr(AMDGPU::S_DENORM_MODE) 3298 .addImm(NewDenormModeValue); 3299 3300 } else { 3301 // Select FP32 bit field in mode register. 
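// The encoded field is effectively hwreg(HW_REG_MODE, 4, 2): the two FP32
// denorm control bits sit at offset 4 of the MODE register, and WIDTH_M1
// holds the field width minus one.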
3302 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3303 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3304 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3305 3306 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3307 .addImm(SPDenormMode) 3308 .addImm(SPDenormModeBitField); 3309 } 3310 } 3311 3312 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3313 MachineRegisterInfo &MRI, 3314 MachineIRBuilder &B) const { 3315 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3316 return true; 3317 3318 Register Res = MI.getOperand(0).getReg(); 3319 Register LHS = MI.getOperand(1).getReg(); 3320 Register RHS = MI.getOperand(2).getReg(); 3321 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3322 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3323 3324 uint16_t Flags = MI.getFlags(); 3325 3326 LLT S32 = LLT::scalar(32); 3327 LLT S1 = LLT::scalar(1); 3328 3329 auto One = B.buildFConstant(S32, 1.0f); 3330 3331 auto DenominatorScaled = 3332 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3333 .addUse(LHS) 3334 .addUse(RHS) 3335 .addImm(0) 3336 .setMIFlags(Flags); 3337 auto NumeratorScaled = 3338 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3339 .addUse(LHS) 3340 .addUse(RHS) 3341 .addImm(1) 3342 .setMIFlags(Flags); 3343 3344 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3345 .addUse(DenominatorScaled.getReg(0)) 3346 .setMIFlags(Flags); 3347 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3348 3349 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3350 // aren't modeled as reading it. 3351 if (!Mode.allFP32Denormals()) 3352 toggleSPDenormMode(true, B, ST, Mode); 3353 3354 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3355 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3356 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3357 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3358 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3359 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3360 3361 if (!Mode.allFP32Denormals()) 3362 toggleSPDenormMode(false, B, ST, Mode); 3363 3364 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3365 .addUse(Fma4.getReg(0)) 3366 .addUse(Fma1.getReg(0)) 3367 .addUse(Fma3.getReg(0)) 3368 .addUse(NumeratorScaled.getReg(1)) 3369 .setMIFlags(Flags); 3370 3371 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3372 .addUse(Fmas.getReg(0)) 3373 .addUse(RHS) 3374 .addUse(LHS) 3375 .setMIFlags(Flags); 3376 3377 MI.eraseFromParent(); 3378 return true; 3379 } 3380 3381 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3382 MachineRegisterInfo &MRI, 3383 MachineIRBuilder &B) const { 3384 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3385 return true; 3386 3387 Register Res = MI.getOperand(0).getReg(); 3388 Register LHS = MI.getOperand(1).getReg(); 3389 Register RHS = MI.getOperand(2).getReg(); 3390 3391 uint16_t Flags = MI.getFlags(); 3392 3393 LLT S64 = LLT::scalar(64); 3394 LLT S1 = LLT::scalar(1); 3395 3396 auto One = B.buildFConstant(S64, 1.0); 3397 3398 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3399 .addUse(LHS) 3400 .addUse(RHS) 3401 .addImm(0) 3402 .setMIFlags(Flags); 3403 3404 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3405 3406 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3407 .addUse(DivScale0.getReg(0)) 3408 .setMIFlags(Flags); 3409 3410 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3411 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3412 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3413 3414 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3415 .addUse(LHS) 3416 .addUse(RHS) 3417 .addImm(1) 3418 .setMIFlags(Flags); 3419 3420 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3421 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3422 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3423 3424 Register Scale; 3425 if (!ST.hasUsableDivScaleConditionOutput()) { 3426 // Workaround a hardware bug on SI where the condition output from div_scale 3427 // is not usable. 3428 3429 LLT S32 = LLT::scalar(32); 3430 3431 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3432 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3433 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3434 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3435 3436 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3437 Scale1Unmerge.getReg(1)); 3438 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3439 Scale0Unmerge.getReg(1)); 3440 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3441 } else { 3442 Scale = DivScale1.getReg(1); 3443 } 3444 3445 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3446 .addUse(Fma4.getReg(0)) 3447 .addUse(Fma3.getReg(0)) 3448 .addUse(Mul.getReg(0)) 3449 .addUse(Scale) 3450 .setMIFlags(Flags); 3451 3452 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3453 .addUse(Fmas.getReg(0)) 3454 .addUse(RHS) 3455 .addUse(LHS) 3456 .setMIFlags(Flags); 3457 3458 MI.eraseFromParent(); 3459 return true; 3460 } 3461 3462 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3463 MachineRegisterInfo &MRI, 3464 MachineIRBuilder &B) const { 3465 Register Res = MI.getOperand(0).getReg(); 3466 Register LHS = MI.getOperand(2).getReg(); 3467 Register RHS = MI.getOperand(3).getReg(); 3468 uint16_t Flags = MI.getFlags(); 3469 3470 LLT S32 = LLT::scalar(32); 3471 LLT S1 = LLT::scalar(1); 3472 3473 auto Abs = B.buildFAbs(S32, RHS, Flags); 3474 const APFloat C0Val(1.0f); 3475 3476 auto C0 = B.buildConstant(S32, 0x6f800000); 3477 auto C1 = B.buildConstant(S32, 0x2f800000); 3478 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3479 3480 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3481 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3482 3483 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3484 3485 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3486 .addUse(Mul0.getReg(0)) 3487 .setMIFlags(Flags); 3488 3489 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3490 3491 B.buildFMul(Res, Sel, Mul1, Flags); 3492 3493 MI.eraseFromParent(); 3494 return true; 3495 } 3496 3497 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3498 // FIXME: Why do we handle this one but not other removed instructions? 3499 // 3500 // Reciprocal square root. The clamp prevents infinite results, clamping 3501 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3502 // +-max_float. 
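//
// A sketch of the expansion built below:
//   %rsq   = llvm.amdgcn.rsq(%src)
//   %low   = fminnum(%rsq, +max_float)   // fminnum_ieee when in IEEE mode
//   %clamp = fmaxnum(%low, -max_float)   // fmaxnum_ieee when in IEEE mode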
3503 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3504 MachineRegisterInfo &MRI, 3505 MachineIRBuilder &B) const { 3506 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3507 return true; 3508 3509 Register Dst = MI.getOperand(0).getReg(); 3510 Register Src = MI.getOperand(2).getReg(); 3511 auto Flags = MI.getFlags(); 3512 3513 LLT Ty = MRI.getType(Dst); 3514 3515 const fltSemantics *FltSemantics; 3516 if (Ty == LLT::scalar(32)) 3517 FltSemantics = &APFloat::IEEEsingle(); 3518 else if (Ty == LLT::scalar(64)) 3519 FltSemantics = &APFloat::IEEEdouble(); 3520 else 3521 return false; 3522 3523 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3524 .addUse(Src) 3525 .setMIFlags(Flags); 3526 3527 // We don't need to concern ourselves with the snan handling difference, since 3528 // the rsq quieted (or not) so use the one which will directly select. 3529 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3530 const bool UseIEEE = MFI->getMode().IEEE; 3531 3532 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3533 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3534 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3535 3536 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3537 3538 if (UseIEEE) 3539 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3540 else 3541 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3542 MI.eraseFromParent(); 3543 return true; 3544 } 3545 3546 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3547 switch (IID) { 3548 case Intrinsic::amdgcn_ds_fadd: 3549 return AMDGPU::G_ATOMICRMW_FADD; 3550 case Intrinsic::amdgcn_ds_fmin: 3551 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3552 case Intrinsic::amdgcn_ds_fmax: 3553 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3554 default: 3555 llvm_unreachable("not a DS FP intrinsic"); 3556 } 3557 } 3558 3559 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3560 MachineInstr &MI, 3561 Intrinsic::ID IID) const { 3562 GISelChangeObserver &Observer = Helper.Observer; 3563 Observer.changingInstr(MI); 3564 3565 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3566 3567 // The remaining operands were used to set fields in the MemOperand on 3568 // construction. 3569 for (int I = 6; I > 3; --I) 3570 MI.RemoveOperand(I); 3571 3572 MI.RemoveOperand(1); // Remove the intrinsic ID. 
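  // After dropping those operands and the intrinsic ID, the instruction has
  // the plain atomicrmw shape `dst = <atomic op> ptr, value`; the ordering,
  // scope and volatility are already captured by the MMO.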
3573 Observer.changedInstr(MI); 3574 return true; 3575 } 3576 3577 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3578 MachineRegisterInfo &MRI, 3579 MachineIRBuilder &B) const { 3580 uint64_t Offset = 3581 ST.getTargetLowering()->getImplicitParameterOffset( 3582 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3583 LLT DstTy = MRI.getType(DstReg); 3584 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3585 3586 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3587 if (!loadInputValue(KernargPtrReg, B, 3588 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3589 return false; 3590 3591 // FIXME: This should be nuw 3592 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3593 return true; 3594 } 3595 3596 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3597 MachineRegisterInfo &MRI, 3598 MachineIRBuilder &B) const { 3599 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3600 if (!MFI->isEntryFunction()) { 3601 return legalizePreloadedArgIntrin(MI, MRI, B, 3602 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3603 } 3604 3605 Register DstReg = MI.getOperand(0).getReg(); 3606 if (!getImplicitArgPtr(DstReg, MRI, B)) 3607 return false; 3608 3609 MI.eraseFromParent(); 3610 return true; 3611 } 3612 3613 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3614 MachineRegisterInfo &MRI, 3615 MachineIRBuilder &B, 3616 unsigned AddrSpace) const { 3617 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3618 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3619 Register Hi32 = Unmerge.getReg(1); 3620 3621 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3622 MI.eraseFromParent(); 3623 return true; 3624 } 3625 3626 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3627 // offset (the offset that is included in bounds checking and swizzling, to be 3628 // split between the instruction's voffset and immoffset fields) and soffset 3629 // (the offset that is excluded from bounds checking and swizzling, to go in 3630 // the instruction's soffset field). This function takes the first kind of 3631 // offset and figures out how to split it between voffset and immoffset. 3632 std::tuple<Register, unsigned, unsigned> 3633 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3634 Register OrigOffset) const { 3635 const unsigned MaxImm = 4095; 3636 Register BaseReg; 3637 unsigned TotalConstOffset; 3638 const LLT S32 = LLT::scalar(32); 3639 MachineRegisterInfo &MRI = *B.getMRI(); 3640 3641 std::tie(BaseReg, TotalConstOffset) = 3642 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3643 3644 unsigned ImmOffset = TotalConstOffset; 3645 3646 // If BaseReg is a pointer, convert it to int. 3647 if (MRI.getType(BaseReg).isPointer()) 3648 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3649 3650 // If the immediate value is too big for the immoffset field, put the value 3651 // and -4096 into the immoffset field so that the value that is copied/added 3652 // for the voffset field is a multiple of 4096, and it stands more chance 3653 // of being CSEd with the copy/add for another similar load/store. 3654 // However, do not do that rounding down to a multiple of 4096 if that is a 3655 // negative number, as it appears to be illegal to have a negative offset 3656 // in the vgpr, even if adding the immediate offset makes it positive. 
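  // Worked example (MaxImm == 4095):
  //   TotalConstOffset = 5000  -> Overflow = 4096, ImmOffset = 904
  //                               (4096 is added to the vgpr offset below).
  //   TotalConstOffset = -100  -> Overflow would be -4096, so the split is
  //                               undone: ImmOffset = 0 and the whole -100
  //                               goes into the vgpr offset.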
3657 unsigned Overflow = ImmOffset & ~MaxImm; 3658 ImmOffset -= Overflow; 3659 if ((int32_t)Overflow < 0) { 3660 Overflow += ImmOffset; 3661 ImmOffset = 0; 3662 } 3663 3664 if (Overflow != 0) { 3665 if (!BaseReg) { 3666 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3667 } else { 3668 auto OverflowVal = B.buildConstant(S32, Overflow); 3669 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3670 } 3671 } 3672 3673 if (!BaseReg) 3674 BaseReg = B.buildConstant(S32, 0).getReg(0); 3675 3676 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3677 } 3678 3679 /// Handle register layout difference for f16 images for some subtargets. 3680 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3681 MachineRegisterInfo &MRI, 3682 Register Reg, 3683 bool ImageStore) const { 3684 const LLT S16 = LLT::scalar(16); 3685 const LLT S32 = LLT::scalar(32); 3686 LLT StoreVT = MRI.getType(Reg); 3687 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3688 3689 if (ST.hasUnpackedD16VMem()) { 3690 auto Unmerge = B.buildUnmerge(S16, Reg); 3691 3692 SmallVector<Register, 4> WideRegs; 3693 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3694 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3695 3696 int NumElts = StoreVT.getNumElements(); 3697 3698 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3699 } 3700 3701 if (ImageStore && ST.hasImageStoreD16Bug()) { 3702 if (StoreVT.getNumElements() == 2) { 3703 SmallVector<Register, 4> PackedRegs; 3704 Reg = B.buildBitcast(S32, Reg).getReg(0); 3705 PackedRegs.push_back(Reg); 3706 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3707 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3708 } 3709 3710 if (StoreVT.getNumElements() == 3) { 3711 SmallVector<Register, 4> PackedRegs; 3712 auto Unmerge = B.buildUnmerge(S16, Reg); 3713 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3714 PackedRegs.push_back(Unmerge.getReg(I)); 3715 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3716 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3717 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3718 } 3719 3720 if (StoreVT.getNumElements() == 4) { 3721 SmallVector<Register, 4> PackedRegs; 3722 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3723 auto Unmerge = B.buildUnmerge(S32, Reg); 3724 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3725 PackedRegs.push_back(Unmerge.getReg(I)); 3726 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3727 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3728 } 3729 3730 llvm_unreachable("invalid data type"); 3731 } 3732 3733 return Reg; 3734 } 3735 3736 Register AMDGPULegalizerInfo::fixStoreSourceType( 3737 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3738 MachineRegisterInfo *MRI = B.getMRI(); 3739 LLT Ty = MRI->getType(VData); 3740 3741 const LLT S16 = LLT::scalar(16); 3742 3743 // Fixup illegal register types for i8 stores. 
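  // Sub-dword scalars (s8 and s16) are any-extended to s32 here; the buffer
  // byte/short store instructions only consume the low bits, so the extended
  // high bits are don't-care.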
3744 if (Ty == LLT::scalar(8) || Ty == S16) { 3745 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3746 return AnyExt; 3747 } 3748 3749 if (Ty.isVector()) { 3750 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3751 if (IsFormat) 3752 return handleD16VData(B, *MRI, VData); 3753 } 3754 } 3755 3756 return VData; 3757 } 3758 3759 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3760 MachineRegisterInfo &MRI, 3761 MachineIRBuilder &B, 3762 bool IsTyped, 3763 bool IsFormat) const { 3764 Register VData = MI.getOperand(1).getReg(); 3765 LLT Ty = MRI.getType(VData); 3766 LLT EltTy = Ty.getScalarType(); 3767 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3768 const LLT S32 = LLT::scalar(32); 3769 3770 VData = fixStoreSourceType(B, VData, IsFormat); 3771 Register RSrc = MI.getOperand(2).getReg(); 3772 3773 MachineMemOperand *MMO = *MI.memoperands_begin(); 3774 const int MemSize = MMO->getSize(); 3775 3776 unsigned ImmOffset; 3777 unsigned TotalOffset; 3778 3779 // The typed intrinsics add an immediate after the registers. 3780 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3781 3782 // The struct intrinsic variants add one additional operand over raw. 3783 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3784 Register VIndex; 3785 int OpOffset = 0; 3786 if (HasVIndex) { 3787 VIndex = MI.getOperand(3).getReg(); 3788 OpOffset = 1; 3789 } 3790 3791 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3792 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3793 3794 unsigned Format = 0; 3795 if (IsTyped) { 3796 Format = MI.getOperand(5 + OpOffset).getImm(); 3797 ++OpOffset; 3798 } 3799 3800 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3801 3802 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3803 if (TotalOffset != 0) 3804 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3805 3806 unsigned Opc; 3807 if (IsTyped) { 3808 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3809 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3810 } else if (IsFormat) { 3811 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3812 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3813 } else { 3814 switch (MemSize) { 3815 case 1: 3816 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3817 break; 3818 case 2: 3819 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3820 break; 3821 default: 3822 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3823 break; 3824 } 3825 } 3826 3827 if (!VIndex) 3828 VIndex = B.buildConstant(S32, 0).getReg(0); 3829 3830 auto MIB = B.buildInstr(Opc) 3831 .addUse(VData) // vdata 3832 .addUse(RSrc) // rsrc 3833 .addUse(VIndex) // vindex 3834 .addUse(VOffset) // voffset 3835 .addUse(SOffset) // soffset 3836 .addImm(ImmOffset); // offset(imm) 3837 3838 if (IsTyped) 3839 MIB.addImm(Format); 3840 3841 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3842 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3843 .addMemOperand(MMO); 3844 3845 MI.eraseFromParent(); 3846 return true; 3847 } 3848 3849 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3850 MachineRegisterInfo &MRI, 3851 MachineIRBuilder &B, 3852 bool IsFormat, 3853 bool IsTyped) const { 3854 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
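  // Operand layout decoded below: (dst, intrinsic-id, rsrc, [vindex],
  // voffset, soffset, [format], aux), where vindex is only present for the
  // struct variants and the format immediate only for the typed (tbuffer)
  // variants.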
3855 MachineMemOperand *MMO = *MI.memoperands_begin(); 3856 const int MemSize = MMO->getSize(); 3857 const LLT S32 = LLT::scalar(32); 3858 3859 Register Dst = MI.getOperand(0).getReg(); 3860 Register RSrc = MI.getOperand(2).getReg(); 3861 3862 // The typed intrinsics add an immediate after the registers. 3863 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3864 3865 // The struct intrinsic variants add one additional operand over raw. 3866 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3867 Register VIndex; 3868 int OpOffset = 0; 3869 if (HasVIndex) { 3870 VIndex = MI.getOperand(3).getReg(); 3871 OpOffset = 1; 3872 } 3873 3874 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3875 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3876 3877 unsigned Format = 0; 3878 if (IsTyped) { 3879 Format = MI.getOperand(5 + OpOffset).getImm(); 3880 ++OpOffset; 3881 } 3882 3883 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3884 unsigned ImmOffset; 3885 unsigned TotalOffset; 3886 3887 LLT Ty = MRI.getType(Dst); 3888 LLT EltTy = Ty.getScalarType(); 3889 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3890 const bool Unpacked = ST.hasUnpackedD16VMem(); 3891 3892 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3893 if (TotalOffset != 0) 3894 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3895 3896 unsigned Opc; 3897 3898 if (IsTyped) { 3899 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3900 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3901 } else if (IsFormat) { 3902 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3903 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3904 } else { 3905 switch (MemSize) { 3906 case 1: 3907 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3908 break; 3909 case 2: 3910 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3911 break; 3912 default: 3913 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3914 break; 3915 } 3916 } 3917 3918 Register LoadDstReg; 3919 3920 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3921 LLT UnpackedTy = Ty.changeElementSize(32); 3922 3923 if (IsExtLoad) 3924 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3925 else if (Unpacked && IsD16 && Ty.isVector()) 3926 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3927 else 3928 LoadDstReg = Dst; 3929 3930 if (!VIndex) 3931 VIndex = B.buildConstant(S32, 0).getReg(0); 3932 3933 auto MIB = B.buildInstr(Opc) 3934 .addDef(LoadDstReg) // vdata 3935 .addUse(RSrc) // rsrc 3936 .addUse(VIndex) // vindex 3937 .addUse(VOffset) // voffset 3938 .addUse(SOffset) // soffset 3939 .addImm(ImmOffset); // offset(imm) 3940 3941 if (IsTyped) 3942 MIB.addImm(Format); 3943 3944 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3945 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3946 .addMemOperand(MMO); 3947 3948 if (LoadDstReg != Dst) { 3949 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3950 3951 // Widen result for extending loads was widened. 
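    // That is: extending loads produced an s32 result that is truncated back
    // to the requested narrow type; unpacked d16 vector results are instead
    // repacked element by element in the else branch.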
3952 if (IsExtLoad) 3953 B.buildTrunc(Dst, LoadDstReg); 3954 else { 3955 // Repack to original 16-bit vector result 3956 // FIXME: G_TRUNC should work, but legalization currently fails 3957 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3958 SmallVector<Register, 4> Repack; 3959 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3960 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3961 B.buildMerge(Dst, Repack); 3962 } 3963 } 3964 3965 MI.eraseFromParent(); 3966 return true; 3967 } 3968 3969 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3970 MachineIRBuilder &B, 3971 bool IsInc) const { 3972 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3973 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3974 B.buildInstr(Opc) 3975 .addDef(MI.getOperand(0).getReg()) 3976 .addUse(MI.getOperand(2).getReg()) 3977 .addUse(MI.getOperand(3).getReg()) 3978 .cloneMemRefs(MI); 3979 MI.eraseFromParent(); 3980 return true; 3981 } 3982 3983 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3984 switch (IntrID) { 3985 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3986 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3987 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3988 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3989 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3990 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3991 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3992 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3993 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3994 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3995 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3996 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3997 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3998 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3999 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4000 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4001 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4002 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4003 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4004 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4005 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4006 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4007 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4008 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4009 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4010 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4011 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4012 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4013 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4014 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4015 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4016 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4017 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4018 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4019 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4020 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4021 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4022 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4023 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4024 case Intrinsic::amdgcn_buffer_atomic_fadd: 4025 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4026 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4027 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4028 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4029 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4030 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4031 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4032 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4033 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4034 default: 4035 llvm_unreachable("unhandled atomic opcode"); 4036 } 4037 } 4038 4039 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4040 MachineIRBuilder &B, 4041 Intrinsic::ID IID) const { 4042 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4043 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4044 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4045 4046 Register Dst; 4047 4048 int OpOffset = 0; 4049 if (HasReturn) { 4050 // A few FP atomics do not support return values. 4051 Dst = MI.getOperand(0).getReg(); 4052 } else { 4053 OpOffset = -1; 4054 } 4055 4056 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4057 Register CmpVal; 4058 4059 if (IsCmpSwap) { 4060 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4061 ++OpOffset; 4062 } 4063 4064 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4065 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4066 4067 // The struct intrinsic variants add one additional operand over raw. 4068 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4069 Register VIndex; 4070 if (HasVIndex) { 4071 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4072 ++OpOffset; 4073 } 4074 4075 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4076 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4077 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4078 4079 MachineMemOperand *MMO = *MI.memoperands_begin(); 4080 4081 unsigned ImmOffset; 4082 unsigned TotalOffset; 4083 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4084 if (TotalOffset != 0) 4085 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4086 4087 if (!VIndex) 4088 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4089 4090 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4091 4092 if (HasReturn) 4093 MIB.addDef(Dst); 4094 4095 MIB.addUse(VData); // vdata 4096 4097 if (IsCmpSwap) 4098 MIB.addReg(CmpVal); 4099 4100 MIB.addUse(RSrc) // rsrc 4101 .addUse(VIndex) // vindex 4102 .addUse(VOffset) // voffset 4103 .addUse(SOffset) // soffset 4104 .addImm(ImmOffset) // offset(imm) 4105 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4106 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4107 .addMemOperand(MMO); 4108 4109 MI.eraseFromParent(); 4110 return true; 4111 } 4112 4113 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4114 /// vector with s16 typed elements. 4115 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4116 SmallVectorImpl<Register> &PackedAddrs, 4117 unsigned ArgOffset, 4118 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4119 bool IsA16, bool IsG16) { 4120 const LLT S16 = LLT::scalar(16); 4121 const LLT V2S16 = LLT::vector(2, 16); 4122 auto EndIdx = Intr->VAddrEnd; 4123 4124 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4125 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4126 if (!SrcOp.isReg()) 4127 continue; // _L to _LZ may have eliminated this. 
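    // Three cases follow: operands before the gradients are already 32 bits
    // wide and are simply bitcast to v2s16; gradients/coordinates that remain
    // 32-bit (no G16/A16 in effect for them) are passed through unpacked;
    // 16-bit gradients and coordinates are packed pairwise into v2s16, with
    // undef padding when a group has an odd element count.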
4128
4129 Register AddrReg = SrcOp.getReg();
4130
4131 if (I < Intr->GradientStart) {
4132 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4133 PackedAddrs.push_back(AddrReg);
4134 } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4135 (I >= Intr->CoordStart && !IsA16)) {
4136 // Handle any gradient or coordinate operands that should not be packed
4137 PackedAddrs.push_back(AddrReg);
4138 } else {
4139 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4140 // derivatives dx/dh and dx/dv are packed with undef.
4141 if (((I + 1) >= EndIdx) ||
4142 ((Intr->NumGradients / 2) % 2 == 1 &&
4143 (I == static_cast<unsigned>(Intr->GradientStart +
4144 (Intr->NumGradients / 2) - 1) ||
4145 I == static_cast<unsigned>(Intr->GradientStart +
4146 Intr->NumGradients - 1))) ||
4147 // Check for _L to _LZ optimization
4148 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4149 PackedAddrs.push_back(
4150 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4151 .getReg(0));
4152 } else {
4153 PackedAddrs.push_back(
4154 B.buildBuildVector(
4155 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4156 .getReg(0));
4157 ++I;
4158 }
4159 }
4160 }
4161 }
4162
4163 /// Convert from separate vaddr components to a single vector address register,
4164 /// and replace the remaining operands with $noreg.
4165 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4166 int DimIdx, int NumVAddrs) {
4167 const LLT S32 = LLT::scalar(32);
4168
4169 SmallVector<Register, 8> AddrRegs;
4170 for (int I = 0; I != NumVAddrs; ++I) {
4171 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4172 if (SrcOp.isReg()) {
4173 AddrRegs.push_back(SrcOp.getReg());
4174 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4175 }
4176 }
4177
4178 int NumAddrRegs = AddrRegs.size();
4179 if (NumAddrRegs != 1) {
4180 // Round up to 8 elements for v5-v7
4181 // FIXME: Missing intermediate sized register classes and instructions.
4182 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4183 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4184 auto Undef = B.buildUndef(S32);
4185 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4186 NumAddrRegs = RoundedNumRegs;
4187 }
4188
4189 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4190 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4191 }
4192
4193 for (int I = 1; I != NumVAddrs; ++I) {
4194 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4195 if (SrcOp.isReg())
4196 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4197 }
4198 }
4199
4200 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4201 ///
4202 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4203 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4204 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4205 /// registers.
4206 ///
4207 /// We don't want to directly select image instructions just yet, but also want
4208 /// to expose all register repacking to the legalizer/combiners. We also don't
4209 /// want a selected instruction entering RegBankSelect. In order to avoid
4210 /// defining a multitude of intermediate image instructions, directly hack on
4211 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4212 /// now unnecessary arguments with $noreg.
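///
/// In outline: the intrinsic is retagged as G_AMDGPU_INTRIN_IMAGE_LOAD/STORE,
/// constant-zero lod forms are folded to their _lz / non-mip variants, 16-bit
/// addresses are packed into dwords (or all vaddr components merged into one
/// vector register when NSA is not used), and d16 / TFE results are repacked
/// into the register layout the subtarget expects.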
4213 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4214 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4215 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4216 4217 const unsigned NumDefs = MI.getNumExplicitDefs(); 4218 const unsigned ArgOffset = NumDefs + 1; 4219 bool IsTFE = NumDefs == 2; 4220 // We are only processing the operands of d16 image operations on subtargets 4221 // that use the unpacked register layout, or need to repack the TFE result. 4222 4223 // TODO: Do we need to guard against already legalized intrinsics? 4224 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4225 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4226 4227 MachineRegisterInfo *MRI = B.getMRI(); 4228 const LLT S32 = LLT::scalar(32); 4229 const LLT S16 = LLT::scalar(16); 4230 const LLT V2S16 = LLT::vector(2, 16); 4231 4232 unsigned DMask = 0; 4233 4234 // Check for 16 bit addresses and pack if true. 4235 LLT GradTy = 4236 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4237 LLT AddrTy = 4238 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4239 const bool IsG16 = GradTy == S16; 4240 const bool IsA16 = AddrTy == S16; 4241 4242 int DMaskLanes = 0; 4243 if (!BaseOpcode->Atomic) { 4244 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4245 if (BaseOpcode->Gather4) { 4246 DMaskLanes = 4; 4247 } else if (DMask != 0) { 4248 DMaskLanes = countPopulation(DMask); 4249 } else if (!IsTFE && !BaseOpcode->Store) { 4250 // If dmask is 0, this is a no-op load. This can be eliminated. 4251 B.buildUndef(MI.getOperand(0)); 4252 MI.eraseFromParent(); 4253 return true; 4254 } 4255 } 4256 4257 Observer.changingInstr(MI); 4258 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4259 4260 unsigned NewOpcode = NumDefs == 0 ? 4261 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4262 4263 // Track that we legalized this 4264 MI.setDesc(B.getTII().get(NewOpcode)); 4265 4266 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4267 // dmask to be at least 1 otherwise the instruction will fail 4268 if (IsTFE && DMask == 0) { 4269 DMask = 0x1; 4270 DMaskLanes = 1; 4271 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4272 } 4273 4274 if (BaseOpcode->Atomic) { 4275 Register VData0 = MI.getOperand(2).getReg(); 4276 LLT Ty = MRI->getType(VData0); 4277 4278 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4279 if (Ty.isVector()) 4280 return false; 4281 4282 if (BaseOpcode->AtomicX2) { 4283 Register VData1 = MI.getOperand(3).getReg(); 4284 // The two values are packed in one register. 4285 LLT PackedTy = LLT::vector(2, Ty); 4286 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4287 MI.getOperand(2).setReg(Concat.getReg(0)); 4288 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4289 } 4290 } 4291 4292 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4293 4294 // Optimize _L to _LZ when _L is zero 4295 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4296 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4297 const ConstantFP *ConstantLod; 4298 4299 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4300 m_GFCst(ConstantLod))) { 4301 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4302 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
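        // e.g. image_sample_l with lod == 0 becomes image_sample_lz, and the
        // now-redundant lod operand is removed below.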
4303 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4304 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4305 Intr->Dim); 4306 4307 // The starting indexes should remain in the same place. 4308 --CorrectedNumVAddrs; 4309 4310 MI.getOperand(MI.getNumExplicitDefs()) 4311 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4312 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4313 Intr = NewImageDimIntr; 4314 } 4315 } 4316 } 4317 4318 // Optimize _mip away, when 'lod' is zero 4319 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4320 int64_t ConstantLod; 4321 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4322 m_ICst(ConstantLod))) { 4323 if (ConstantLod == 0) { 4324 // TODO: Change intrinsic opcode and remove operand instead or replacing 4325 // it with 0, as the _L to _LZ handling is done above. 4326 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4327 --CorrectedNumVAddrs; 4328 } 4329 } 4330 } 4331 4332 // Rewrite the addressing register layout before doing anything else. 4333 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4334 // 16 bit gradients are supported, but are tied to the A16 control 4335 // so both gradients and addresses must be 16 bit 4336 return false; 4337 } 4338 4339 if (IsA16 && !ST.hasA16()) { 4340 // A16 not supported 4341 return false; 4342 } 4343 4344 if (IsA16 || IsG16) { 4345 if (Intr->NumVAddrs > 1) { 4346 SmallVector<Register, 4> PackedRegs; 4347 4348 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4349 IsG16); 4350 4351 // See also below in the non-a16 branch 4352 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4353 4354 if (!UseNSA && PackedRegs.size() > 1) { 4355 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4356 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4357 PackedRegs[0] = Concat.getReg(0); 4358 PackedRegs.resize(1); 4359 } 4360 4361 const unsigned NumPacked = PackedRegs.size(); 4362 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4363 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4364 if (!SrcOp.isReg()) { 4365 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4366 continue; 4367 } 4368 4369 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4370 4371 if (I - Intr->VAddrStart < NumPacked) 4372 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4373 else 4374 SrcOp.setReg(AMDGPU::NoRegister); 4375 } 4376 } 4377 } else { 4378 // If the register allocator cannot place the address registers contiguously 4379 // without introducing moves, then using the non-sequential address encoding 4380 // is always preferable, since it saves VALU instructions and is usually a 4381 // wash in terms of code size or even better. 4382 // 4383 // However, we currently have no way of hinting to the register allocator 4384 // that MIMG addresses should be placed contiguously when it is possible to 4385 // do so, so force non-NSA for the common 2-address case as a heuristic. 4386 // 4387 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4388 // allocation when possible. 4389 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4390 4391 if (!UseNSA && Intr->NumVAddrs > 1) 4392 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4393 Intr->NumVAddrs); 4394 } 4395 4396 int Flags = 0; 4397 if (IsA16) 4398 Flags |= 1; 4399 if (IsG16) 4400 Flags |= 2; 4401 MI.addOperand(MachineOperand::CreateImm(Flags)); 4402 4403 if (BaseOpcode->Store) { // No TFE for stores? 
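    // For stores, only the data operand may need rewriting: 16-bit vector
    // data is repacked by handleD16VData() on subtargets with unpacked d16
    // memory or the image-store d16 bug.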
4404 // TODO: Handle dmask trim 4405 Register VData = MI.getOperand(1).getReg(); 4406 LLT Ty = MRI->getType(VData); 4407 if (!Ty.isVector() || Ty.getElementType() != S16) 4408 return true; 4409 4410 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4411 if (RepackedReg != VData) { 4412 MI.getOperand(1).setReg(RepackedReg); 4413 } 4414 4415 return true; 4416 } 4417 4418 Register DstReg = MI.getOperand(0).getReg(); 4419 LLT Ty = MRI->getType(DstReg); 4420 const LLT EltTy = Ty.getScalarType(); 4421 const bool IsD16 = Ty.getScalarType() == S16; 4422 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4423 4424 // Confirm that the return type is large enough for the dmask specified 4425 if (NumElts < DMaskLanes) 4426 return false; 4427 4428 if (NumElts > 4 || DMaskLanes > 4) 4429 return false; 4430 4431 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4432 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4433 4434 // The raw dword aligned data component of the load. The only legal cases 4435 // where this matters should be when using the packed D16 format, for 4436 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4437 LLT RoundedTy; 4438 4439 // S32 vector to to cover all data, plus TFE result element. 4440 LLT TFETy; 4441 4442 // Register type to use for each loaded component. Will be S32 or V2S16. 4443 LLT RegTy; 4444 4445 if (IsD16 && ST.hasUnpackedD16VMem()) { 4446 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4447 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4448 RegTy = S32; 4449 } else { 4450 unsigned EltSize = EltTy.getSizeInBits(); 4451 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4452 unsigned RoundedSize = 32 * RoundedElts; 4453 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4454 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4455 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4456 } 4457 4458 // The return type does not need adjustment. 4459 // TODO: Should we change s16 case to s32 or <2 x s16>? 4460 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4461 return true; 4462 4463 Register Dst1Reg; 4464 4465 // Insert after the instruction. 4466 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4467 4468 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4469 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4470 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4471 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4472 4473 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4474 4475 MI.getOperand(0).setReg(NewResultReg); 4476 4477 // In the IR, TFE is supposed to be used with a 2 element struct return 4478 // type. The intruction really returns these two values in one contiguous 4479 // register, with one additional dword beyond the loaded data. Rewrite the 4480 // return type to use a single register result. 4481 4482 if (IsTFE) { 4483 Dst1Reg = MI.getOperand(1).getReg(); 4484 if (MRI->getType(Dst1Reg) != S32) 4485 return false; 4486 4487 // TODO: Make sure the TFE operand bit is set. 4488 MI.RemoveOperand(1); 4489 4490 // Handle the easy case that requires no repack instructions. 4491 if (Ty == S32) { 4492 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4493 return true; 4494 } 4495 } 4496 4497 // Now figure out how to copy the new result register back into the old 4498 // result. 4499 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4500 4501 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4502 4503 if (ResultNumRegs == 1) { 4504 assert(!IsTFE); 4505 ResultRegs[0] = NewResultReg; 4506 } else { 4507 // We have to repack into a new vector of some kind. 4508 for (int I = 0; I != NumDataRegs; ++I) 4509 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4510 B.buildUnmerge(ResultRegs, NewResultReg); 4511 4512 // Drop the final TFE element to get the data part. The TFE result is 4513 // directly written to the right place already. 4514 if (IsTFE) 4515 ResultRegs.resize(NumDataRegs); 4516 } 4517 4518 // For an s16 scalar result, we form an s32 result with a truncate regardless 4519 // of packed vs. unpacked. 4520 if (IsD16 && !Ty.isVector()) { 4521 B.buildTrunc(DstReg, ResultRegs[0]); 4522 return true; 4523 } 4524 4525 // Avoid a build/concat_vector of 1 entry. 4526 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4527 B.buildBitcast(DstReg, ResultRegs[0]); 4528 return true; 4529 } 4530 4531 assert(Ty.isVector()); 4532 4533 if (IsD16) { 4534 // For packed D16 results with TFE enabled, all the data components are 4535 // S32. Cast back to the expected type. 4536 // 4537 // TODO: We don't really need to use load s32 elements. We would only need one 4538 // cast for the TFE result if a multiple of v2s16 was used. 4539 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4540 for (Register &Reg : ResultRegs) 4541 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4542 } else if (ST.hasUnpackedD16VMem()) { 4543 for (Register &Reg : ResultRegs) 4544 Reg = B.buildTrunc(S16, Reg).getReg(0); 4545 } 4546 } 4547 4548 auto padWithUndef = [&](LLT Ty, int NumElts) { 4549 if (NumElts == 0) 4550 return; 4551 Register Undef = B.buildUndef(Ty).getReg(0); 4552 for (int I = 0; I != NumElts; ++I) 4553 ResultRegs.push_back(Undef); 4554 }; 4555 4556 // Pad out any elements eliminated due to the dmask. 4557 LLT ResTy = MRI->getType(ResultRegs[0]); 4558 if (!ResTy.isVector()) { 4559 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4560 B.buildBuildVector(DstReg, ResultRegs); 4561 return true; 4562 } 4563 4564 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4565 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4566 4567 // Deal with the one annoying legal case. 4568 const LLT V3S16 = LLT::vector(3, 16); 4569 if (Ty == V3S16) { 4570 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4571 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4572 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4573 return true; 4574 } 4575 4576 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4577 B.buildConcatVectors(DstReg, ResultRegs); 4578 return true; 4579 } 4580 4581 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4582 LegalizerHelper &Helper, MachineInstr &MI) const { 4583 MachineIRBuilder &B = Helper.MIRBuilder; 4584 GISelChangeObserver &Observer = Helper.Observer; 4585 4586 Register Dst = MI.getOperand(0).getReg(); 4587 LLT Ty = B.getMRI()->getType(Dst); 4588 unsigned Size = Ty.getSizeInBits(); 4589 MachineFunction &MF = B.getMF(); 4590 4591 Observer.changingInstr(MI); 4592 4593 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4594 Ty = getBitcastRegisterType(Ty); 4595 Helper.bitcastDst(MI, Ty, 0); 4596 Dst = MI.getOperand(0).getReg(); 4597 B.setInsertPt(B.getMBB(), MI); 4598 } 4599 4600 // FIXME: We don't really need this intermediate instruction. The intrinsic 4601 // should be fixed to have a memory operand. Since it's readnone, we're not 4602 // allowed to add one. 
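  // The load is retagged as G_AMDGPU_S_BUFFER_LOAD with a synthesized
  // invariant, dereferenceable MMO; non-power-of-two result sizes (e.g. s96)
  // are then widened to the next power of two below, since there is no 96-bit
  // scalar load result.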
4603 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4604 MI.RemoveOperand(1); // Remove intrinsic ID 4605 4606 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4607 // TODO: Should this use datalayout alignment? 4608 const unsigned MemSize = (Size + 7) / 8; 4609 const Align MemAlign(4); 4610 MachineMemOperand *MMO = MF.getMachineMemOperand( 4611 MachinePointerInfo(), 4612 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4613 MachineMemOperand::MOInvariant, 4614 MemSize, MemAlign); 4615 MI.addMemOperand(MF, MMO); 4616 4617 // There are no 96-bit result scalar loads, but widening to 128-bit should 4618 // always be legal. We may need to restore this to a 96-bit result if it turns 4619 // out this needs to be converted to a vector load during RegBankSelect. 4620 if (!isPowerOf2_32(Size)) { 4621 if (Ty.isVector()) 4622 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4623 else 4624 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4625 } 4626 4627 Observer.changedInstr(MI); 4628 return true; 4629 } 4630 4631 // TODO: Move to selection 4632 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4633 MachineRegisterInfo &MRI, 4634 MachineIRBuilder &B) const { 4635 if (!ST.isTrapHandlerEnabled() || 4636 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4637 return legalizeTrapEndpgm(MI, MRI, B); 4638 4639 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4640 switch (*HsaAbiVer) { 4641 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4642 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4643 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4644 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4645 return ST.supportsGetDoorbellID() ? 4646 legalizeTrapHsa(MI, MRI, B) : 4647 legalizeTrapHsaQueuePtr(MI, MRI, B); 4648 } 4649 } 4650 4651 llvm_unreachable("Unknown trap handler"); 4652 } 4653 4654 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4655 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4656 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4657 MI.eraseFromParent(); 4658 return true; 4659 } 4660 4661 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4662 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4663 // Pass queue pointer to trap handler as input, and insert trap instruction 4664 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4665 Register LiveIn = 4666 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4667 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4668 return false; 4669 4670 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4671 B.buildCopy(SGPR01, LiveIn); 4672 B.buildInstr(AMDGPU::S_TRAP) 4673 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4674 .addReg(SGPR01, RegState::Implicit); 4675 4676 MI.eraseFromParent(); 4677 return true; 4678 } 4679 4680 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4681 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4682 B.buildInstr(AMDGPU::S_TRAP) 4683 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4684 MI.eraseFromParent(); 4685 return true; 4686 } 4687 4688 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4689 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4690 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4691 // accordingly 4692 if (!ST.isTrapHandlerEnabled() || 4693 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4694 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4695 "debugtrap handler not supported", 4696 MI.getDebugLoc(), DS_Warning); 4697 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4698 Ctx.diagnose(NoTrap); 4699 } else { 4700 // Insert debug-trap instruction 4701 B.buildInstr(AMDGPU::S_TRAP) 4702 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4703 } 4704 4705 MI.eraseFromParent(); 4706 return true; 4707 } 4708 4709 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4710 MachineIRBuilder &B) const { 4711 MachineRegisterInfo &MRI = *B.getMRI(); 4712 const LLT S16 = LLT::scalar(16); 4713 const LLT S32 = LLT::scalar(32); 4714 4715 Register DstReg = MI.getOperand(0).getReg(); 4716 Register NodePtr = MI.getOperand(2).getReg(); 4717 Register RayExtent = MI.getOperand(3).getReg(); 4718 Register RayOrigin = MI.getOperand(4).getReg(); 4719 Register RayDir = MI.getOperand(5).getReg(); 4720 Register RayInvDir = MI.getOperand(6).getReg(); 4721 Register TDescr = MI.getOperand(7).getReg(); 4722 4723 if (!ST.hasGFX10_AEncoding()) { 4724 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4725 "intrinsic not supported on subtarget", 4726 MI.getDebugLoc()); 4727 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4728 return false; 4729 } 4730 4731 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4732 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4733 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4734 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4735 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4736 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4737 4738 SmallVector<Register, 12> Ops; 4739 if (Is64) { 4740 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4741 Ops.push_back(Unmerge.getReg(0)); 4742 Ops.push_back(Unmerge.getReg(1)); 4743 } else { 4744 Ops.push_back(NodePtr); 4745 } 4746 Ops.push_back(RayExtent); 4747 4748 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4749 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4750 Ops.push_back(Unmerge.getReg(0)); 4751 Ops.push_back(Unmerge.getReg(1)); 4752 Ops.push_back(Unmerge.getReg(2)); 4753 }; 4754 4755 packLanes(RayOrigin); 4756 if (IsA16) { 4757 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4758 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4759 Register R1 = MRI.createGenericVirtualRegister(S32); 4760 Register R2 = MRI.createGenericVirtualRegister(S32); 4761 Register R3 = MRI.createGenericVirtualRegister(S32); 4762 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4763 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4764 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4765 Ops.push_back(R1); 4766 Ops.push_back(R2); 4767 Ops.push_back(R3); 4768 } else { 4769 packLanes(RayDir); 4770 packLanes(RayInvDir); 4771 } 4772 4773 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4774 .addDef(DstReg) 4775 .addImm(Opcode); 4776 4777 for (Register R : Ops) { 4778 MIB.addUse(R); 4779 } 4780 4781 MIB.addUse(TDescr) 4782 .addImm(IsA16 ? 
1 : 0) 4783 .cloneMemRefs(MI); 4784 4785 MI.eraseFromParent(); 4786 return true; 4787 } 4788 4789 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4790 MachineInstr &MI) const { 4791 MachineIRBuilder &B = Helper.MIRBuilder; 4792 MachineRegisterInfo &MRI = *B.getMRI(); 4793 4794 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4795 auto IntrID = MI.getIntrinsicID(); 4796 switch (IntrID) { 4797 case Intrinsic::amdgcn_if: 4798 case Intrinsic::amdgcn_else: { 4799 MachineInstr *Br = nullptr; 4800 MachineBasicBlock *UncondBrTarget = nullptr; 4801 bool Negated = false; 4802 if (MachineInstr *BrCond = 4803 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4804 const SIRegisterInfo *TRI 4805 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4806 4807 Register Def = MI.getOperand(1).getReg(); 4808 Register Use = MI.getOperand(3).getReg(); 4809 4810 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4811 4812 if (Negated) 4813 std::swap(CondBrTarget, UncondBrTarget); 4814 4815 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4816 if (IntrID == Intrinsic::amdgcn_if) { 4817 B.buildInstr(AMDGPU::SI_IF) 4818 .addDef(Def) 4819 .addUse(Use) 4820 .addMBB(UncondBrTarget); 4821 } else { 4822 B.buildInstr(AMDGPU::SI_ELSE) 4823 .addDef(Def) 4824 .addUse(Use) 4825 .addMBB(UncondBrTarget); 4826 } 4827 4828 if (Br) { 4829 Br->getOperand(0).setMBB(CondBrTarget); 4830 } else { 4831 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4832 // since we're swapping branch targets it needs to be reinserted. 4833 // FIXME: IRTranslator should probably not do this 4834 B.buildBr(*CondBrTarget); 4835 } 4836 4837 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4838 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4839 MI.eraseFromParent(); 4840 BrCond->eraseFromParent(); 4841 return true; 4842 } 4843 4844 return false; 4845 } 4846 case Intrinsic::amdgcn_loop: { 4847 MachineInstr *Br = nullptr; 4848 MachineBasicBlock *UncondBrTarget = nullptr; 4849 bool Negated = false; 4850 if (MachineInstr *BrCond = 4851 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4852 const SIRegisterInfo *TRI 4853 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4854 4855 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4856 Register Reg = MI.getOperand(2).getReg(); 4857 4858 if (Negated) 4859 std::swap(CondBrTarget, UncondBrTarget); 4860 4861 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4862 B.buildInstr(AMDGPU::SI_LOOP) 4863 .addUse(Reg) 4864 .addMBB(UncondBrTarget); 4865 4866 if (Br) 4867 Br->getOperand(0).setMBB(CondBrTarget); 4868 else 4869 B.buildBr(*CondBrTarget); 4870 4871 MI.eraseFromParent(); 4872 BrCond->eraseFromParent(); 4873 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4874 return true; 4875 } 4876 4877 return false; 4878 } 4879 case Intrinsic::amdgcn_kernarg_segment_ptr: 4880 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4881 // This only makes sense to call in a kernel, so just lower to null. 
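      // Non-kernel functions have no kernarg segment to point at, so the
      // result is folded to a null constant rather than read from a
      // preloaded SGPR.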
4882 B.buildConstant(MI.getOperand(0).getReg(), 0); 4883 MI.eraseFromParent(); 4884 return true; 4885 } 4886 4887 return legalizePreloadedArgIntrin( 4888 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4889 case Intrinsic::amdgcn_implicitarg_ptr: 4890 return legalizeImplicitArgPtr(MI, MRI, B); 4891 case Intrinsic::amdgcn_workitem_id_x: 4892 return legalizePreloadedArgIntrin(MI, MRI, B, 4893 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4894 case Intrinsic::amdgcn_workitem_id_y: 4895 return legalizePreloadedArgIntrin(MI, MRI, B, 4896 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4897 case Intrinsic::amdgcn_workitem_id_z: 4898 return legalizePreloadedArgIntrin(MI, MRI, B, 4899 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4900 case Intrinsic::amdgcn_workgroup_id_x: 4901 return legalizePreloadedArgIntrin(MI, MRI, B, 4902 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4903 case Intrinsic::amdgcn_workgroup_id_y: 4904 return legalizePreloadedArgIntrin(MI, MRI, B, 4905 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4906 case Intrinsic::amdgcn_workgroup_id_z: 4907 return legalizePreloadedArgIntrin(MI, MRI, B, 4908 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4909 case Intrinsic::amdgcn_dispatch_ptr: 4910 return legalizePreloadedArgIntrin(MI, MRI, B, 4911 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4912 case Intrinsic::amdgcn_queue_ptr: 4913 return legalizePreloadedArgIntrin(MI, MRI, B, 4914 AMDGPUFunctionArgInfo::QUEUE_PTR); 4915 case Intrinsic::amdgcn_implicit_buffer_ptr: 4916 return legalizePreloadedArgIntrin( 4917 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4918 case Intrinsic::amdgcn_dispatch_id: 4919 return legalizePreloadedArgIntrin(MI, MRI, B, 4920 AMDGPUFunctionArgInfo::DISPATCH_ID); 4921 case Intrinsic::amdgcn_fdiv_fast: 4922 return legalizeFDIVFastIntrin(MI, MRI, B); 4923 case Intrinsic::amdgcn_is_shared: 4924 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4925 case Intrinsic::amdgcn_is_private: 4926 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4927 case Intrinsic::amdgcn_wavefrontsize: { 4928 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4929 MI.eraseFromParent(); 4930 return true; 4931 } 4932 case Intrinsic::amdgcn_s_buffer_load: 4933 return legalizeSBufferLoad(Helper, MI); 4934 case Intrinsic::amdgcn_raw_buffer_store: 4935 case Intrinsic::amdgcn_struct_buffer_store: 4936 return legalizeBufferStore(MI, MRI, B, false, false); 4937 case Intrinsic::amdgcn_raw_buffer_store_format: 4938 case Intrinsic::amdgcn_struct_buffer_store_format: 4939 return legalizeBufferStore(MI, MRI, B, false, true); 4940 case Intrinsic::amdgcn_raw_tbuffer_store: 4941 case Intrinsic::amdgcn_struct_tbuffer_store: 4942 return legalizeBufferStore(MI, MRI, B, true, true); 4943 case Intrinsic::amdgcn_raw_buffer_load: 4944 case Intrinsic::amdgcn_struct_buffer_load: 4945 return legalizeBufferLoad(MI, MRI, B, false, false); 4946 case Intrinsic::amdgcn_raw_buffer_load_format: 4947 case Intrinsic::amdgcn_struct_buffer_load_format: 4948 return legalizeBufferLoad(MI, MRI, B, true, false); 4949 case Intrinsic::amdgcn_raw_tbuffer_load: 4950 case Intrinsic::amdgcn_struct_tbuffer_load: 4951 return legalizeBufferLoad(MI, MRI, B, true, true); 4952 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4953 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4954 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4955 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4956 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4957 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4958 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4959 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4960 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4961 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4962 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4963 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4964 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4965 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4966 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4967 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4968 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4969 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4970 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4971 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4972 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4973 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4974 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4975 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4976 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4977 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4978 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4979 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4980 case Intrinsic::amdgcn_buffer_atomic_fadd: 4981 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4982 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4983 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4984 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4985 return legalizeBufferAtomic(MI, B, IntrID); 4986 case Intrinsic::amdgcn_atomic_inc: 4987 return legalizeAtomicIncDec(MI, B, true); 4988 case Intrinsic::amdgcn_atomic_dec: 4989 return legalizeAtomicIncDec(MI, B, false); 4990 case Intrinsic::trap: 4991 return legalizeTrapIntrinsic(MI, MRI, B); 4992 case Intrinsic::debugtrap: 4993 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4994 case Intrinsic::amdgcn_rsq_clamp: 4995 return legalizeRsqClampIntrinsic(MI, MRI, B); 4996 case Intrinsic::amdgcn_ds_fadd: 4997 case Intrinsic::amdgcn_ds_fmin: 4998 case Intrinsic::amdgcn_ds_fmax: 4999 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 5000 case Intrinsic::amdgcn_image_bvh_intersect_ray: 5001 return legalizeBVHIntrinsic(MI, B); 5002 default: { 5003 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 5004 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 5005 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 5006 return true; 5007 } 5008 } 5009 5010 return true; 5011 } 5012