1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUInstrInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/BinaryFormat/ELF.h" 24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/IntrinsicsAMDGPU.h" 29 30 #define DEBUG_TYPE "amdgpu-legalinfo" 31 32 using namespace llvm; 33 using namespace LegalizeActions; 34 using namespace LegalizeMutations; 35 using namespace LegalityPredicates; 36 using namespace MIPatternMatch; 37 38 // Hack until load/store selection patterns support any tuple of legal types. 39 static cl::opt<bool> EnableNewLegality( 40 "amdgpu-global-isel-new-legality", 41 cl::desc("Use GlobalISel desired legality, rather than try to use" 42 "rules compatible with selection patterns"), 43 cl::init(false), 44 cl::ReallyHidden); 45 46 static constexpr unsigned MaxRegisterSize = 1024; 47 48 // Round the number of elements to the next power of two elements 49 static LLT getPow2VectorType(LLT Ty) { 50 unsigned NElts = Ty.getNumElements(); 51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 52 return Ty.changeNumElements(Pow2NElts); 53 } 54 55 // Round the number of bits to the next power of two bits 56 static LLT getPow2ScalarType(LLT Ty) { 57 unsigned Bits = Ty.getSizeInBits(); 58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 59 return LLT::scalar(Pow2Bits); 60 } 61 62 /// \returs true if this is an odd sized vector which should widen by adding an 63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 64 /// excludes s1 vectors, which should always be scalarized. 
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 99 }; 100 } 101 102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 const LLT EltTy = Ty.getElementType(); 106 unsigned Size = Ty.getSizeInBits(); 107 unsigned Pieces = (Size + 63) / 64; 108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 110 }; 111 } 112 113 // Increase the number of vector elements to reach the next multiple of 32-bit 114 // type. 115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 116 return [=](const LegalityQuery &Query) { 117 const LLT Ty = Query.Types[TypeIdx]; 118 119 const LLT EltTy = Ty.getElementType(); 120 const int Size = Ty.getSizeInBits(); 121 const int EltSize = EltTy.getSizeInBits(); 122 const int NextMul32 = (Size + 31) / 32; 123 124 assert(EltSize < 32); 125 126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 128 }; 129 } 130 131 static LLT getBitcastRegisterType(const LLT Ty) { 132 const unsigned Size = Ty.getSizeInBits(); 133 134 LLT CoercedTy; 135 if (Size <= 32) { 136 // <2 x s8> -> s16 137 // <4 x s8> -> s32 138 return LLT::scalar(Size); 139 } 140 141 return LLT::scalarOrVector(Size / 32, 32); 142 } 143 144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT Ty = Query.Types[TypeIdx]; 147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 148 }; 149 } 150 151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 152 return [=](const LegalityQuery &Query) { 153 const LLT Ty = Query.Types[TypeIdx]; 154 unsigned Size = Ty.getSizeInBits(); 155 assert(Size % 32 == 0); 156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 157 }; 158 } 159 160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 161 return [=](const LegalityQuery &Query) { 162 const LLT QueryTy = Query.Types[TypeIdx]; 163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 164 }; 165 } 166 167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 171 }; 
172 } 173 174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 178 }; 179 } 180 181 static bool isRegisterSize(unsigned Size) { 182 return Size % 32 == 0 && Size <= MaxRegisterSize; 183 } 184 185 static bool isRegisterVectorElementType(LLT EltTy) { 186 const int EltSize = EltTy.getSizeInBits(); 187 return EltSize == 16 || EltSize % 32 == 0; 188 } 189 190 static bool isRegisterVectorType(LLT Ty) { 191 const int EltSize = Ty.getElementType().getSizeInBits(); 192 return EltSize == 32 || EltSize == 64 || 193 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 194 EltSize == 128 || EltSize == 256; 195 } 196 197 static bool isRegisterType(LLT Ty) { 198 if (!isRegisterSize(Ty.getSizeInBits())) 199 return false; 200 201 if (Ty.isVector()) 202 return isRegisterVectorType(Ty); 203 204 return true; 205 } 206 207 // Any combination of 32 or 64-bit elements up the maximum register size, and 208 // multiples of v2s16. 209 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 210 return [=](const LegalityQuery &Query) { 211 return isRegisterType(Query.Types[TypeIdx]); 212 }; 213 } 214 215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 216 return [=](const LegalityQuery &Query) { 217 const LLT QueryTy = Query.Types[TypeIdx]; 218 if (!QueryTy.isVector()) 219 return false; 220 const LLT EltTy = QueryTy.getElementType(); 221 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 222 }; 223 } 224 225 // If we have a truncating store or an extending load with a data size larger 226 // than 32-bits, we need to reduce to a 32-bit type. 227 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 228 return [=](const LegalityQuery &Query) { 229 const LLT Ty = Query.Types[TypeIdx]; 230 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 231 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 232 }; 233 } 234 235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 236 // handle some operations by just promoting the register during 237 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 239 bool IsLoad) { 240 switch (AS) { 241 case AMDGPUAS::PRIVATE_ADDRESS: 242 // FIXME: Private element size. 243 return ST.enableFlatScratch() ? 128 : 32; 244 case AMDGPUAS::LOCAL_ADDRESS: 245 return ST.useDS128() ? 128 : 64; 246 case AMDGPUAS::GLOBAL_ADDRESS: 247 case AMDGPUAS::CONSTANT_ADDRESS: 248 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 249 // Treat constant and global as identical. SMRD loads are sometimes usable for 250 // global loads (ideally constant address space should be eliminated) 251 // depending on the context. Legality cannot be context dependent, but 252 // RegBankSelect can split the load as necessary depending on the pointer 253 // register bank/uniformity and if the memory is invariant or not written in a 254 // kernel. 255 return IsLoad ? 512 : 128; 256 default: 257 // Flat addresses may contextually need to be split to 32-bit parts if they 258 // may alias scratch depending on the subtarget. 
259 return 128; 260 } 261 } 262 263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 264 const LegalityQuery &Query) { 265 const LLT Ty = Query.Types[0]; 266 267 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 268 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 269 270 unsigned RegSize = Ty.getSizeInBits(); 271 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 272 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 273 unsigned AS = Query.Types[1].getAddressSpace(); 274 275 // All of these need to be custom lowered to cast the pointer operand. 276 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 277 return false; 278 279 // Do not handle extending vector loads. 280 if (Ty.isVector() && MemSize != RegSize) 281 return false; 282 283 // TODO: We should be able to widen loads if the alignment is high enough, but 284 // we also need to modify the memory access size. 285 #if 0 286 // Accept widening loads based on alignment. 287 if (IsLoad && MemSize < Size) 288 MemSize = std::max(MemSize, Align); 289 #endif 290 291 // Only 1-byte and 2-byte to 32-bit extloads are valid. 292 if (MemSize != RegSize && RegSize != 32) 293 return false; 294 295 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 296 return false; 297 298 switch (MemSize) { 299 case 8: 300 case 16: 301 case 32: 302 case 64: 303 case 128: 304 break; 305 case 96: 306 if (!ST.hasDwordx3LoadStores()) 307 return false; 308 break; 309 case 256: 310 case 512: 311 // These may contextually need to be broken down. 312 break; 313 default: 314 return false; 315 } 316 317 assert(RegSize >= MemSize); 318 319 if (AlignBits < MemSize) { 320 const SITargetLowering *TLI = ST.getTargetLowering(); 321 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 322 Align(AlignBits / 8))) 323 return false; 324 } 325 326 return true; 327 } 328 329 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 330 // workaround this. Eventually it should ignore the type for loads and only care 331 // about the size. Return true in cases where we will workaround this for now by 332 // bitcasting. 333 static bool loadStoreBitcastWorkaround(const LLT Ty) { 334 if (EnableNewLegality) 335 return false; 336 337 const unsigned Size = Ty.getSizeInBits(); 338 if (Size <= 64) 339 return false; 340 if (!Ty.isVector()) 341 return true; 342 343 LLT EltTy = Ty.getElementType(); 344 if (EltTy.isPointer()) 345 return true; 346 347 unsigned EltSize = EltTy.getSizeInBits(); 348 return EltSize != 32 && EltSize != 64; 349 } 350 351 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 352 const LLT Ty = Query.Types[0]; 353 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 354 !loadStoreBitcastWorkaround(Ty); 355 } 356 357 /// Return true if a load or store of the type should be lowered with a bitcast 358 /// to a different type. 359 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 360 const unsigned MemSizeInBits) { 361 const unsigned Size = Ty.getSizeInBits(); 362 if (Size != MemSizeInBits) 363 return Size <= 32 && Ty.isVector(); 364 365 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 366 return true; 367 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 368 !isRegisterVectorElementType(Ty.getElementType()); 369 } 370 371 /// Return true if we should legalize a load by widening an odd sized memory 372 /// access up to the alignment. Note this case when the memory access itself 373 /// changes, not the size of the result register. 
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 375 unsigned AlignInBits, unsigned AddrSpace, 376 unsigned Opcode) { 377 // We don't want to widen cases that are naturally legal. 378 if (isPowerOf2_32(SizeInBits)) 379 return false; 380 381 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 382 // end up widening these for a scalar load during RegBankSelect, since there 383 // aren't 96-bit scalar loads. 384 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 385 return false; 386 387 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 388 return false; 389 390 // A load is known dereferenceable up to the alignment, so it's legal to widen 391 // to it. 392 // 393 // TODO: Could check dereferenceable for less aligned cases. 394 unsigned RoundedSize = NextPowerOf2(SizeInBits); 395 if (AlignInBits < RoundedSize) 396 return false; 397 398 // Do not widen if it would introduce a slow unaligned load. 399 const SITargetLowering *TLI = ST.getTargetLowering(); 400 bool Fast = false; 401 return TLI->allowsMisalignedMemoryAccessesImpl( 402 RoundedSize, AddrSpace, Align(AlignInBits / 8), 403 MachineMemOperand::MOLoad, &Fast) && 404 Fast; 405 } 406 407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 408 unsigned Opcode) { 409 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 410 return false; 411 412 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 413 Query.MMODescrs[0].AlignInBits, 414 Query.Types[1].getAddressSpace(), Opcode); 415 } 416 417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 418 const GCNTargetMachine &TM) 419 : ST(ST_) { 420 using namespace TargetOpcode; 421 422 auto GetAddrSpacePtr = [&TM](unsigned AS) { 423 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 424 }; 425 426 const LLT S1 = LLT::scalar(1); 427 const LLT S8 = LLT::scalar(8); 428 const LLT S16 = LLT::scalar(16); 429 const LLT S32 = LLT::scalar(32); 430 const LLT S64 = LLT::scalar(64); 431 const LLT S128 = LLT::scalar(128); 432 const LLT S256 = LLT::scalar(256); 433 const LLT S512 = LLT::scalar(512); 434 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 435 436 const LLT V2S8 = LLT::vector(2, 8); 437 const LLT V2S16 = LLT::vector(2, 16); 438 const LLT V4S16 = LLT::vector(4, 16); 439 440 const LLT V2S32 = LLT::vector(2, 32); 441 const LLT V3S32 = LLT::vector(3, 32); 442 const LLT V4S32 = LLT::vector(4, 32); 443 const LLT V5S32 = LLT::vector(5, 32); 444 const LLT V6S32 = LLT::vector(6, 32); 445 const LLT V7S32 = LLT::vector(7, 32); 446 const LLT V8S32 = LLT::vector(8, 32); 447 const LLT V9S32 = LLT::vector(9, 32); 448 const LLT V10S32 = LLT::vector(10, 32); 449 const LLT V11S32 = LLT::vector(11, 32); 450 const LLT V12S32 = LLT::vector(12, 32); 451 const LLT V13S32 = LLT::vector(13, 32); 452 const LLT V14S32 = LLT::vector(14, 32); 453 const LLT V15S32 = LLT::vector(15, 32); 454 const LLT V16S32 = LLT::vector(16, 32); 455 const LLT V32S32 = LLT::vector(32, 32); 456 457 const LLT V2S64 = LLT::vector(2, 64); 458 const LLT V3S64 = LLT::vector(3, 64); 459 const LLT V4S64 = LLT::vector(4, 64); 460 const LLT V5S64 = LLT::vector(5, 64); 461 const LLT V6S64 = LLT::vector(6, 64); 462 const LLT V7S64 = LLT::vector(7, 64); 463 const LLT V8S64 = LLT::vector(8, 64); 464 const LLT V16S64 = LLT::vector(16, 64); 465 466 std::initializer_list<LLT> AllS32Vectors = 467 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 468 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 469 
std::initializer_list<LLT> AllS64Vectors = 470 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 471 472 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 473 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 474 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 475 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 476 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 477 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 478 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 479 480 const LLT CodePtr = FlatPtr; 481 482 const std::initializer_list<LLT> AddrSpaces64 = { 483 GlobalPtr, ConstantPtr, FlatPtr 484 }; 485 486 const std::initializer_list<LLT> AddrSpaces32 = { 487 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 488 }; 489 490 const std::initializer_list<LLT> FPTypesBase = { 491 S32, S64 492 }; 493 494 const std::initializer_list<LLT> FPTypes16 = { 495 S32, S64, S16 496 }; 497 498 const std::initializer_list<LLT> FPTypesPK16 = { 499 S32, S64, S16, V2S16 500 }; 501 502 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 503 504 auto &LegacyInfo = getLegacyLegalizerInfo(); 505 LegacyInfo.setAction({G_BRCOND, S1}, 506 LegacyLegalizeActions::Legal); // VCC branches 507 LegacyInfo.setAction({G_BRCOND, S32}, 508 LegacyLegalizeActions::Legal); // SCC branches 509 510 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 511 // elements for v3s16 512 getActionDefinitionsBuilder(G_PHI) 513 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 514 .legalFor(AllS32Vectors) 515 .legalFor(AllS64Vectors) 516 .legalFor(AddrSpaces64) 517 .legalFor(AddrSpaces32) 518 .legalIf(isPointer(0)) 519 .clampScalar(0, S16, S256) 520 .widenScalarToNextPow2(0, 32) 521 .clampMaxNumElements(0, S32, 16) 522 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 523 .scalarize(0); 524 525 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 526 // Full set of gfx9 features. 527 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 528 .legalFor({S32, S16, V2S16}) 529 .clampScalar(0, S16, S32) 530 .clampMaxNumElements(0, S16, 2) 531 .scalarize(0) 532 .widenScalarToNextPow2(0, 32); 533 534 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 535 .legalFor({S32, S16, V2S16}) // Clamp modifier 536 .minScalarOrElt(0, S16) 537 .clampMaxNumElements(0, S16, 2) 538 .scalarize(0) 539 .widenScalarToNextPow2(0, 32) 540 .lower(); 541 } else if (ST.has16BitInsts()) { 542 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 543 .legalFor({S32, S16}) 544 .clampScalar(0, S16, S32) 545 .scalarize(0) 546 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 547 548 // Technically the saturating operations require clamp bit support, but this 549 // was introduced at the same time as 16-bit operations. 550 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 551 .legalFor({S32, S16}) // Clamp modifier 552 .minScalar(0, S16) 553 .scalarize(0) 554 .widenScalarToNextPow2(0, 16) 555 .lower(); 556 557 // We're just lowering this, but it helps get a better result to try to 558 // coerce to the desired type first. 
559 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 560 .minScalar(0, S16) 561 .scalarize(0) 562 .lower(); 563 } else { 564 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 565 .legalFor({S32}) 566 .clampScalar(0, S32, S32) 567 .scalarize(0); 568 569 if (ST.hasIntClamp()) { 570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 571 .legalFor({S32}) // Clamp modifier. 572 .scalarize(0) 573 .minScalarOrElt(0, S32) 574 .lower(); 575 } else { 576 // Clamp bit support was added in VI, along with 16-bit operations. 577 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 578 .minScalar(0, S32) 579 .scalarize(0) 580 .lower(); 581 } 582 583 // FIXME: DAG expansion gets better results. The widening uses the smaller 584 // range values and goes for the min/max lowering directly. 585 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 586 .minScalar(0, S32) 587 .scalarize(0) 588 .lower(); 589 } 590 591 getActionDefinitionsBuilder( 592 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 593 .customFor({S32, S64}) 594 .clampScalar(0, S32, S64) 595 .widenScalarToNextPow2(0, 32) 596 .scalarize(0); 597 598 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 599 .legalFor({S32}) 600 .maxScalarOrElt(0, S32); 601 602 if (ST.hasVOP3PInsts()) { 603 Mulh 604 .clampMaxNumElements(0, S8, 2) 605 .lowerFor({V2S8}); 606 } 607 608 Mulh 609 .scalarize(0) 610 .lower(); 611 612 // Report legal for any types we can handle anywhere. For the cases only legal 613 // on the SALU, RegBankSelect will be able to re-legalize. 614 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 615 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 616 .clampScalar(0, S32, S64) 617 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 618 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 619 .widenScalarToNextPow2(0) 620 .scalarize(0); 621 622 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 623 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 624 .legalFor({{S32, S1}, {S32, S32}}) 625 .minScalar(0, S32) 626 // TODO: .scalarize(0) 627 .lower(); 628 629 getActionDefinitionsBuilder(G_BITCAST) 630 // Don't worry about the size constraint. 631 .legalIf(all(isRegisterType(0), isRegisterType(1))) 632 .lower(); 633 634 635 getActionDefinitionsBuilder(G_CONSTANT) 636 .legalFor({S1, S32, S64, S16, GlobalPtr, 637 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 638 .legalIf(isPointer(0)) 639 .clampScalar(0, S32, S64) 640 .widenScalarToNextPow2(0); 641 642 getActionDefinitionsBuilder(G_FCONSTANT) 643 .legalFor({S32, S64, S16}) 644 .clampScalar(0, S16, S64); 645 646 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 647 .legalIf(isRegisterType(0)) 648 // s1 and s16 are special cases because they have legal operations on 649 // them, but don't really occupy registers in the normal way. 650 .legalFor({S1, S16}) 651 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 652 .clampScalarOrElt(0, S32, MaxScalar) 653 .widenScalarToNextPow2(0, 32) 654 .clampMaxNumElements(0, S32, 16); 655 656 LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr}, 657 LegacyLegalizeActions::Legal); 658 659 // If the amount is divergent, we have to do a wave reduction to get the 660 // maximum value, so this is expanded during RegBankSelect. 
661 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 662 .legalFor({{PrivatePtr, S32}}); 663 664 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 665 .customIf(typeIsNot(0, PrivatePtr)); 666 667 LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal); 668 669 auto &FPOpActions = getActionDefinitionsBuilder( 670 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 671 .legalFor({S32, S64}); 672 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 673 .customFor({S32, S64}); 674 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 675 .customFor({S32, S64}); 676 677 if (ST.has16BitInsts()) { 678 if (ST.hasVOP3PInsts()) 679 FPOpActions.legalFor({S16, V2S16}); 680 else 681 FPOpActions.legalFor({S16}); 682 683 TrigActions.customFor({S16}); 684 FDIVActions.customFor({S16}); 685 } 686 687 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 688 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 689 690 if (ST.hasVOP3PInsts()) { 691 MinNumMaxNum.customFor(FPTypesPK16) 692 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 693 .clampMaxNumElements(0, S16, 2) 694 .clampScalar(0, S16, S64) 695 .scalarize(0); 696 } else if (ST.has16BitInsts()) { 697 MinNumMaxNum.customFor(FPTypes16) 698 .clampScalar(0, S16, S64) 699 .scalarize(0); 700 } else { 701 MinNumMaxNum.customFor(FPTypesBase) 702 .clampScalar(0, S32, S64) 703 .scalarize(0); 704 } 705 706 if (ST.hasVOP3PInsts()) 707 FPOpActions.clampMaxNumElements(0, S16, 2); 708 709 FPOpActions 710 .scalarize(0) 711 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 712 713 TrigActions 714 .scalarize(0) 715 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 716 717 FDIVActions 718 .scalarize(0) 719 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 720 721 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 722 .legalFor(FPTypesPK16) 723 .clampMaxNumElements(0, S16, 2) 724 .scalarize(0) 725 .clampScalar(0, S16, S64); 726 727 if (ST.has16BitInsts()) { 728 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 729 .legalFor({S32, S64, S16}) 730 .scalarize(0) 731 .clampScalar(0, S16, S64); 732 } else { 733 getActionDefinitionsBuilder(G_FSQRT) 734 .legalFor({S32, S64}) 735 .scalarize(0) 736 .clampScalar(0, S32, S64); 737 738 if (ST.hasFractBug()) { 739 getActionDefinitionsBuilder(G_FFLOOR) 740 .customFor({S64}) 741 .legalFor({S32, S64}) 742 .scalarize(0) 743 .clampScalar(0, S32, S64); 744 } else { 745 getActionDefinitionsBuilder(G_FFLOOR) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } 750 } 751 752 getActionDefinitionsBuilder(G_FPTRUNC) 753 .legalFor({{S32, S64}, {S16, S32}}) 754 .scalarize(0) 755 .lower(); 756 757 getActionDefinitionsBuilder(G_FPEXT) 758 .legalFor({{S64, S32}, {S32, S16}}) 759 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 760 .scalarize(0); 761 762 getActionDefinitionsBuilder(G_FSUB) 763 // Use actual fsub instruction 764 .legalFor({S32}) 765 // Must use fadd + fneg 766 .lowerFor({S64, S16, V2S16}) 767 .scalarize(0) 768 .clampScalar(0, S32, S64); 769 770 // Whether this is legal depends on the floating point mode for the function. 
771 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 772 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 773 FMad.customFor({S32, S16}); 774 else if (ST.hasMadMacF32Insts()) 775 FMad.customFor({S32}); 776 else if (ST.hasMadF16()) 777 FMad.customFor({S16}); 778 FMad.scalarize(0) 779 .lower(); 780 781 auto &FRem = getActionDefinitionsBuilder(G_FREM); 782 if (ST.has16BitInsts()) { 783 FRem.customFor({S16, S32, S64}); 784 } else { 785 FRem.minScalar(0, S32) 786 .customFor({S32, S64}); 787 } 788 FRem.scalarize(0); 789 790 // TODO: Do we need to clamp maximum bitwidth? 791 getActionDefinitionsBuilder(G_TRUNC) 792 .legalIf(isScalar(0)) 793 .legalFor({{V2S16, V2S32}}) 794 .clampMaxNumElements(0, S16, 2) 795 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 796 // situations (like an invalid implicit use), we don't want to infinite loop 797 // in the legalizer. 798 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 799 .alwaysLegal(); 800 801 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 802 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 803 {S32, S1}, {S64, S1}, {S16, S1}}) 804 .scalarize(0) 805 .clampScalar(0, S32, S64) 806 .widenScalarToNextPow2(1, 32); 807 808 // TODO: Split s1->s64 during regbankselect for VALU. 809 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 810 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 811 .lowerFor({{S32, S64}}) 812 .lowerIf(typeIs(1, S1)) 813 .customFor({{S64, S64}}); 814 if (ST.has16BitInsts()) 815 IToFP.legalFor({{S16, S16}}); 816 IToFP.clampScalar(1, S32, S64) 817 .minScalar(0, S32) 818 .scalarize(0) 819 .widenScalarToNextPow2(1); 820 821 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 822 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 823 .customFor({{S64, S32}, {S64, S64}}) 824 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 825 if (ST.has16BitInsts()) 826 FPToI.legalFor({{S16, S16}}); 827 else 828 FPToI.minScalar(1, S32); 829 830 FPToI.minScalar(0, S32) 831 .widenScalarToNextPow2(0, 32) 832 .scalarize(0) 833 .lower(); 834 835 // Lower roundeven into G_FRINT 836 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 837 .scalarize(0) 838 .lower(); 839 840 if (ST.has16BitInsts()) { 841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 842 .legalFor({S16, S32, S64}) 843 .clampScalar(0, S16, S64) 844 .scalarize(0); 845 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 847 .legalFor({S32, S64}) 848 .clampScalar(0, S32, S64) 849 .scalarize(0); 850 } else { 851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 852 .legalFor({S32}) 853 .customFor({S64}) 854 .clampScalar(0, S32, S64) 855 .scalarize(0); 856 } 857 858 getActionDefinitionsBuilder(G_PTR_ADD) 859 .legalIf(all(isPointer(0), sameSize(0, 1))) 860 .scalarize(0) 861 .scalarSameSizeAs(1, 0); 862 863 getActionDefinitionsBuilder(G_PTRMASK) 864 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 865 .scalarSameSizeAs(1, 0) 866 .scalarize(0); 867 868 auto &CmpBuilder = 869 getActionDefinitionsBuilder(G_ICMP) 870 // The compare output type differs based on the register bank of the output, 871 // so make both s1 and s32 legal. 872 // 873 // Scalar compares producing output in scc will be promoted to s32, as that 874 // is the allocatable register type that will be needed for the copy from 875 // scc. 
This will be promoted during RegBankSelect, and we assume something 876 // before that won't try to use s32 result types. 877 // 878 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 879 // bank. 880 .legalForCartesianProduct( 881 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 882 .legalForCartesianProduct( 883 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 884 if (ST.has16BitInsts()) { 885 CmpBuilder.legalFor({{S1, S16}}); 886 } 887 888 CmpBuilder 889 .widenScalarToNextPow2(1) 890 .clampScalar(1, S32, S64) 891 .scalarize(0) 892 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 893 894 getActionDefinitionsBuilder(G_FCMP) 895 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 896 .widenScalarToNextPow2(1) 897 .clampScalar(1, S32, S64) 898 .scalarize(0); 899 900 // FIXME: fpow has a selection pattern that should move to custom lowering. 901 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 902 if (ST.has16BitInsts()) 903 Exp2Ops.legalFor({S32, S16}); 904 else 905 Exp2Ops.legalFor({S32}); 906 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 907 Exp2Ops.scalarize(0); 908 909 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 910 if (ST.has16BitInsts()) 911 ExpOps.customFor({{S32}, {S16}}); 912 else 913 ExpOps.customFor({S32}); 914 ExpOps.clampScalar(0, MinScalarFPTy, S32) 915 .scalarize(0); 916 917 getActionDefinitionsBuilder(G_FPOWI) 918 .clampScalar(0, MinScalarFPTy, S32) 919 .lower(); 920 921 // The 64-bit versions produce 32-bit results, but only on the SALU. 922 getActionDefinitionsBuilder(G_CTPOP) 923 .legalFor({{S32, S32}, {S32, S64}}) 924 .clampScalar(0, S32, S32) 925 .clampScalar(1, S32, S64) 926 .scalarize(0) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32); 929 930 // The hardware instructions return a different result on 0 than the generic 931 // instructions expect. The hardware produces -1, but these produce the 932 // bitwidth. 933 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 934 .scalarize(0) 935 .clampScalar(0, S32, S32) 936 .clampScalar(1, S32, S64) 937 .widenScalarToNextPow2(0, 32) 938 .widenScalarToNextPow2(1, 32) 939 .lower(); 940 941 // The 64-bit versions produce 32-bit results, but only on the SALU. 942 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 943 .legalFor({{S32, S32}, {S32, S64}}) 944 .clampScalar(0, S32, S32) 945 .clampScalar(1, S32, S64) 946 .scalarize(0) 947 .widenScalarToNextPow2(0, 32) 948 .widenScalarToNextPow2(1, 32); 949 950 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 951 // RegBankSelect. 952 getActionDefinitionsBuilder(G_BITREVERSE) 953 .legalFor({S32, S64}) 954 .clampScalar(0, S32, S64) 955 .scalarize(0) 956 .widenScalarToNextPow2(0); 957 958 if (ST.has16BitInsts()) { 959 getActionDefinitionsBuilder(G_BSWAP) 960 .legalFor({S16, S32, V2S16}) 961 .clampMaxNumElements(0, S16, 2) 962 // FIXME: Fixing non-power-of-2 before clamp is workaround for 963 // narrowScalar limitation. 
964 .widenScalarToNextPow2(0) 965 .clampScalar(0, S16, S32) 966 .scalarize(0); 967 968 if (ST.hasVOP3PInsts()) { 969 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 970 .legalFor({S32, S16, V2S16}) 971 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 972 .clampMaxNumElements(0, S16, 2) 973 .minScalar(0, S16) 974 .widenScalarToNextPow2(0) 975 .scalarize(0) 976 .lower(); 977 } else { 978 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 979 .legalFor({S32, S16}) 980 .widenScalarToNextPow2(0) 981 .minScalar(0, S16) 982 .scalarize(0) 983 .lower(); 984 } 985 } else { 986 // TODO: Should have same legality without v_perm_b32 987 getActionDefinitionsBuilder(G_BSWAP) 988 .legalFor({S32}) 989 .lowerIf(scalarNarrowerThan(0, 32)) 990 // FIXME: Fixing non-power-of-2 before clamp is workaround for 991 // narrowScalar limitation. 992 .widenScalarToNextPow2(0) 993 .maxScalar(0, S32) 994 .scalarize(0) 995 .lower(); 996 997 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 998 .legalFor({S32}) 999 .minScalar(0, S32) 1000 .widenScalarToNextPow2(0) 1001 .scalarize(0) 1002 .lower(); 1003 } 1004 1005 getActionDefinitionsBuilder(G_INTTOPTR) 1006 // List the common cases 1007 .legalForCartesianProduct(AddrSpaces64, {S64}) 1008 .legalForCartesianProduct(AddrSpaces32, {S32}) 1009 .scalarize(0) 1010 // Accept any address space as long as the size matches 1011 .legalIf(sameSize(0, 1)) 1012 .widenScalarIf(smallerThan(1, 0), 1013 [](const LegalityQuery &Query) { 1014 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1015 }) 1016 .narrowScalarIf(largerThan(1, 0), 1017 [](const LegalityQuery &Query) { 1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1019 }); 1020 1021 getActionDefinitionsBuilder(G_PTRTOINT) 1022 // List the common cases 1023 .legalForCartesianProduct(AddrSpaces64, {S64}) 1024 .legalForCartesianProduct(AddrSpaces32, {S32}) 1025 .scalarize(0) 1026 // Accept any address space as long as the size matches 1027 .legalIf(sameSize(0, 1)) 1028 .widenScalarIf(smallerThan(0, 1), 1029 [](const LegalityQuery &Query) { 1030 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1031 }) 1032 .narrowScalarIf( 1033 largerThan(0, 1), 1034 [](const LegalityQuery &Query) { 1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1036 }); 1037 1038 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1039 .scalarize(0) 1040 .custom(); 1041 1042 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1043 bool IsLoad) -> bool { 1044 const LLT DstTy = Query.Types[0]; 1045 1046 // Split vector extloads. 1047 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1048 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1049 1050 if (MemSize < DstTy.getSizeInBits()) 1051 MemSize = std::max(MemSize, AlignBits); 1052 1053 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1054 return true; 1055 1056 const LLT PtrTy = Query.Types[1]; 1057 unsigned AS = PtrTy.getAddressSpace(); 1058 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1059 return true; 1060 1061 // Catch weird sized loads that don't evenly divide into the access sizes 1062 // TODO: May be able to widen depending on alignment etc. 1063 unsigned NumRegs = (MemSize + 31) / 32; 1064 if (NumRegs == 3) { 1065 if (!ST.hasDwordx3LoadStores()) 1066 return true; 1067 } else { 1068 // If the alignment allows, these should have been widened. 
1069 if (!isPowerOf2_32(NumRegs)) 1070 return true; 1071 } 1072 1073 if (AlignBits < MemSize) { 1074 const SITargetLowering *TLI = ST.getTargetLowering(); 1075 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1076 Align(AlignBits / 8)); 1077 } 1078 1079 return false; 1080 }; 1081 1082 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1083 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1084 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1085 1086 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1087 // LDS 1088 // TODO: Unsupported flat for SI. 1089 1090 for (unsigned Op : {G_LOAD, G_STORE}) { 1091 const bool IsStore = Op == G_STORE; 1092 1093 auto &Actions = getActionDefinitionsBuilder(Op); 1094 // Explicitly list some common cases. 1095 // TODO: Does this help compile time at all? 1096 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1097 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1098 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1099 {S64, GlobalPtr, 64, GlobalAlign32}, 1100 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1101 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1102 {S32, GlobalPtr, 8, GlobalAlign8}, 1103 {S32, GlobalPtr, 16, GlobalAlign16}, 1104 1105 {S32, LocalPtr, 32, 32}, 1106 {S64, LocalPtr, 64, 32}, 1107 {V2S32, LocalPtr, 64, 32}, 1108 {S32, LocalPtr, 8, 8}, 1109 {S32, LocalPtr, 16, 16}, 1110 {V2S16, LocalPtr, 32, 32}, 1111 1112 {S32, PrivatePtr, 32, 32}, 1113 {S32, PrivatePtr, 8, 8}, 1114 {S32, PrivatePtr, 16, 16}, 1115 {V2S16, PrivatePtr, 32, 32}, 1116 1117 {S32, ConstantPtr, 32, GlobalAlign32}, 1118 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1119 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1120 {S64, ConstantPtr, 64, GlobalAlign32}, 1121 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1122 Actions.legalIf( 1123 [=](const LegalityQuery &Query) -> bool { 1124 return isLoadStoreLegal(ST, Query); 1125 }); 1126 1127 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1128 // 64-bits. 1129 // 1130 // TODO: Should generalize bitcast action into coerce, which will also cover 1131 // inserting addrspacecasts. 1132 Actions.customIf(typeIs(1, Constant32Ptr)); 1133 1134 // Turn any illegal element vectors into something easier to deal 1135 // with. These will ultimately produce 32-bit scalar shifts to extract the 1136 // parts anyway. 1137 // 1138 // For odd 16-bit element vectors, prefer to split those into pieces with 1139 // 16-bit vector parts. 1140 Actions.bitcastIf( 1141 [=](const LegalityQuery &Query) -> bool { 1142 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1143 Query.MMODescrs[0].SizeInBits); 1144 }, bitcastToRegisterType(0)); 1145 1146 if (!IsStore) { 1147 // Widen suitably aligned loads by loading extra bytes. The standard 1148 // legalization actions can't properly express widening memory operands. 
1149 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1150 return shouldWidenLoad(ST, Query, G_LOAD); 1151 }); 1152 } 1153 1154 // FIXME: load/store narrowing should be moved to lower action 1155 Actions 1156 .narrowScalarIf( 1157 [=](const LegalityQuery &Query) -> bool { 1158 return !Query.Types[0].isVector() && 1159 needToSplitMemOp(Query, Op == G_LOAD); 1160 }, 1161 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1162 const LLT DstTy = Query.Types[0]; 1163 const LLT PtrTy = Query.Types[1]; 1164 1165 const unsigned DstSize = DstTy.getSizeInBits(); 1166 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1167 1168 // Split extloads. 1169 if (DstSize > MemSize) 1170 return std::make_pair(0, LLT::scalar(MemSize)); 1171 1172 if (!isPowerOf2_32(DstSize)) { 1173 // We're probably decomposing an odd sized store. Try to split 1174 // to the widest type. TODO: Account for alignment. As-is it 1175 // should be OK, since the new parts will be further legalized. 1176 unsigned FloorSize = PowerOf2Floor(DstSize); 1177 return std::make_pair(0, LLT::scalar(FloorSize)); 1178 } 1179 1180 if (DstSize > 32 && (DstSize % 32 != 0)) { 1181 // FIXME: Need a way to specify non-extload of larger size if 1182 // suitably aligned. 1183 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1184 } 1185 1186 unsigned MaxSize = maxSizeForAddrSpace(ST, 1187 PtrTy.getAddressSpace(), 1188 Op == G_LOAD); 1189 if (MemSize > MaxSize) 1190 return std::make_pair(0, LLT::scalar(MaxSize)); 1191 1192 unsigned Align = Query.MMODescrs[0].AlignInBits; 1193 return std::make_pair(0, LLT::scalar(Align)); 1194 }) 1195 .fewerElementsIf( 1196 [=](const LegalityQuery &Query) -> bool { 1197 return Query.Types[0].isVector() && 1198 needToSplitMemOp(Query, Op == G_LOAD); 1199 }, 1200 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1201 const LLT DstTy = Query.Types[0]; 1202 const LLT PtrTy = Query.Types[1]; 1203 1204 LLT EltTy = DstTy.getElementType(); 1205 unsigned MaxSize = maxSizeForAddrSpace(ST, 1206 PtrTy.getAddressSpace(), 1207 Op == G_LOAD); 1208 1209 // FIXME: Handle widened to power of 2 results better. This ends 1210 // up scalarizing. 1211 // FIXME: 3 element stores scalarized on SI 1212 1213 // Split if it's too large for the address space. 1214 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1215 unsigned NumElts = DstTy.getNumElements(); 1216 unsigned EltSize = EltTy.getSizeInBits(); 1217 1218 if (MaxSize % EltSize == 0) { 1219 return std::make_pair( 1220 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1221 } 1222 1223 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1224 1225 // FIXME: Refine when odd breakdowns handled 1226 // The scalars will need to be re-legalized. 1227 if (NumPieces == 1 || NumPieces >= NumElts || 1228 NumElts % NumPieces != 0) 1229 return std::make_pair(0, EltTy); 1230 1231 return std::make_pair(0, 1232 LLT::vector(NumElts / NumPieces, EltTy)); 1233 } 1234 1235 // FIXME: We could probably handle weird extending loads better. 1236 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1237 if (DstTy.getSizeInBits() > MemSize) 1238 return std::make_pair(0, EltTy); 1239 1240 unsigned EltSize = EltTy.getSizeInBits(); 1241 unsigned DstSize = DstTy.getSizeInBits(); 1242 if (!isPowerOf2_32(DstSize)) { 1243 // We're probably decomposing an odd sized store. Try to split 1244 // to the widest type. TODO: Account for alignment. As-is it 1245 // should be OK, since the new parts will be further legalized. 
1246 unsigned FloorSize = PowerOf2Floor(DstSize); 1247 return std::make_pair( 1248 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1249 } 1250 1251 // Need to split because of alignment. 1252 unsigned Align = Query.MMODescrs[0].AlignInBits; 1253 if (EltSize > Align && 1254 (EltSize / Align < DstTy.getNumElements())) { 1255 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1256 } 1257 1258 // May need relegalization for the scalars. 1259 return std::make_pair(0, EltTy); 1260 }) 1261 .lowerIfMemSizeNotPow2() 1262 .minScalar(0, S32) 1263 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1264 .widenScalarToNextPow2(0) 1265 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1266 .lower(); 1267 } 1268 1269 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1270 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1271 {S32, GlobalPtr, 16, 2 * 8}, 1272 {S32, LocalPtr, 8, 8}, 1273 {S32, LocalPtr, 16, 16}, 1274 {S32, PrivatePtr, 8, 8}, 1275 {S32, PrivatePtr, 16, 16}, 1276 {S32, ConstantPtr, 8, 8}, 1277 {S32, ConstantPtr, 16, 2 * 8}}) 1278 .legalIf( 1279 [=](const LegalityQuery &Query) -> bool { 1280 return isLoadStoreLegal(ST, Query); 1281 }); 1282 1283 if (ST.hasFlatAddressSpace()) { 1284 ExtLoads.legalForTypesWithMemDesc( 1285 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1286 } 1287 1288 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1289 // 64-bits. 1290 // 1291 // TODO: Should generalize bitcast action into coerce, which will also cover 1292 // inserting addrspacecasts. 1293 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1294 1295 ExtLoads.clampScalar(0, S32, S32) 1296 .widenScalarToNextPow2(0) 1297 .unsupportedIfMemSizeNotPow2() 1298 .lower(); 1299 1300 auto &Atomics = getActionDefinitionsBuilder( 1301 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1302 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1303 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1304 G_ATOMICRMW_UMIN}) 1305 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1306 {S64, GlobalPtr}, {S64, LocalPtr}, 1307 {S32, RegionPtr}, {S64, RegionPtr}}); 1308 if (ST.hasFlatAddressSpace()) { 1309 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1310 } 1311 1312 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1313 if (ST.hasLDSFPAtomics()) { 1314 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1315 if (ST.hasGFX90AInsts()) 1316 Atomic.legalFor({{S64, LocalPtr}}); 1317 } 1318 if (ST.hasAtomicFaddInsts()) 1319 Atomic.legalFor({{S32, GlobalPtr}}); 1320 1321 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1322 // demarshalling 1323 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1324 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1325 {S32, FlatPtr}, {S64, FlatPtr}}) 1326 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1327 {S32, RegionPtr}, {S64, RegionPtr}}); 1328 // TODO: Pointer types, any 32-bit or 64-bit vector 1329 1330 // Condition should be s32 for scalar, s1 for vector. 
1331 getActionDefinitionsBuilder(G_SELECT) 1332 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1333 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1334 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1335 .clampScalar(0, S16, S64) 1336 .scalarize(1) 1337 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1338 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1339 .clampMaxNumElements(0, S32, 2) 1340 .clampMaxNumElements(0, LocalPtr, 2) 1341 .clampMaxNumElements(0, PrivatePtr, 2) 1342 .scalarize(0) 1343 .widenScalarToNextPow2(0) 1344 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1345 1346 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1347 // be more flexible with the shift amount type. 1348 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1349 .legalFor({{S32, S32}, {S64, S32}}); 1350 if (ST.has16BitInsts()) { 1351 if (ST.hasVOP3PInsts()) { 1352 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1353 .clampMaxNumElements(0, S16, 2); 1354 } else 1355 Shifts.legalFor({{S16, S16}}); 1356 1357 // TODO: Support 16-bit shift amounts for all types 1358 Shifts.widenScalarIf( 1359 [=](const LegalityQuery &Query) { 1360 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1361 // 32-bit amount. 1362 const LLT ValTy = Query.Types[0]; 1363 const LLT AmountTy = Query.Types[1]; 1364 return ValTy.getSizeInBits() <= 16 && 1365 AmountTy.getSizeInBits() < 16; 1366 }, changeTo(1, S16)); 1367 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1368 Shifts.clampScalar(1, S32, S32); 1369 Shifts.clampScalar(0, S16, S64); 1370 Shifts.widenScalarToNextPow2(0, 16); 1371 1372 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1373 .minScalar(0, S16) 1374 .scalarize(0) 1375 .lower(); 1376 } else { 1377 // Make sure we legalize the shift amount type first, as the general 1378 // expansion for the shifted type will produce much worse code if it hasn't 1379 // been truncated already. 1380 Shifts.clampScalar(1, S32, S32); 1381 Shifts.clampScalar(0, S32, S64); 1382 Shifts.widenScalarToNextPow2(0, 32); 1383 1384 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1385 .minScalar(0, S32) 1386 .scalarize(0) 1387 .lower(); 1388 } 1389 Shifts.scalarize(0); 1390 1391 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1392 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1393 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1394 unsigned IdxTypeIdx = 2; 1395 1396 getActionDefinitionsBuilder(Op) 1397 .customIf([=](const LegalityQuery &Query) { 1398 const LLT EltTy = Query.Types[EltTypeIdx]; 1399 const LLT VecTy = Query.Types[VecTypeIdx]; 1400 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1401 const unsigned EltSize = EltTy.getSizeInBits(); 1402 return (EltSize == 32 || EltSize == 64) && 1403 VecTy.getSizeInBits() % 32 == 0 && 1404 VecTy.getSizeInBits() <= MaxRegisterSize && 1405 IdxTy.getSizeInBits() == 32; 1406 }) 1407 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1408 bitcastToVectorElement32(VecTypeIdx)) 1409 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1410 .bitcastIf( 1411 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1412 [=](const LegalityQuery &Query) { 1413 // For > 64-bit element types, try to turn this into a 64-bit 1414 // element vector since we may be able to do better indexing 1415 // if this is scalar. If not, fall back to 32. 
1416 const LLT EltTy = Query.Types[EltTypeIdx]; 1417 const LLT VecTy = Query.Types[VecTypeIdx]; 1418 const unsigned DstEltSize = EltTy.getSizeInBits(); 1419 const unsigned VecSize = VecTy.getSizeInBits(); 1420 1421 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1422 return std::make_pair( 1423 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1424 }) 1425 .clampScalar(EltTypeIdx, S32, S64) 1426 .clampScalar(VecTypeIdx, S32, S64) 1427 .clampScalar(IdxTypeIdx, S32, S32) 1428 .clampMaxNumElements(VecTypeIdx, S32, 32) 1429 // TODO: Clamp elements for 64-bit vectors? 1430 // It should only be necessary with variable indexes. 1431 // As a last resort, lower to the stack 1432 .lower(); 1433 } 1434 1435 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1436 .unsupportedIf([=](const LegalityQuery &Query) { 1437 const LLT &EltTy = Query.Types[1].getElementType(); 1438 return Query.Types[0] != EltTy; 1439 }); 1440 1441 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1442 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1443 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1444 1445 // FIXME: Doesn't handle extract of illegal sizes. 1446 getActionDefinitionsBuilder(Op) 1447 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1448 // FIXME: Multiples of 16 should not be legal. 1449 .legalIf([=](const LegalityQuery &Query) { 1450 const LLT BigTy = Query.Types[BigTyIdx]; 1451 const LLT LitTy = Query.Types[LitTyIdx]; 1452 return (BigTy.getSizeInBits() % 32 == 0) && 1453 (LitTy.getSizeInBits() % 16 == 0); 1454 }) 1455 .widenScalarIf( 1456 [=](const LegalityQuery &Query) { 1457 const LLT BigTy = Query.Types[BigTyIdx]; 1458 return (BigTy.getScalarSizeInBits() < 16); 1459 }, 1460 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1461 .widenScalarIf( 1462 [=](const LegalityQuery &Query) { 1463 const LLT LitTy = Query.Types[LitTyIdx]; 1464 return (LitTy.getScalarSizeInBits() < 16); 1465 }, 1466 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1467 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1468 .widenScalarToNextPow2(BigTyIdx, 32); 1469 1470 } 1471 1472 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1473 .legalForCartesianProduct(AllS32Vectors, {S32}) 1474 .legalForCartesianProduct(AllS64Vectors, {S64}) 1475 .clampNumElements(0, V16S32, V32S32) 1476 .clampNumElements(0, V2S64, V16S64) 1477 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1478 1479 if (ST.hasScalarPackInsts()) { 1480 BuildVector 1481 // FIXME: Should probably widen s1 vectors straight to s32 1482 .minScalarOrElt(0, S16) 1483 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1484 .minScalar(1, S32); 1485 1486 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1487 .legalFor({V2S16, S32}) 1488 .lower(); 1489 BuildVector.minScalarOrElt(0, S32); 1490 } else { 1491 BuildVector.customFor({V2S16, S16}); 1492 BuildVector.minScalarOrElt(0, S32); 1493 1494 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1495 .customFor({V2S16, S32}) 1496 .lower(); 1497 } 1498 1499 BuildVector.legalIf(isRegisterType(0)); 1500 1501 // FIXME: Clamp maximum size 1502 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1503 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1504 .clampMaxNumElements(0, S32, 32) 1505 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1506 .clampMaxNumElements(0, S16, 64); 1507 1508 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1509 // pre-legalize. 
1510 if (ST.hasVOP3PInsts()) { 1511 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1512 .customFor({V2S16, V2S16}) 1513 .lower(); 1514 } else 1515 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1516 1517 // Merge/Unmerge 1518 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1519 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1520 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1521 1522 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1523 const LLT Ty = Query.Types[TypeIdx]; 1524 if (Ty.isVector()) { 1525 const LLT &EltTy = Ty.getElementType(); 1526 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1527 return true; 1528 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1529 return true; 1530 } 1531 return false; 1532 }; 1533 1534 auto &Builder = getActionDefinitionsBuilder(Op) 1535 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1536 .lowerFor({{S16, V2S16}}) 1537 .lowerIf([=](const LegalityQuery &Query) { 1538 const LLT BigTy = Query.Types[BigTyIdx]; 1539 return BigTy.getSizeInBits() == 32; 1540 }) 1541 // Try to widen to s16 first for small types. 1542 // TODO: Only do this on targets with legal s16 shifts 1543 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1544 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1545 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1546 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1547 elementTypeIs(1, S16)), 1548 changeTo(1, V2S16)) 1549 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1550 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1551 // valid. 1552 .clampScalar(LitTyIdx, S32, S512) 1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1554 // Break up vectors with weird elements into scalars 1555 .fewerElementsIf( 1556 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1557 scalarize(0)) 1558 .fewerElementsIf( 1559 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1560 scalarize(1)) 1561 .clampScalar(BigTyIdx, S32, MaxScalar); 1562 1563 if (Op == G_MERGE_VALUES) { 1564 Builder.widenScalarIf( 1565 // TODO: Use 16-bit shifts if legal for 8-bit values? 1566 [=](const LegalityQuery &Query) { 1567 const LLT Ty = Query.Types[LitTyIdx]; 1568 return Ty.getSizeInBits() < 32; 1569 }, 1570 changeTo(LitTyIdx, S32)); 1571 } 1572 1573 Builder.widenScalarIf( 1574 [=](const LegalityQuery &Query) { 1575 const LLT Ty = Query.Types[BigTyIdx]; 1576 return !isPowerOf2_32(Ty.getSizeInBits()) && 1577 Ty.getSizeInBits() % 16 != 0; 1578 }, 1579 [=](const LegalityQuery &Query) { 1580 // Pick the next power of 2, or a multiple of 64 over 128. 1581 // Whichever is smaller. 1582 const LLT &Ty = Query.Types[BigTyIdx]; 1583 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1584 if (NewSizeInBits >= 256) { 1585 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1586 if (RoundedTo < NewSizeInBits) 1587 NewSizeInBits = RoundedTo; 1588 } 1589 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1590 }) 1591 // Any vectors left are the wrong size. Scalarize them. 1592 .scalarize(0) 1593 .scalarize(1); 1594 } 1595 1596 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1597 // RegBankSelect. 
1598 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1599 .legalFor({{S32}, {S64}}); 1600 1601 if (ST.hasVOP3PInsts()) { 1602 SextInReg.lowerFor({{V2S16}}) 1603 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1604 // get more vector shift opportunities, since we'll get those when 1605 // expanded. 1606 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1607 } else if (ST.has16BitInsts()) { 1608 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1609 } else { 1610 // Prefer to promote to s32 before lowering if we don't have 16-bit 1611 // shifts. This avoid a lot of intermediate truncate and extend operations. 1612 SextInReg.lowerFor({{S32}, {S64}}); 1613 } 1614 1615 SextInReg 1616 .scalarize(0) 1617 .clampScalar(0, S32, S64) 1618 .lower(); 1619 1620 // TODO: Only Try to form v2s16 with legal packed instructions. 1621 getActionDefinitionsBuilder(G_FSHR) 1622 .legalFor({{S32, S32}}) 1623 .lowerFor({{V2S16, V2S16}}) 1624 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1625 .scalarize(0) 1626 .lower(); 1627 1628 if (ST.hasVOP3PInsts()) { 1629 getActionDefinitionsBuilder(G_FSHL) 1630 .lowerFor({{V2S16, V2S16}}) 1631 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) 1632 .scalarize(0) 1633 .lower(); 1634 } else { 1635 getActionDefinitionsBuilder(G_FSHL) 1636 .scalarize(0) 1637 .lower(); 1638 } 1639 1640 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1641 .legalFor({S64}); 1642 1643 getActionDefinitionsBuilder(G_FENCE) 1644 .alwaysLegal(); 1645 1646 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1647 .scalarize(0) 1648 .minScalar(0, S32) 1649 .lower(); 1650 1651 getActionDefinitionsBuilder({ 1652 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1653 G_FCOPYSIGN, 1654 1655 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1656 G_ATOMICRMW_NAND, 1657 G_ATOMICRMW_FSUB, 1658 G_READ_REGISTER, 1659 G_WRITE_REGISTER, 1660 1661 G_SADDO, G_SSUBO, 1662 1663 // TODO: Implement 1664 G_FMINIMUM, G_FMAXIMUM}).lower(); 1665 1666 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1667 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1668 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1669 .unsupported(); 1670 1671 getLegacyLegalizerInfo().computeTables(); 1672 verify(*ST.getInstrInfo()); 1673 } 1674 1675 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1676 MachineInstr &MI) const { 1677 MachineIRBuilder &B = Helper.MIRBuilder; 1678 MachineRegisterInfo &MRI = *B.getMRI(); 1679 1680 switch (MI.getOpcode()) { 1681 case TargetOpcode::G_ADDRSPACE_CAST: 1682 return legalizeAddrSpaceCast(MI, MRI, B); 1683 case TargetOpcode::G_FRINT: 1684 return legalizeFrint(MI, MRI, B); 1685 case TargetOpcode::G_FCEIL: 1686 return legalizeFceil(MI, MRI, B); 1687 case TargetOpcode::G_FREM: 1688 return legalizeFrem(MI, MRI, B); 1689 case TargetOpcode::G_INTRINSIC_TRUNC: 1690 return legalizeIntrinsicTrunc(MI, MRI, B); 1691 case TargetOpcode::G_SITOFP: 1692 return legalizeITOFP(MI, MRI, B, true); 1693 case TargetOpcode::G_UITOFP: 1694 return legalizeITOFP(MI, MRI, B, false); 1695 case TargetOpcode::G_FPTOSI: 1696 return legalizeFPTOI(MI, MRI, B, true); 1697 case TargetOpcode::G_FPTOUI: 1698 return legalizeFPTOI(MI, MRI, B, false); 1699 case TargetOpcode::G_FMINNUM: 1700 case TargetOpcode::G_FMAXNUM: 1701 case TargetOpcode::G_FMINNUM_IEEE: 1702 case TargetOpcode::G_FMAXNUM_IEEE: 1703 return legalizeMinNumMaxNum(Helper, MI); 1704 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1705 return legalizeExtractVectorElt(MI, MRI, B); 1706 case TargetOpcode::G_INSERT_VECTOR_ELT: 1707 
return legalizeInsertVectorElt(MI, MRI, B); 1708 case TargetOpcode::G_SHUFFLE_VECTOR: 1709 return legalizeShuffleVector(MI, MRI, B); 1710 case TargetOpcode::G_FSIN: 1711 case TargetOpcode::G_FCOS: 1712 return legalizeSinCos(MI, MRI, B); 1713 case TargetOpcode::G_GLOBAL_VALUE: 1714 return legalizeGlobalValue(MI, MRI, B); 1715 case TargetOpcode::G_LOAD: 1716 case TargetOpcode::G_SEXTLOAD: 1717 case TargetOpcode::G_ZEXTLOAD: 1718 return legalizeLoad(Helper, MI); 1719 case TargetOpcode::G_FMAD: 1720 return legalizeFMad(MI, MRI, B); 1721 case TargetOpcode::G_FDIV: 1722 return legalizeFDIV(MI, MRI, B); 1723 case TargetOpcode::G_UDIV: 1724 case TargetOpcode::G_UREM: 1725 case TargetOpcode::G_UDIVREM: 1726 return legalizeUnsignedDIV_REM(MI, MRI, B); 1727 case TargetOpcode::G_SDIV: 1728 case TargetOpcode::G_SREM: 1729 case TargetOpcode::G_SDIVREM: 1730 return legalizeSignedDIV_REM(MI, MRI, B); 1731 case TargetOpcode::G_ATOMIC_CMPXCHG: 1732 return legalizeAtomicCmpXChg(MI, MRI, B); 1733 case TargetOpcode::G_FLOG: 1734 return legalizeFlog(MI, B, numbers::ln2f); 1735 case TargetOpcode::G_FLOG10: 1736 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1737 case TargetOpcode::G_FEXP: 1738 return legalizeFExp(MI, B); 1739 case TargetOpcode::G_FPOW: 1740 return legalizeFPow(MI, B); 1741 case TargetOpcode::G_FFLOOR: 1742 return legalizeFFloor(MI, MRI, B); 1743 case TargetOpcode::G_BUILD_VECTOR: 1744 return legalizeBuildVector(MI, MRI, B); 1745 default: 1746 return false; 1747 } 1748 1749 llvm_unreachable("expected switch to return"); 1750 } 1751 1752 Register AMDGPULegalizerInfo::getSegmentAperture( 1753 unsigned AS, 1754 MachineRegisterInfo &MRI, 1755 MachineIRBuilder &B) const { 1756 MachineFunction &MF = B.getMF(); 1757 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1758 const LLT S32 = LLT::scalar(32); 1759 1760 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1761 1762 if (ST.hasApertureRegs()) { 1763 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1764 // getreg. 1765 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1766 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1767 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1768 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1769 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1770 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1771 unsigned Encoding = 1772 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1773 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1774 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1775 1776 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1777 1778 B.buildInstr(AMDGPU::S_GETREG_B32) 1779 .addDef(GetReg) 1780 .addImm(Encoding); 1781 MRI.setType(GetReg, S32); 1782 1783 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1784 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1785 } 1786 1787 Register QueuePtr = MRI.createGenericVirtualRegister( 1788 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1789 1790 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1791 return Register(); 1792 1793 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1794 // private_segment_aperture_base_hi. 1795 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1796 1797 // TODO: can we be smarter about machine pointer info? 
1798 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1799 MachineMemOperand *MMO = MF.getMachineMemOperand( 1800 PtrInfo, 1801 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1802 MachineMemOperand::MOInvariant, 1803 4, commonAlignment(Align(64), StructOffset)); 1804 1805 Register LoadAddr; 1806 1807 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1808 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1809 } 1810 1811 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1812 MachineInstr &MI, MachineRegisterInfo &MRI, 1813 MachineIRBuilder &B) const { 1814 MachineFunction &MF = B.getMF(); 1815 1816 const LLT S32 = LLT::scalar(32); 1817 Register Dst = MI.getOperand(0).getReg(); 1818 Register Src = MI.getOperand(1).getReg(); 1819 1820 LLT DstTy = MRI.getType(Dst); 1821 LLT SrcTy = MRI.getType(Src); 1822 unsigned DestAS = DstTy.getAddressSpace(); 1823 unsigned SrcAS = SrcTy.getAddressSpace(); 1824 1825 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1826 // vector element. 1827 assert(!DstTy.isVector()); 1828 1829 const AMDGPUTargetMachine &TM 1830 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1831 1832 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1833 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1834 return true; 1835 } 1836 1837 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1838 // Truncate. 1839 B.buildExtract(Dst, Src, 0); 1840 MI.eraseFromParent(); 1841 return true; 1842 } 1843 1844 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1845 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1846 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1847 1848 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1849 // another. Merge operands are required to be the same type, but creating an 1850 // extra ptrtoint would be kind of pointless. 1851 auto HighAddr = B.buildConstant( 1852 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1853 B.buildMerge(Dst, {Src, HighAddr}); 1854 MI.eraseFromParent(); 1855 return true; 1856 } 1857 1858 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1859 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1860 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1861 unsigned NullVal = TM.getNullPointerValue(DestAS); 1862 1863 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1864 auto FlatNull = B.buildConstant(SrcTy, 0); 1865 1866 // Extract low 32-bits of the pointer. 1867 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1868 1869 auto CmpRes = 1870 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1871 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1872 1873 MI.eraseFromParent(); 1874 return true; 1875 } 1876 1877 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1878 return false; 1879 1880 if (!ST.hasFlatAddressSpace()) 1881 return false; 1882 1883 auto SegmentNull = 1884 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1885 auto FlatNull = 1886 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1887 1888 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1889 if (!ApertureReg.isValid()) 1890 return false; 1891 1892 auto CmpRes = 1893 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1894 1895 // Coerce the type of the low half of the result so we can use merge_values. 
1896 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1897 1898 // TODO: Should we allow mismatched types but matching sizes in merges to 1899 // avoid the ptrtoint? 1900 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1901 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1902 1903 MI.eraseFromParent(); 1904 return true; 1905 } 1906 1907 bool AMDGPULegalizerInfo::legalizeFrint( 1908 MachineInstr &MI, MachineRegisterInfo &MRI, 1909 MachineIRBuilder &B) const { 1910 Register Src = MI.getOperand(1).getReg(); 1911 LLT Ty = MRI.getType(Src); 1912 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1913 1914 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1915 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1916 1917 auto C1 = B.buildFConstant(Ty, C1Val); 1918 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1919 1920 // TODO: Should this propagate fast-math-flags? 1921 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1922 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1923 1924 auto C2 = B.buildFConstant(Ty, C2Val); 1925 auto Fabs = B.buildFAbs(Ty, Src); 1926 1927 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1928 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1929 MI.eraseFromParent(); 1930 return true; 1931 } 1932 1933 bool AMDGPULegalizerInfo::legalizeFceil( 1934 MachineInstr &MI, MachineRegisterInfo &MRI, 1935 MachineIRBuilder &B) const { 1936 1937 const LLT S1 = LLT::scalar(1); 1938 const LLT S64 = LLT::scalar(64); 1939 1940 Register Src = MI.getOperand(1).getReg(); 1941 assert(MRI.getType(Src) == S64); 1942 1943 // result = trunc(src) 1944 // if (src > 0.0 && src != result) 1945 // result += 1.0 1946 1947 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1948 1949 const auto Zero = B.buildFConstant(S64, 0.0); 1950 const auto One = B.buildFConstant(S64, 1.0); 1951 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1952 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1953 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1954 auto Add = B.buildSelect(S64, And, One, Zero); 1955 1956 // TODO: Should this propagate fast-math-flags? 
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2055 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2056 B.buildUITOFP(S64, Unmerge.getReg(1)); 2057 2058 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2059 2060 auto ThirtyTwo = B.buildConstant(S32, 32); 2061 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2062 .addUse(CvtHi.getReg(0)) 2063 .addUse(ThirtyTwo.getReg(0)); 2064 2065 // TODO: Should this propagate fast-math-flags? 2066 B.buildFAdd(Dst, LdExp, CvtLo); 2067 MI.eraseFromParent(); 2068 return true; 2069 } 2070 2071 // TODO: Copied from DAG implementation. Verify logic and document how this 2072 // actually works. 2073 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2074 MachineRegisterInfo &MRI, 2075 MachineIRBuilder &B, 2076 bool Signed) const { 2077 2078 Register Dst = MI.getOperand(0).getReg(); 2079 Register Src = MI.getOperand(1).getReg(); 2080 2081 const LLT S64 = LLT::scalar(64); 2082 const LLT S32 = LLT::scalar(32); 2083 2084 const LLT SrcLT = MRI.getType(Src); 2085 const LLT DstLT = MRI.getType(Dst); 2086 2087 assert((SrcLT == S32 || SrcLT == S64) && DstLT == S64); 2088 2089 unsigned Flags = MI.getFlags(); 2090 2091 // The basic idea of converting a floating point number into a pair of 32-bit 2092 // integers is illustrated as follows: 2093 // 2094 // tf := trunc(val); 2095 // hif := floor(tf * 2^-32); 2096 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2097 // hi := fptoi(hif); 2098 // lo := fptoi(lof); 2099 // 2100 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2101 MachineInstrBuilder Sign; 2102 if (Signed && SrcLT == S32) { 2103 // However, a 32-bit floating point number has only 23 bits mantissa and 2104 // it's not enough to hold all the significant bits of `lof` if val is 2105 // negative. To avoid the loss of precision, We need to take the absolute 2106 // value after truncating and flip the result back based on the original 2107 // signedness. 2108 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2109 Trunc = B.buildFAbs(S32, Trunc, Flags); 2110 } 2111 MachineInstrBuilder K0, K1; 2112 if (SrcLT == S64) { 2113 K0 = B.buildFConstant(S64, 2114 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2115 K1 = B.buildFConstant(S64, 2116 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2117 } else { 2118 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2119 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2120 } 2121 2122 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2123 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2124 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2125 2126 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2127 : B.buildFPTOUI(S32, FloorMul); 2128 auto Lo = B.buildFPTOUI(S32, Fma); 2129 2130 if (Signed && SrcLT == S32) { 2131 // Flip the result based on the signedness, which is either all 0s or 1s. 
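  // Sign is all zeros or all ones (0 or -1) from the arithmetic shift above;
  // (v ^ Sign) - Sign therefore leaves v unchanged when Sign is 0 and takes
  // the two's complement negation of v when Sign is -1.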
2132 Sign = B.buildMerge(S64, {Sign, Sign}); 2133 // r := xor({lo, hi}, sign) - sign; 2134 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2135 } else 2136 B.buildMerge(Dst, {Lo, Hi}); 2137 MI.eraseFromParent(); 2138 2139 return true; 2140 } 2141 2142 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2143 MachineInstr &MI) const { 2144 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2145 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2146 2147 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2148 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2149 2150 // With ieee_mode disabled, the instructions have the correct behavior 2151 // already for G_FMINNUM/G_FMAXNUM 2152 if (!MFI->getMode().IEEE) 2153 return !IsIEEEOp; 2154 2155 if (IsIEEEOp) 2156 return true; 2157 2158 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2159 } 2160 2161 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2162 MachineInstr &MI, MachineRegisterInfo &MRI, 2163 MachineIRBuilder &B) const { 2164 // TODO: Should move some of this into LegalizerHelper. 2165 2166 // TODO: Promote dynamic indexing of s16 to s32 2167 2168 // FIXME: Artifact combiner probably should have replaced the truncated 2169 // constant before this, so we shouldn't need 2170 // getConstantVRegValWithLookThrough. 2171 Optional<ValueAndVReg> MaybeIdxVal = 2172 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2173 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2174 return true; 2175 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2176 2177 Register Dst = MI.getOperand(0).getReg(); 2178 Register Vec = MI.getOperand(1).getReg(); 2179 2180 LLT VecTy = MRI.getType(Vec); 2181 LLT EltTy = VecTy.getElementType(); 2182 assert(EltTy == MRI.getType(Dst)); 2183 2184 if (IdxVal < VecTy.getNumElements()) 2185 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2186 else 2187 B.buildUndef(Dst); 2188 2189 MI.eraseFromParent(); 2190 return true; 2191 } 2192 2193 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2194 MachineInstr &MI, MachineRegisterInfo &MRI, 2195 MachineIRBuilder &B) const { 2196 // TODO: Should move some of this into LegalizerHelper. 2197 2198 // TODO: Promote dynamic indexing of s16 to s32 2199 2200 // FIXME: Artifact combiner probably should have replaced the truncated 2201 // constant before this, so we shouldn't need 2202 // getConstantVRegValWithLookThrough. 2203 Optional<ValueAndVReg> MaybeIdxVal = 2204 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2205 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2206 return true; 2207 2208 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2209 Register Dst = MI.getOperand(0).getReg(); 2210 Register Vec = MI.getOperand(1).getReg(); 2211 Register Ins = MI.getOperand(2).getReg(); 2212 2213 LLT VecTy = MRI.getType(Vec); 2214 LLT EltTy = VecTy.getElementType(); 2215 assert(EltTy == MRI.getType(Ins)); 2216 2217 if (IdxVal < VecTy.getNumElements()) 2218 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2219 else 2220 B.buildUndef(Dst); 2221 2222 MI.eraseFromParent(); 2223 return true; 2224 } 2225 2226 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2227 MachineInstr &MI, MachineRegisterInfo &MRI, 2228 MachineIRBuilder &B) const { 2229 const LLT V2S16 = LLT::vector(2, 16); 2230 2231 Register Dst = MI.getOperand(0).getReg(); 2232 Register Src0 = MI.getOperand(1).getReg(); 2233 LLT DstTy = MRI.getType(Dst); 2234 LLT SrcTy = MRI.getType(Src0); 2235 2236 if (SrcTy == V2S16 && DstTy == V2S16 && 2237 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2238 return true; 2239 2240 MachineIRBuilder HelperBuilder(MI); 2241 GISelObserverWrapper DummyObserver; 2242 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2243 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2244 } 2245 2246 bool AMDGPULegalizerInfo::legalizeSinCos( 2247 MachineInstr &MI, MachineRegisterInfo &MRI, 2248 MachineIRBuilder &B) const { 2249 2250 Register DstReg = MI.getOperand(0).getReg(); 2251 Register SrcReg = MI.getOperand(1).getReg(); 2252 LLT Ty = MRI.getType(DstReg); 2253 unsigned Flags = MI.getFlags(); 2254 2255 Register TrigVal; 2256 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2257 if (ST.hasTrigReducedRange()) { 2258 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2259 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2260 .addUse(MulVal.getReg(0)) 2261 .setMIFlags(Flags).getReg(0); 2262 } else 2263 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2264 2265 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2266 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2267 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2268 .addUse(TrigVal) 2269 .setMIFlags(Flags); 2270 MI.eraseFromParent(); 2271 return true; 2272 } 2273 2274 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2275 MachineIRBuilder &B, 2276 const GlobalValue *GV, 2277 int64_t Offset, 2278 unsigned GAFlags) const { 2279 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2280 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2281 // to the following code sequence: 2282 // 2283 // For constant address space: 2284 // s_getpc_b64 s[0:1] 2285 // s_add_u32 s0, s0, $symbol 2286 // s_addc_u32 s1, s1, 0 2287 // 2288 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2289 // a fixup or relocation is emitted to replace $symbol with a literal 2290 // constant, which is a pc-relative offset from the encoding of the $symbol 2291 // operand to the global variable. 
2292 // 2293 // For global address space: 2294 // s_getpc_b64 s[0:1] 2295 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2296 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2297 // 2298 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2299 // fixups or relocations are emitted to replace $symbol@*@lo and 2300 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2301 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2302 // operand to the global variable. 2303 // 2304 // What we want here is an offset from the value returned by s_getpc 2305 // (which is the address of the s_add_u32 instruction) to the global 2306 // variable, but since the encoding of $symbol starts 4 bytes after the start 2307 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2308 // small. This requires us to add 4 to the global variable offset in order to 2309 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2310 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2311 // instruction. 2312 2313 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2314 2315 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2316 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2317 2318 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2319 .addDef(PCReg); 2320 2321 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2322 if (GAFlags == SIInstrInfo::MO_NONE) 2323 MIB.addImm(0); 2324 else 2325 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2326 2327 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2328 2329 if (PtrTy.getSizeInBits() == 32) 2330 B.buildExtract(DstReg, PCReg, 0); 2331 return true; 2332 } 2333 2334 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2335 MachineInstr &MI, MachineRegisterInfo &MRI, 2336 MachineIRBuilder &B) const { 2337 Register DstReg = MI.getOperand(0).getReg(); 2338 LLT Ty = MRI.getType(DstReg); 2339 unsigned AS = Ty.getAddressSpace(); 2340 2341 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2342 MachineFunction &MF = B.getMF(); 2343 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2344 2345 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2346 if (!MFI->isModuleEntryFunction() && 2347 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2348 const Function &Fn = MF.getFunction(); 2349 DiagnosticInfoUnsupported BadLDSDecl( 2350 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2351 DS_Warning); 2352 Fn.getContext().diagnose(BadLDSDecl); 2353 2354 // We currently don't have a way to correctly allocate LDS objects that 2355 // aren't directly associated with a kernel. We do force inlining of 2356 // functions that use local objects. However, if these dead functions are 2357 // not eliminated, we don't want a compile time error. Just emit a warning 2358 // and a trap, since there should be no callable path here. 2359 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2360 B.buildUndef(DstReg); 2361 MI.eraseFromParent(); 2362 return true; 2363 } 2364 2365 // TODO: We could emit code to handle the initialization somewhere. 
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. It is allocated by the
        // runtime and placed directly after the statically allocated ones.
        // They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2433 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2434 B.buildExtract(DstReg, Load, 0); 2435 } else 2436 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2437 2438 MI.eraseFromParent(); 2439 return true; 2440 } 2441 2442 static LLT widenToNextPowerOf2(LLT Ty) { 2443 if (Ty.isVector()) 2444 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2445 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2446 } 2447 2448 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2449 MachineInstr &MI) const { 2450 MachineIRBuilder &B = Helper.MIRBuilder; 2451 MachineRegisterInfo &MRI = *B.getMRI(); 2452 GISelChangeObserver &Observer = Helper.Observer; 2453 2454 Register PtrReg = MI.getOperand(1).getReg(); 2455 LLT PtrTy = MRI.getType(PtrReg); 2456 unsigned AddrSpace = PtrTy.getAddressSpace(); 2457 2458 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2459 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2460 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2461 Observer.changingInstr(MI); 2462 MI.getOperand(1).setReg(Cast.getReg(0)); 2463 Observer.changedInstr(MI); 2464 return true; 2465 } 2466 2467 if (MI.getOpcode() != AMDGPU::G_LOAD) 2468 return false; 2469 2470 Register ValReg = MI.getOperand(0).getReg(); 2471 LLT ValTy = MRI.getType(ValReg); 2472 2473 MachineMemOperand *MMO = *MI.memoperands_begin(); 2474 const unsigned ValSize = ValTy.getSizeInBits(); 2475 const unsigned MemSize = 8 * MMO->getSize(); 2476 const Align MemAlign = MMO->getAlign(); 2477 const unsigned AlignInBits = 8 * MemAlign.value(); 2478 2479 // Widen non-power-of-2 loads to the alignment if needed 2480 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2481 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2482 2483 // This was already the correct extending load result type, so just adjust 2484 // the memory type. 2485 if (WideMemSize == ValSize) { 2486 MachineFunction &MF = B.getMF(); 2487 2488 MachineMemOperand *WideMMO = 2489 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2490 Observer.changingInstr(MI); 2491 MI.setMemRefs(MF, {WideMMO}); 2492 Observer.changedInstr(MI); 2493 return true; 2494 } 2495 2496 // Don't bother handling edge case that should probably never be produced. 2497 if (ValSize > WideMemSize) 2498 return false; 2499 2500 LLT WideTy = widenToNextPowerOf2(ValTy); 2501 2502 Register WideLoad; 2503 if (!WideTy.isVector()) { 2504 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2505 B.buildTrunc(ValReg, WideLoad).getReg(0); 2506 } else { 2507 // Extract the subvector. 2508 2509 if (isRegisterType(ValTy)) { 2510 // If this a case where G_EXTRACT is legal, use it. 2511 // (e.g. <3 x s32> -> <4 x s32>) 2512 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2513 B.buildExtract(ValReg, WideLoad, 0); 2514 } else { 2515 // For cases where the widened type isn't a nice register value, unmerge 2516 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2517 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2518 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2519 B.setInsertPt(B.getMBB(), MI.getIterator()); 2520 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2521 } 2522 } 2523 2524 MI.eraseFromParent(); 2525 return true; 2526 } 2527 2528 return false; 2529 } 2530 2531 bool AMDGPULegalizerInfo::legalizeFMad( 2532 MachineInstr &MI, MachineRegisterInfo &MRI, 2533 MachineIRBuilder &B) const { 2534 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2535 assert(Ty.isScalar()); 2536 2537 MachineFunction &MF = B.getMF(); 2538 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2539 2540 // TODO: Always legal with future ftz flag. 2541 // FIXME: Do we need just output? 2542 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2543 return true; 2544 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2545 return true; 2546 2547 MachineIRBuilder HelperBuilder(MI); 2548 GISelObserverWrapper DummyObserver; 2549 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2550 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2551 } 2552 2553 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2554 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2555 Register DstReg = MI.getOperand(0).getReg(); 2556 Register PtrReg = MI.getOperand(1).getReg(); 2557 Register CmpVal = MI.getOperand(2).getReg(); 2558 Register NewVal = MI.getOperand(3).getReg(); 2559 2560 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2561 "this should not have been custom lowered"); 2562 2563 LLT ValTy = MRI.getType(CmpVal); 2564 LLT VecTy = LLT::vector(2, ValTy); 2565 2566 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2567 2568 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2569 .addDef(DstReg) 2570 .addUse(PtrReg) 2571 .addUse(PackedVal) 2572 .setMemRefs(MI.memoperands()); 2573 2574 MI.eraseFromParent(); 2575 return true; 2576 } 2577 2578 bool AMDGPULegalizerInfo::legalizeFlog( 2579 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2580 Register Dst = MI.getOperand(0).getReg(); 2581 Register Src = MI.getOperand(1).getReg(); 2582 LLT Ty = B.getMRI()->getType(Dst); 2583 unsigned Flags = MI.getFlags(); 2584 2585 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2586 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2587 2588 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2589 MI.eraseFromParent(); 2590 return true; 2591 } 2592 2593 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2594 MachineIRBuilder &B) const { 2595 Register Dst = MI.getOperand(0).getReg(); 2596 Register Src = MI.getOperand(1).getReg(); 2597 unsigned Flags = MI.getFlags(); 2598 LLT Ty = B.getMRI()->getType(Dst); 2599 2600 auto K = B.buildFConstant(Ty, numbers::log2e); 2601 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2602 B.buildFExp2(Dst, Mul, Flags); 2603 MI.eraseFromParent(); 2604 return true; 2605 } 2606 2607 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2608 MachineIRBuilder &B) const { 2609 Register Dst = MI.getOperand(0).getReg(); 2610 Register Src0 = MI.getOperand(1).getReg(); 2611 Register Src1 = MI.getOperand(2).getReg(); 2612 unsigned Flags = MI.getFlags(); 2613 LLT Ty = B.getMRI()->getType(Dst); 2614 const LLT S16 = LLT::scalar(16); 2615 const LLT S32 = LLT::scalar(32); 2616 2617 if (Ty == S32) { 2618 auto Log = B.buildFLog2(S32, Src0, Flags); 2619 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2620 .addUse(Log.getReg(0)) 2621 .addUse(Src1) 2622 .setMIFlags(Flags); 2623 B.buildFExp2(Dst, Mul, Flags); 2624 } else if (Ty == S16) { 2625 // There's no f16 fmul_legacy, so we need to convert for it. 2626 auto Log = B.buildFLog2(S16, Src0, Flags); 2627 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2628 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2629 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2630 .addUse(Ext0.getReg(0)) 2631 .addUse(Ext1.getReg(0)) 2632 .setMIFlags(Flags); 2633 2634 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2635 } else 2636 return false; 2637 2638 MI.eraseFromParent(); 2639 return true; 2640 } 2641 2642 // Find a source register, ignoring any possible source modifiers. 2643 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2644 Register ModSrc = OrigSrc; 2645 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2646 ModSrc = SrcFNeg->getOperand(1).getReg(); 2647 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2648 ModSrc = SrcFAbs->getOperand(1).getReg(); 2649 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2650 ModSrc = SrcFAbs->getOperand(1).getReg(); 2651 return ModSrc; 2652 } 2653 2654 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2655 MachineRegisterInfo &MRI, 2656 MachineIRBuilder &B) const { 2657 2658 const LLT S1 = LLT::scalar(1); 2659 const LLT S64 = LLT::scalar(64); 2660 Register Dst = MI.getOperand(0).getReg(); 2661 Register OrigSrc = MI.getOperand(1).getReg(); 2662 unsigned Flags = MI.getFlags(); 2663 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2664 "this should not have been custom lowered"); 2665 2666 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2667 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2668 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2669 // V_FRACT bug is: 2670 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2671 // 2672 // Convert floor(x) to (x - fract(x)) 2673 2674 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2675 .addUse(OrigSrc) 2676 .setMIFlags(Flags); 2677 2678 // Give source modifier matching some assistance before obscuring a foldable 2679 // pattern. 2680 2681 // TODO: We can avoid the neg on the fract? The input sign to fract 2682 // shouldn't matter? 2683 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2684 2685 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2686 2687 Register Min = MRI.createGenericVirtualRegister(S64); 2688 2689 // We don't need to concern ourselves with the snan handling difference, so 2690 // use the one which will directly select. 
2691 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2692 if (MFI->getMode().IEEE) 2693 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2694 else 2695 B.buildFMinNum(Min, Fract, Const, Flags); 2696 2697 Register CorrectedFract = Min; 2698 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2699 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2700 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2701 } 2702 2703 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2704 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2705 2706 MI.eraseFromParent(); 2707 return true; 2708 } 2709 2710 // Turn an illegal packed v2s16 build vector into bit operations. 2711 // TODO: This should probably be a bitcast action in LegalizerHelper. 2712 bool AMDGPULegalizerInfo::legalizeBuildVector( 2713 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2714 Register Dst = MI.getOperand(0).getReg(); 2715 const LLT S32 = LLT::scalar(32); 2716 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2717 2718 Register Src0 = MI.getOperand(1).getReg(); 2719 Register Src1 = MI.getOperand(2).getReg(); 2720 assert(MRI.getType(Src0) == LLT::scalar(16)); 2721 2722 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2723 B.buildBitcast(Dst, Merge); 2724 2725 MI.eraseFromParent(); 2726 return true; 2727 } 2728 2729 // Check that this is a G_XOR x, -1 2730 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2731 if (MI.getOpcode() != TargetOpcode::G_XOR) 2732 return false; 2733 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2734 return ConstVal && *ConstVal == -1; 2735 } 2736 2737 // Return the use branch instruction, otherwise null if the usage is invalid. 2738 static MachineInstr * 2739 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2740 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2741 Register CondDef = MI.getOperand(0).getReg(); 2742 if (!MRI.hasOneNonDBGUse(CondDef)) 2743 return nullptr; 2744 2745 MachineBasicBlock *Parent = MI.getParent(); 2746 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2747 2748 if (isNot(MRI, *UseMI)) { 2749 Register NegatedCond = UseMI->getOperand(0).getReg(); 2750 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2751 return nullptr; 2752 2753 // We're deleting the def of this value, so we need to remove it. 2754 UseMI->eraseFromParent(); 2755 2756 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2757 Negated = true; 2758 } 2759 2760 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2761 return nullptr; 2762 2763 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2764 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2765 if (Next == Parent->end()) { 2766 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2767 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2768 return nullptr; 2769 UncondBrTarget = &*NextMBB; 2770 } else { 2771 if (Next->getOpcode() != AMDGPU::G_BR) 2772 return nullptr; 2773 Br = &*Next; 2774 UncondBrTarget = Br->getOperand(0).getMBB(); 2775 } 2776 2777 return UseMI; 2778 } 2779 2780 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2781 const ArgDescriptor *Arg, 2782 const TargetRegisterClass *ArgRC, 2783 LLT ArgTy) const { 2784 MCRegister SrcReg = Arg->getRegister(); 2785 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2786 assert(DstReg.isVirtual() && "Virtual register expected"); 2787 2788 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2789 ArgTy); 2790 if (Arg->isMasked()) { 2791 // TODO: Should we try to emit this once in the entry block? 2792 const LLT S32 = LLT::scalar(32); 2793 const unsigned Mask = Arg->getMask(); 2794 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2795 2796 Register AndMaskSrc = LiveIn; 2797 2798 if (Shift != 0) { 2799 auto ShiftAmt = B.buildConstant(S32, Shift); 2800 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2801 } 2802 2803 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2804 } else { 2805 B.buildCopy(DstReg, LiveIn); 2806 } 2807 2808 return true; 2809 } 2810 2811 bool AMDGPULegalizerInfo::loadInputValue( 2812 Register DstReg, MachineIRBuilder &B, 2813 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2814 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2815 const ArgDescriptor *Arg; 2816 const TargetRegisterClass *ArgRC; 2817 LLT ArgTy; 2818 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2819 2820 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2821 return false; // TODO: Handle these 2822 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2823 } 2824 2825 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2826 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2827 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2828 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2829 return false; 2830 2831 MI.eraseFromParent(); 2832 return true; 2833 } 2834 2835 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2836 MachineRegisterInfo &MRI, 2837 MachineIRBuilder &B) const { 2838 Register Dst = MI.getOperand(0).getReg(); 2839 LLT DstTy = MRI.getType(Dst); 2840 LLT S16 = LLT::scalar(16); 2841 LLT S32 = LLT::scalar(32); 2842 LLT S64 = LLT::scalar(64); 2843 2844 if (DstTy == S16) 2845 return legalizeFDIV16(MI, MRI, B); 2846 if (DstTy == S32) 2847 return legalizeFDIV32(MI, MRI, B); 2848 if (DstTy == S64) 2849 return legalizeFDIV64(MI, MRI, B); 2850 2851 return false; 2852 } 2853 2854 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2855 Register DstDivReg, 2856 Register DstRemReg, 2857 Register X, 2858 Register Y) const { 2859 const LLT S1 = LLT::scalar(1); 2860 const LLT S32 = LLT::scalar(32); 2861 2862 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2863 // algorithm used here. 2864 2865 // Initial estimate of inv(y). 2866 auto FloatY = B.buildUITOFP(S32, Y); 2867 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2868 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2869 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2870 auto Z = B.buildFPTOUI(S32, ScaledY); 2871 2872 // One round of UNR. 
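  // (Unsigned Newton-Raphson: with Z ~= 2^32/Y, the next three instructions
  // refine the estimate via Z += umulh(Z, 0 - Y * Z).)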
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
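  // Second refinement round of the ~2^64/Denom reciprocal estimate. The
  // 64-bit add is done in explicit 32-bit halves so the carry bits are
  // available for the correction steps below.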
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2971 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2972 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2973 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2974 2975 auto Zero32 = B.buildConstant(S32, 0); 2976 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2977 auto Add2_HiC = 2978 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2979 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2980 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2981 2982 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2983 Register NumerLo = UnmergeNumer.getReg(0); 2984 Register NumerHi = UnmergeNumer.getReg(1); 2985 2986 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2987 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2988 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2989 Register Mul3_Lo = UnmergeMul3.getReg(0); 2990 Register Mul3_Hi = UnmergeMul3.getReg(1); 2991 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2992 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2993 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2994 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2995 2996 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2997 Register DenomLo = UnmergeDenom.getReg(0); 2998 Register DenomHi = UnmergeDenom.getReg(1); 2999 3000 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3001 auto C1 = B.buildSExt(S32, CmpHi); 3002 3003 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3004 auto C2 = B.buildSExt(S32, CmpLo); 3005 3006 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3007 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3008 3009 // TODO: Here and below portions of the code can be enclosed into if/endif. 3010 // Currently control flow is unconditional and we have 4 selects after 3011 // potential endif to substitute PHIs. 3012 3013 // if C3 != 0 ... 
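  // Speculatively apply the first correction (remainder -= Denom,
  // quotient += 1); the corrected values are only kept by the selects at the
  // end when C3 is non-zero.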
3014 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3015 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3016 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3017 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3018 3019 auto One64 = B.buildConstant(S64, 1); 3020 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3021 3022 auto C4 = 3023 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3024 auto C5 = 3025 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3026 auto C6 = B.buildSelect( 3027 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3028 3029 // if (C6 != 0) 3030 auto Add4 = B.buildAdd(S64, Add3, One64); 3031 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3032 3033 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3034 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3035 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3036 3037 // endif C6 3038 // endif C3 3039 3040 if (DstDivReg) { 3041 auto Sel1 = B.buildSelect( 3042 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3043 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3044 Sel1, MulHi3); 3045 } 3046 3047 if (DstRemReg) { 3048 auto Sel2 = B.buildSelect( 3049 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3050 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3051 Sel2, Sub1); 3052 } 3053 } 3054 3055 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3056 MachineRegisterInfo &MRI, 3057 MachineIRBuilder &B) const { 3058 Register DstDivReg, DstRemReg; 3059 switch (MI.getOpcode()) { 3060 default: 3061 llvm_unreachable("Unexpected opcode!"); 3062 case AMDGPU::G_UDIV: { 3063 DstDivReg = MI.getOperand(0).getReg(); 3064 break; 3065 } 3066 case AMDGPU::G_UREM: { 3067 DstRemReg = MI.getOperand(0).getReg(); 3068 break; 3069 } 3070 case AMDGPU::G_UDIVREM: { 3071 DstDivReg = MI.getOperand(0).getReg(); 3072 DstRemReg = MI.getOperand(1).getReg(); 3073 break; 3074 } 3075 } 3076 3077 const LLT S64 = LLT::scalar(64); 3078 const LLT S32 = LLT::scalar(32); 3079 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3080 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3081 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3082 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3083 3084 if (Ty == S32) 3085 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3086 else if (Ty == S64) 3087 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3088 else 3089 return false; 3090 3091 MI.eraseFromParent(); 3092 return true; 3093 } 3094 3095 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3096 MachineRegisterInfo &MRI, 3097 MachineIRBuilder &B) const { 3098 const LLT S64 = LLT::scalar(64); 3099 const LLT S32 = LLT::scalar(32); 3100 3101 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3102 if (Ty != S32 && Ty != S64) 3103 return false; 3104 3105 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3106 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3107 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3108 3109 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3110 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3111 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3112 3113 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3114 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3115 3116 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3117 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3118 3119 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3120 switch (MI.getOpcode()) { 3121 default: 3122 llvm_unreachable("Unexpected opcode!"); 3123 case AMDGPU::G_SDIV: { 3124 DstDivReg = MI.getOperand(0).getReg(); 3125 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3126 break; 3127 } 3128 case AMDGPU::G_SREM: { 3129 DstRemReg = MI.getOperand(0).getReg(); 3130 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3131 break; 3132 } 3133 case AMDGPU::G_SDIVREM: { 3134 DstDivReg = MI.getOperand(0).getReg(); 3135 DstRemReg = MI.getOperand(1).getReg(); 3136 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3137 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3138 break; 3139 } 3140 } 3141 3142 if (Ty == S32) 3143 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3144 else 3145 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3146 3147 if (DstDivReg) { 3148 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3149 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3150 B.buildSub(DstDivReg, SignXor, Sign); 3151 } 3152 3153 if (DstRemReg) { 3154 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3155 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3156 B.buildSub(DstRemReg, SignXor, Sign); 3157 } 3158 3159 MI.eraseFromParent(); 3160 return true; 3161 } 3162 3163 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3164 MachineRegisterInfo &MRI, 3165 MachineIRBuilder &B) const { 3166 Register Res = MI.getOperand(0).getReg(); 3167 Register LHS = MI.getOperand(1).getReg(); 3168 Register RHS = MI.getOperand(2).getReg(); 3169 uint16_t Flags = MI.getFlags(); 3170 LLT ResTy = MRI.getType(Res); 3171 3172 const MachineFunction &MF = B.getMF(); 3173 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3174 MI.getFlag(MachineInstr::FmAfn); 3175 3176 if (!AllowInaccurateRcp) 3177 return false; 3178 3179 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3180 // 1 / x -> RCP(x) 3181 if (CLHS->isExactlyValue(1.0)) { 3182 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3183 .addUse(RHS) 3184 .setMIFlags(Flags); 3185 3186 MI.eraseFromParent(); 3187 return true; 3188 } 3189 3190 // -1 / x -> RCP( FNEG(x) ) 3191 if (CLHS->isExactlyValue(-1.0)) { 3192 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3193 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3194 .addUse(FNeg.getReg(0)) 3195 .setMIFlags(Flags); 3196 3197 MI.eraseFromParent(); 3198 return true; 3199 } 3200 } 3201 3202 // x / y -> x * (1.0 / y) 3203 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3204 .addUse(RHS) 3205 .setMIFlags(Flags); 3206 B.buildFMul(Res, LHS, RCP, Flags); 3207 3208 MI.eraseFromParent(); 3209 return true; 3210 } 3211 3212 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3213 MachineRegisterInfo &MRI, 3214 MachineIRBuilder &B) const { 3215 Register Res = MI.getOperand(0).getReg(); 3216 Register X = MI.getOperand(1).getReg(); 3217 Register Y = MI.getOperand(2).getReg(); 3218 uint16_t Flags = MI.getFlags(); 3219 LLT ResTy = MRI.getType(Res); 3220 3221 const MachineFunction &MF = B.getMF(); 3222 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3223 MI.getFlag(MachineInstr::FmAfn); 3224 3225 if (!AllowInaccurateRcp) 3226 return false; 3227 3228 auto NegY = B.buildFNeg(ResTy, Y); 3229 auto One = B.buildFConstant(ResTy, 1.0); 3230 3231 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3232 .addUse(Y) 3233 .setMIFlags(Flags); 3234 3235 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3236 R = B.buildFMA(ResTy, Tmp0, R, R); 3237 3238 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3239 R = B.buildFMA(ResTy, Tmp1, R, R); 3240 3241 auto Ret = B.buildFMul(ResTy, X, R); 3242 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3243 3244 B.buildFMA(Res, Tmp2, R, Ret); 3245 MI.eraseFromParent(); 3246 return true; 3247 } 3248 3249 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3250 MachineRegisterInfo &MRI, 3251 MachineIRBuilder &B) const { 3252 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3253 return true; 3254 3255 Register Res = MI.getOperand(0).getReg(); 3256 Register LHS = MI.getOperand(1).getReg(); 3257 Register RHS = MI.getOperand(2).getReg(); 3258 3259 uint16_t Flags = MI.getFlags(); 3260 3261 LLT S16 = LLT::scalar(16); 3262 LLT S32 = LLT::scalar(32); 3263 3264 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3265 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3266 3267 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3268 .addUse(RHSExt.getReg(0)) 3269 .setMIFlags(Flags); 3270 3271 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3272 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3273 3274 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3275 .addUse(RDst.getReg(0)) 3276 .addUse(RHS) 3277 .addUse(LHS) 3278 .setMIFlags(Flags); 3279 3280 MI.eraseFromParent(); 3281 return true; 3282 } 3283 3284 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3285 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3286 static void toggleSPDenormMode(bool Enable, 3287 MachineIRBuilder &B, 3288 const GCNSubtarget &ST, 3289 AMDGPU::SIModeRegisterDefaults Mode) { 3290 // Set SP denorm mode to this value. 3291 unsigned SPDenormMode = 3292 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3293 3294 if (ST.hasDenormModeInst()) { 3295 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3296 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3297 3298 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3299 B.buildInstr(AMDGPU::S_DENORM_MODE) 3300 .addImm(NewDenormModeValue); 3301 3302 } else { 3303 // Select FP32 bit field in mode register. 
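    // This encodes hwreg(HW_REG_MODE, 4, 2): offset 4, width 2, which covers
    // the two FP32 denorm control bits of the MODE register.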
3304 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3305 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3306 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3307 3308 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3309 .addImm(SPDenormMode) 3310 .addImm(SPDenormModeBitField); 3311 } 3312 } 3313 3314 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3315 MachineRegisterInfo &MRI, 3316 MachineIRBuilder &B) const { 3317 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3318 return true; 3319 3320 Register Res = MI.getOperand(0).getReg(); 3321 Register LHS = MI.getOperand(1).getReg(); 3322 Register RHS = MI.getOperand(2).getReg(); 3323 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3324 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3325 3326 uint16_t Flags = MI.getFlags(); 3327 3328 LLT S32 = LLT::scalar(32); 3329 LLT S1 = LLT::scalar(1); 3330 3331 auto One = B.buildFConstant(S32, 1.0f); 3332 3333 auto DenominatorScaled = 3334 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3335 .addUse(LHS) 3336 .addUse(RHS) 3337 .addImm(0) 3338 .setMIFlags(Flags); 3339 auto NumeratorScaled = 3340 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3341 .addUse(LHS) 3342 .addUse(RHS) 3343 .addImm(1) 3344 .setMIFlags(Flags); 3345 3346 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3347 .addUse(DenominatorScaled.getReg(0)) 3348 .setMIFlags(Flags); 3349 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3350 3351 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3352 // aren't modeled as reading it. 3353 if (!Mode.allFP32Denormals()) 3354 toggleSPDenormMode(true, B, ST, Mode); 3355 3356 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3357 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3358 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3359 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3360 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3361 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3362 3363 if (!Mode.allFP32Denormals()) 3364 toggleSPDenormMode(false, B, ST, Mode); 3365 3366 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3367 .addUse(Fma4.getReg(0)) 3368 .addUse(Fma1.getReg(0)) 3369 .addUse(Fma3.getReg(0)) 3370 .addUse(NumeratorScaled.getReg(1)) 3371 .setMIFlags(Flags); 3372 3373 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3374 .addUse(Fmas.getReg(0)) 3375 .addUse(RHS) 3376 .addUse(LHS) 3377 .setMIFlags(Flags); 3378 3379 MI.eraseFromParent(); 3380 return true; 3381 } 3382 3383 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3384 MachineRegisterInfo &MRI, 3385 MachineIRBuilder &B) const { 3386 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3387 return true; 3388 3389 Register Res = MI.getOperand(0).getReg(); 3390 Register LHS = MI.getOperand(1).getReg(); 3391 Register RHS = MI.getOperand(2).getReg(); 3392 3393 uint16_t Flags = MI.getFlags(); 3394 3395 LLT S64 = LLT::scalar(64); 3396 LLT S1 = LLT::scalar(1); 3397 3398 auto One = B.buildFConstant(S64, 1.0); 3399 3400 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3401 .addUse(LHS) 3402 .addUse(RHS) 3403 .addImm(0) 3404 .setMIFlags(Flags); 3405 3406 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3407 3408 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3409 .addUse(DivScale0.getReg(0)) 3410 .setMIFlags(Flags); 3411 3412 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3413 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3414 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3415 3416 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3417 .addUse(LHS) 3418 .addUse(RHS) 3419 .addImm(1) 3420 .setMIFlags(Flags); 3421 3422 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3423 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3424 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3425 3426 Register Scale; 3427 if (!ST.hasUsableDivScaleConditionOutput()) { 3428 // Workaround a hardware bug on SI where the condition output from div_scale 3429 // is not usable. 3430 3431 LLT S32 = LLT::scalar(32); 3432 3433 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3434 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3435 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3436 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3437 3438 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3439 Scale1Unmerge.getReg(1)); 3440 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3441 Scale0Unmerge.getReg(1)); 3442 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3443 } else { 3444 Scale = DivScale1.getReg(1); 3445 } 3446 3447 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3448 .addUse(Fma4.getReg(0)) 3449 .addUse(Fma3.getReg(0)) 3450 .addUse(Mul.getReg(0)) 3451 .addUse(Scale) 3452 .setMIFlags(Flags); 3453 3454 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3455 .addUse(Fmas.getReg(0)) 3456 .addUse(RHS) 3457 .addUse(LHS) 3458 .setMIFlags(Flags); 3459 3460 MI.eraseFromParent(); 3461 return true; 3462 } 3463 3464 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3465 MachineRegisterInfo &MRI, 3466 MachineIRBuilder &B) const { 3467 Register Res = MI.getOperand(0).getReg(); 3468 Register LHS = MI.getOperand(2).getReg(); 3469 Register RHS = MI.getOperand(3).getReg(); 3470 uint16_t Flags = MI.getFlags(); 3471 3472 LLT S32 = LLT::scalar(32); 3473 LLT S1 = LLT::scalar(1); 3474 3475 auto Abs = B.buildFAbs(S32, RHS, Flags); 3476 const APFloat C0Val(1.0f); 3477 3478 auto C0 = B.buildConstant(S32, 0x6f800000); 3479 auto C1 = B.buildConstant(S32, 0x2f800000); 3480 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3481 3482 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3483 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3484 3485 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3486 3487 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3488 .addUse(Mul0.getReg(0)) 3489 .setMIFlags(Flags); 3490 3491 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3492 3493 B.buildFMul(Res, Sel, Mul1, Flags); 3494 3495 MI.eraseFromParent(); 3496 return true; 3497 } 3498 3499 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3500 // FIXME: Why do we handle this one but not other removed instructions? 3501 // 3502 // Reciprocal square root. The clamp prevents infinite results, clamping 3503 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3504 // +-max_float. 
3505 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3506 MachineRegisterInfo &MRI, 3507 MachineIRBuilder &B) const { 3508 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3509 return true; 3510 3511 Register Dst = MI.getOperand(0).getReg(); 3512 Register Src = MI.getOperand(2).getReg(); 3513 auto Flags = MI.getFlags(); 3514 3515 LLT Ty = MRI.getType(Dst); 3516 3517 const fltSemantics *FltSemantics; 3518 if (Ty == LLT::scalar(32)) 3519 FltSemantics = &APFloat::IEEEsingle(); 3520 else if (Ty == LLT::scalar(64)) 3521 FltSemantics = &APFloat::IEEEdouble(); 3522 else 3523 return false; 3524 3525 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3526 .addUse(Src) 3527 .setMIFlags(Flags); 3528 3529 // We don't need to concern ourselves with the snan handling difference, since 3530 // the rsq quieted (or not) so use the one which will directly select. 3531 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3532 const bool UseIEEE = MFI->getMode().IEEE; 3533 3534 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3535 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3536 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3537 3538 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3539 3540 if (UseIEEE) 3541 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3542 else 3543 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3544 MI.eraseFromParent(); 3545 return true; 3546 } 3547 3548 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3549 switch (IID) { 3550 case Intrinsic::amdgcn_ds_fadd: 3551 return AMDGPU::G_ATOMICRMW_FADD; 3552 case Intrinsic::amdgcn_ds_fmin: 3553 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3554 case Intrinsic::amdgcn_ds_fmax: 3555 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3556 default: 3557 llvm_unreachable("not a DS FP intrinsic"); 3558 } 3559 } 3560 3561 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3562 MachineInstr &MI, 3563 Intrinsic::ID IID) const { 3564 GISelChangeObserver &Observer = Helper.Observer; 3565 Observer.changingInstr(MI); 3566 3567 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3568 3569 // The remaining operands were used to set fields in the MemOperand on 3570 // construction. 3571 for (int I = 6; I > 3; --I) 3572 MI.RemoveOperand(I); 3573 3574 MI.RemoveOperand(1); // Remove the intrinsic ID. 
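  // What remains is the (pointer, value) operand form expected by the generic
  // atomic opcodes; the memory operand already attached to the instruction is
  // carried over unchanged.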
3575 Observer.changedInstr(MI); 3576 return true; 3577 } 3578 3579 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3580 MachineRegisterInfo &MRI, 3581 MachineIRBuilder &B) const { 3582 uint64_t Offset = 3583 ST.getTargetLowering()->getImplicitParameterOffset( 3584 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3585 LLT DstTy = MRI.getType(DstReg); 3586 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3587 3588 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3589 if (!loadInputValue(KernargPtrReg, B, 3590 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3591 return false; 3592 3593 // FIXME: This should be nuw 3594 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3595 return true; 3596 } 3597 3598 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3599 MachineRegisterInfo &MRI, 3600 MachineIRBuilder &B) const { 3601 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3602 if (!MFI->isEntryFunction()) { 3603 return legalizePreloadedArgIntrin(MI, MRI, B, 3604 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3605 } 3606 3607 Register DstReg = MI.getOperand(0).getReg(); 3608 if (!getImplicitArgPtr(DstReg, MRI, B)) 3609 return false; 3610 3611 MI.eraseFromParent(); 3612 return true; 3613 } 3614 3615 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3616 MachineRegisterInfo &MRI, 3617 MachineIRBuilder &B, 3618 unsigned AddrSpace) const { 3619 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3620 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3621 Register Hi32 = Unmerge.getReg(1); 3622 3623 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3624 MI.eraseFromParent(); 3625 return true; 3626 } 3627 3628 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3629 // offset (the offset that is included in bounds checking and swizzling, to be 3630 // split between the instruction's voffset and immoffset fields) and soffset 3631 // (the offset that is excluded from bounds checking and swizzling, to go in 3632 // the instruction's soffset field). This function takes the first kind of 3633 // offset and figures out how to split it between voffset and immoffset. 3634 std::tuple<Register, unsigned, unsigned> 3635 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3636 Register OrigOffset) const { 3637 const unsigned MaxImm = 4095; 3638 Register BaseReg; 3639 unsigned TotalConstOffset; 3640 const LLT S32 = LLT::scalar(32); 3641 MachineRegisterInfo &MRI = *B.getMRI(); 3642 3643 std::tie(BaseReg, TotalConstOffset) = 3644 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3645 3646 unsigned ImmOffset = TotalConstOffset; 3647 3648 // If BaseReg is a pointer, convert it to int. 3649 if (MRI.getType(BaseReg).isPointer()) 3650 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3651 3652 // If the immediate value is too big for the immoffset field, put the value 3653 // and -4096 into the immoffset field so that the value that is copied/added 3654 // for the voffset field is a multiple of 4096, and it stands more chance 3655 // of being CSEd with the copy/add for another similar load/store. 3656 // However, do not do that rounding down to a multiple of 4096 if that is a 3657 // negative number, as it appears to be illegal to have a negative offset 3658 // in the vgpr, even if adding the immediate offset makes it positive. 
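  // For example, a constant offset of 8200 becomes ImmOffset = 8 with the
  // remaining 8192 folded into the voffset add below, while a negative
  // constant such as -100 stays entirely in the voffset with ImmOffset = 0.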
3659 unsigned Overflow = ImmOffset & ~MaxImm; 3660 ImmOffset -= Overflow; 3661 if ((int32_t)Overflow < 0) { 3662 Overflow += ImmOffset; 3663 ImmOffset = 0; 3664 } 3665 3666 if (Overflow != 0) { 3667 if (!BaseReg) { 3668 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3669 } else { 3670 auto OverflowVal = B.buildConstant(S32, Overflow); 3671 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3672 } 3673 } 3674 3675 if (!BaseReg) 3676 BaseReg = B.buildConstant(S32, 0).getReg(0); 3677 3678 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3679 } 3680 3681 /// Handle register layout difference for f16 images for some subtargets. 3682 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3683 MachineRegisterInfo &MRI, 3684 Register Reg, 3685 bool ImageStore) const { 3686 const LLT S16 = LLT::scalar(16); 3687 const LLT S32 = LLT::scalar(32); 3688 LLT StoreVT = MRI.getType(Reg); 3689 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3690 3691 if (ST.hasUnpackedD16VMem()) { 3692 auto Unmerge = B.buildUnmerge(S16, Reg); 3693 3694 SmallVector<Register, 4> WideRegs; 3695 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3696 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3697 3698 int NumElts = StoreVT.getNumElements(); 3699 3700 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3701 } 3702 3703 if (ImageStore && ST.hasImageStoreD16Bug()) { 3704 if (StoreVT.getNumElements() == 2) { 3705 SmallVector<Register, 4> PackedRegs; 3706 Reg = B.buildBitcast(S32, Reg).getReg(0); 3707 PackedRegs.push_back(Reg); 3708 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3709 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3710 } 3711 3712 if (StoreVT.getNumElements() == 3) { 3713 SmallVector<Register, 4> PackedRegs; 3714 auto Unmerge = B.buildUnmerge(S16, Reg); 3715 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3716 PackedRegs.push_back(Unmerge.getReg(I)); 3717 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3718 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3719 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3720 } 3721 3722 if (StoreVT.getNumElements() == 4) { 3723 SmallVector<Register, 4> PackedRegs; 3724 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3725 auto Unmerge = B.buildUnmerge(S32, Reg); 3726 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3727 PackedRegs.push_back(Unmerge.getReg(I)); 3728 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3729 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3730 } 3731 3732 llvm_unreachable("invalid data type"); 3733 } 3734 3735 return Reg; 3736 } 3737 3738 Register AMDGPULegalizerInfo::fixStoreSourceType( 3739 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3740 MachineRegisterInfo *MRI = B.getMRI(); 3741 LLT Ty = MRI->getType(VData); 3742 3743 const LLT S16 = LLT::scalar(16); 3744 3745 // Fixup illegal register types for i8 stores. 
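  // The value is widened to 32 bits here; the number of bytes actually written
  // is still taken from the memory operand when the store opcode is chosen.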
3746 if (Ty == LLT::scalar(8) || Ty == S16) { 3747 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3748 return AnyExt; 3749 } 3750 3751 if (Ty.isVector()) { 3752 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3753 if (IsFormat) 3754 return handleD16VData(B, *MRI, VData); 3755 } 3756 } 3757 3758 return VData; 3759 } 3760 3761 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3762 MachineRegisterInfo &MRI, 3763 MachineIRBuilder &B, 3764 bool IsTyped, 3765 bool IsFormat) const { 3766 Register VData = MI.getOperand(1).getReg(); 3767 LLT Ty = MRI.getType(VData); 3768 LLT EltTy = Ty.getScalarType(); 3769 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3770 const LLT S32 = LLT::scalar(32); 3771 3772 VData = fixStoreSourceType(B, VData, IsFormat); 3773 Register RSrc = MI.getOperand(2).getReg(); 3774 3775 MachineMemOperand *MMO = *MI.memoperands_begin(); 3776 const int MemSize = MMO->getSize(); 3777 3778 unsigned ImmOffset; 3779 unsigned TotalOffset; 3780 3781 // The typed intrinsics add an immediate after the registers. 3782 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3783 3784 // The struct intrinsic variants add one additional operand over raw. 3785 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3786 Register VIndex; 3787 int OpOffset = 0; 3788 if (HasVIndex) { 3789 VIndex = MI.getOperand(3).getReg(); 3790 OpOffset = 1; 3791 } 3792 3793 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3794 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3795 3796 unsigned Format = 0; 3797 if (IsTyped) { 3798 Format = MI.getOperand(5 + OpOffset).getImm(); 3799 ++OpOffset; 3800 } 3801 3802 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3803 3804 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3805 if (TotalOffset != 0) 3806 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3807 3808 unsigned Opc; 3809 if (IsTyped) { 3810 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3811 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3812 } else if (IsFormat) { 3813 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3814 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3815 } else { 3816 switch (MemSize) { 3817 case 1: 3818 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3819 break; 3820 case 2: 3821 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3822 break; 3823 default: 3824 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3825 break; 3826 } 3827 } 3828 3829 if (!VIndex) 3830 VIndex = B.buildConstant(S32, 0).getReg(0); 3831 3832 auto MIB = B.buildInstr(Opc) 3833 .addUse(VData) // vdata 3834 .addUse(RSrc) // rsrc 3835 .addUse(VIndex) // vindex 3836 .addUse(VOffset) // voffset 3837 .addUse(SOffset) // soffset 3838 .addImm(ImmOffset); // offset(imm) 3839 3840 if (IsTyped) 3841 MIB.addImm(Format); 3842 3843 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3844 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3845 .addMemOperand(MMO); 3846 3847 MI.eraseFromParent(); 3848 return true; 3849 } 3850 3851 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3852 MachineRegisterInfo &MRI, 3853 MachineIRBuilder &B, 3854 bool IsFormat, 3855 bool IsTyped) const { 3856 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
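  // As with the store path, the load is rewritten in place to one of the
  // target buffer-load pseudos, with the constant part of the offset split out
  // into the instruction's immediate offset field.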
3857 MachineMemOperand *MMO = *MI.memoperands_begin(); 3858 const int MemSize = MMO->getSize(); 3859 const LLT S32 = LLT::scalar(32); 3860 3861 Register Dst = MI.getOperand(0).getReg(); 3862 Register RSrc = MI.getOperand(2).getReg(); 3863 3864 // The typed intrinsics add an immediate after the registers. 3865 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3866 3867 // The struct intrinsic variants add one additional operand over raw. 3868 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3869 Register VIndex; 3870 int OpOffset = 0; 3871 if (HasVIndex) { 3872 VIndex = MI.getOperand(3).getReg(); 3873 OpOffset = 1; 3874 } 3875 3876 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3877 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3878 3879 unsigned Format = 0; 3880 if (IsTyped) { 3881 Format = MI.getOperand(5 + OpOffset).getImm(); 3882 ++OpOffset; 3883 } 3884 3885 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3886 unsigned ImmOffset; 3887 unsigned TotalOffset; 3888 3889 LLT Ty = MRI.getType(Dst); 3890 LLT EltTy = Ty.getScalarType(); 3891 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3892 const bool Unpacked = ST.hasUnpackedD16VMem(); 3893 3894 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3895 if (TotalOffset != 0) 3896 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3897 3898 unsigned Opc; 3899 3900 if (IsTyped) { 3901 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3902 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3903 } else if (IsFormat) { 3904 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3905 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3906 } else { 3907 switch (MemSize) { 3908 case 1: 3909 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3910 break; 3911 case 2: 3912 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3913 break; 3914 default: 3915 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3916 break; 3917 } 3918 } 3919 3920 Register LoadDstReg; 3921 3922 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3923 LLT UnpackedTy = Ty.changeElementSize(32); 3924 3925 if (IsExtLoad) 3926 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3927 else if (Unpacked && IsD16 && Ty.isVector()) 3928 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3929 else 3930 LoadDstReg = Dst; 3931 3932 if (!VIndex) 3933 VIndex = B.buildConstant(S32, 0).getReg(0); 3934 3935 auto MIB = B.buildInstr(Opc) 3936 .addDef(LoadDstReg) // vdata 3937 .addUse(RSrc) // rsrc 3938 .addUse(VIndex) // vindex 3939 .addUse(VOffset) // voffset 3940 .addUse(SOffset) // soffset 3941 .addImm(ImmOffset); // offset(imm) 3942 3943 if (IsTyped) 3944 MIB.addImm(Format); 3945 3946 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3947 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3948 .addMemOperand(MMO); 3949 3950 if (LoadDstReg != Dst) { 3951 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3952 3953 // Widen result for extending loads was widened. 
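    // Extending (sub-dword) loads only need a truncate of the widened s32
    // result; unpacked d16 vectors are truncated element-wise and re-merged
    // below.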
3954 if (IsExtLoad) 3955 B.buildTrunc(Dst, LoadDstReg); 3956 else { 3957 // Repack to original 16-bit vector result 3958 // FIXME: G_TRUNC should work, but legalization currently fails 3959 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3960 SmallVector<Register, 4> Repack; 3961 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3962 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3963 B.buildMerge(Dst, Repack); 3964 } 3965 } 3966 3967 MI.eraseFromParent(); 3968 return true; 3969 } 3970 3971 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3972 MachineIRBuilder &B, 3973 bool IsInc) const { 3974 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3975 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3976 B.buildInstr(Opc) 3977 .addDef(MI.getOperand(0).getReg()) 3978 .addUse(MI.getOperand(2).getReg()) 3979 .addUse(MI.getOperand(3).getReg()) 3980 .cloneMemRefs(MI); 3981 MI.eraseFromParent(); 3982 return true; 3983 } 3984 3985 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3986 switch (IntrID) { 3987 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3988 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3990 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3991 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3992 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3993 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3994 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3995 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3996 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3997 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3998 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3999 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4000 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4001 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4002 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4003 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4004 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4005 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4006 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4007 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4008 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4009 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4010 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4011 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4012 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4013 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4014 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4015 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4016 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4017 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4018 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4019 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4020 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4021 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4022 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4023 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4024 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4025 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4026 case Intrinsic::amdgcn_buffer_atomic_fadd: 4027 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4028 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4029 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4030 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4031 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4032 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4033 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4034 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4035 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4036 default: 4037 llvm_unreachable("unhandled atomic opcode"); 4038 } 4039 } 4040 4041 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4042 MachineIRBuilder &B, 4043 Intrinsic::ID IID) const { 4044 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4045 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4046 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4047 4048 Register Dst; 4049 4050 int OpOffset = 0; 4051 if (HasReturn) { 4052 // A few FP atomics do not support return values. 4053 Dst = MI.getOperand(0).getReg(); 4054 } else { 4055 OpOffset = -1; 4056 } 4057 4058 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4059 Register CmpVal; 4060 4061 if (IsCmpSwap) { 4062 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4063 ++OpOffset; 4064 } 4065 4066 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4067 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4068 4069 // The struct intrinsic variants add one additional operand over raw. 4070 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4071 Register VIndex; 4072 if (HasVIndex) { 4073 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4074 ++OpOffset; 4075 } 4076 4077 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4078 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4079 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4080 4081 MachineMemOperand *MMO = *MI.memoperands_begin(); 4082 4083 unsigned ImmOffset; 4084 unsigned TotalOffset; 4085 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 4086 if (TotalOffset != 0) 4087 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 4088 4089 if (!VIndex) 4090 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4091 4092 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4093 4094 if (HasReturn) 4095 MIB.addDef(Dst); 4096 4097 MIB.addUse(VData); // vdata 4098 4099 if (IsCmpSwap) 4100 MIB.addReg(CmpVal); 4101 4102 MIB.addUse(RSrc) // rsrc 4103 .addUse(VIndex) // vindex 4104 .addUse(VOffset) // voffset 4105 .addUse(SOffset) // soffset 4106 .addImm(ImmOffset) // offset(imm) 4107 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4108 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4109 .addMemOperand(MMO); 4110 4111 MI.eraseFromParent(); 4112 return true; 4113 } 4114 4115 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4116 /// vector with s16 typed elements. 4117 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4118 SmallVectorImpl<Register> &PackedAddrs, 4119 unsigned ArgOffset, 4120 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4121 bool IsA16, bool IsG16) { 4122 const LLT S16 = LLT::scalar(16); 4123 const LLT V2S16 = LLT::vector(2, 16); 4124 auto EndIdx = Intr->VAddrEnd; 4125 4126 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4127 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4128 if (!SrcOp.isReg()) 4129 continue; // _L to _LZ may have eliminated this. 
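    // Operands before the gradients are already dword sized and are simply
    // reinterpreted as <2 x s16>; 16-bit gradient and coordinate operands are
    // paired two to a dword below, with any odd leftover padded with undef.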

    Register AddrReg = SrcOp.getReg();

    if (I < Intr->GradientStart) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
               (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
4215 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4216 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4217 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4218 4219 const unsigned NumDefs = MI.getNumExplicitDefs(); 4220 const unsigned ArgOffset = NumDefs + 1; 4221 bool IsTFE = NumDefs == 2; 4222 // We are only processing the operands of d16 image operations on subtargets 4223 // that use the unpacked register layout, or need to repack the TFE result. 4224 4225 // TODO: Do we need to guard against already legalized intrinsics? 4226 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4227 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4228 4229 MachineRegisterInfo *MRI = B.getMRI(); 4230 const LLT S32 = LLT::scalar(32); 4231 const LLT S16 = LLT::scalar(16); 4232 const LLT V2S16 = LLT::vector(2, 16); 4233 4234 unsigned DMask = 0; 4235 4236 // Check for 16 bit addresses and pack if true. 4237 LLT GradTy = 4238 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4239 LLT AddrTy = 4240 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4241 const bool IsG16 = GradTy == S16; 4242 const bool IsA16 = AddrTy == S16; 4243 4244 int DMaskLanes = 0; 4245 if (!BaseOpcode->Atomic) { 4246 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4247 if (BaseOpcode->Gather4) { 4248 DMaskLanes = 4; 4249 } else if (DMask != 0) { 4250 DMaskLanes = countPopulation(DMask); 4251 } else if (!IsTFE && !BaseOpcode->Store) { 4252 // If dmask is 0, this is a no-op load. This can be eliminated. 4253 B.buildUndef(MI.getOperand(0)); 4254 MI.eraseFromParent(); 4255 return true; 4256 } 4257 } 4258 4259 Observer.changingInstr(MI); 4260 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4261 4262 unsigned NewOpcode = NumDefs == 0 ? 4263 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4264 4265 // Track that we legalized this 4266 MI.setDesc(B.getTII().get(NewOpcode)); 4267 4268 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4269 // dmask to be at least 1 otherwise the instruction will fail 4270 if (IsTFE && DMask == 0) { 4271 DMask = 0x1; 4272 DMaskLanes = 1; 4273 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4274 } 4275 4276 if (BaseOpcode->Atomic) { 4277 Register VData0 = MI.getOperand(2).getReg(); 4278 LLT Ty = MRI->getType(VData0); 4279 4280 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4281 if (Ty.isVector()) 4282 return false; 4283 4284 if (BaseOpcode->AtomicX2) { 4285 Register VData1 = MI.getOperand(3).getReg(); 4286 // The two values are packed in one register. 4287 LLT PackedTy = LLT::vector(2, Ty); 4288 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4289 MI.getOperand(2).setReg(Concat.getReg(0)); 4290 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4291 } 4292 } 4293 4294 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4295 4296 // Optimize _L to _LZ when _L is zero 4297 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4298 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4299 const ConstantFP *ConstantLod; 4300 4301 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4302 m_GFCst(ConstantLod))) { 4303 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4304 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 
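        // The lookup below finds the _lz intrinsic with the same dimension, so
        // only the now-redundant lod operand needs to be dropped.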
4305 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4306 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4307 Intr->Dim); 4308 4309 // The starting indexes should remain in the same place. 4310 --CorrectedNumVAddrs; 4311 4312 MI.getOperand(MI.getNumExplicitDefs()) 4313 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4314 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4315 Intr = NewImageDimIntr; 4316 } 4317 } 4318 } 4319 4320 // Optimize _mip away, when 'lod' is zero 4321 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4322 int64_t ConstantLod; 4323 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4324 m_ICst(ConstantLod))) { 4325 if (ConstantLod == 0) { 4326 // TODO: Change intrinsic opcode and remove operand instead or replacing 4327 // it with 0, as the _L to _LZ handling is done above. 4328 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4329 --CorrectedNumVAddrs; 4330 } 4331 } 4332 } 4333 4334 // Rewrite the addressing register layout before doing anything else. 4335 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4336 // 16 bit gradients are supported, but are tied to the A16 control 4337 // so both gradients and addresses must be 16 bit 4338 return false; 4339 } 4340 4341 if (IsA16 && !ST.hasA16()) { 4342 // A16 not supported 4343 return false; 4344 } 4345 4346 if (IsA16 || IsG16) { 4347 if (Intr->NumVAddrs > 1) { 4348 SmallVector<Register, 4> PackedRegs; 4349 4350 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4351 IsG16); 4352 4353 // See also below in the non-a16 branch 4354 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4355 4356 if (!UseNSA && PackedRegs.size() > 1) { 4357 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4358 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4359 PackedRegs[0] = Concat.getReg(0); 4360 PackedRegs.resize(1); 4361 } 4362 4363 const unsigned NumPacked = PackedRegs.size(); 4364 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4365 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4366 if (!SrcOp.isReg()) { 4367 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4368 continue; 4369 } 4370 4371 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4372 4373 if (I - Intr->VAddrStart < NumPacked) 4374 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4375 else 4376 SrcOp.setReg(AMDGPU::NoRegister); 4377 } 4378 } 4379 } else { 4380 // If the register allocator cannot place the address registers contiguously 4381 // without introducing moves, then using the non-sequential address encoding 4382 // is always preferable, since it saves VALU instructions and is usually a 4383 // wash in terms of code size or even better. 4384 // 4385 // However, we currently have no way of hinting to the register allocator 4386 // that MIMG addresses should be placed contiguously when it is possible to 4387 // do so, so force non-NSA for the common 2-address case as a heuristic. 4388 // 4389 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4390 // allocation when possible. 4391 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4392 4393 if (!UseNSA && Intr->NumVAddrs > 1) 4394 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4395 Intr->NumVAddrs); 4396 } 4397 4398 int Flags = 0; 4399 if (IsA16) 4400 Flags |= 1; 4401 if (IsG16) 4402 Flags |= 2; 4403 MI.addOperand(MachineOperand::CreateImm(Flags)); 4404 4405 if (BaseOpcode->Store) { // No TFE for stores? 
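    // Only d16 store data may need repacking into the register layout the
    // subtarget expects; everything else passes through unchanged.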
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus the TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4504 4505 if (ResultNumRegs == 1) { 4506 assert(!IsTFE); 4507 ResultRegs[0] = NewResultReg; 4508 } else { 4509 // We have to repack into a new vector of some kind. 4510 for (int I = 0; I != NumDataRegs; ++I) 4511 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4512 B.buildUnmerge(ResultRegs, NewResultReg); 4513 4514 // Drop the final TFE element to get the data part. The TFE result is 4515 // directly written to the right place already. 4516 if (IsTFE) 4517 ResultRegs.resize(NumDataRegs); 4518 } 4519 4520 // For an s16 scalar result, we form an s32 result with a truncate regardless 4521 // of packed vs. unpacked. 4522 if (IsD16 && !Ty.isVector()) { 4523 B.buildTrunc(DstReg, ResultRegs[0]); 4524 return true; 4525 } 4526 4527 // Avoid a build/concat_vector of 1 entry. 4528 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4529 B.buildBitcast(DstReg, ResultRegs[0]); 4530 return true; 4531 } 4532 4533 assert(Ty.isVector()); 4534 4535 if (IsD16) { 4536 // For packed D16 results with TFE enabled, all the data components are 4537 // S32. Cast back to the expected type. 4538 // 4539 // TODO: We don't really need to use load s32 elements. We would only need one 4540 // cast for the TFE result if a multiple of v2s16 was used. 4541 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4542 for (Register &Reg : ResultRegs) 4543 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4544 } else if (ST.hasUnpackedD16VMem()) { 4545 for (Register &Reg : ResultRegs) 4546 Reg = B.buildTrunc(S16, Reg).getReg(0); 4547 } 4548 } 4549 4550 auto padWithUndef = [&](LLT Ty, int NumElts) { 4551 if (NumElts == 0) 4552 return; 4553 Register Undef = B.buildUndef(Ty).getReg(0); 4554 for (int I = 0; I != NumElts; ++I) 4555 ResultRegs.push_back(Undef); 4556 }; 4557 4558 // Pad out any elements eliminated due to the dmask. 4559 LLT ResTy = MRI->getType(ResultRegs[0]); 4560 if (!ResTy.isVector()) { 4561 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4562 B.buildBuildVector(DstReg, ResultRegs); 4563 return true; 4564 } 4565 4566 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4567 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4568 4569 // Deal with the one annoying legal case. 4570 const LLT V3S16 = LLT::vector(3, 16); 4571 if (Ty == V3S16) { 4572 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4573 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4574 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4575 return true; 4576 } 4577 4578 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4579 B.buildConcatVectors(DstReg, ResultRegs); 4580 return true; 4581 } 4582 4583 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4584 LegalizerHelper &Helper, MachineInstr &MI) const { 4585 MachineIRBuilder &B = Helper.MIRBuilder; 4586 GISelChangeObserver &Observer = Helper.Observer; 4587 4588 Register Dst = MI.getOperand(0).getReg(); 4589 LLT Ty = B.getMRI()->getType(Dst); 4590 unsigned Size = Ty.getSizeInBits(); 4591 MachineFunction &MF = B.getMF(); 4592 4593 Observer.changingInstr(MI); 4594 4595 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4596 Ty = getBitcastRegisterType(Ty); 4597 Helper.bitcastDst(MI, Ty, 0); 4598 Dst = MI.getOperand(0).getReg(); 4599 B.setInsertPt(B.getMBB(), MI); 4600 } 4601 4602 // FIXME: We don't really need this intermediate instruction. The intrinsic 4603 // should be fixed to have a memory operand. Since it's readnone, we're not 4604 // allowed to add one. 
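  // The pseudo is given an invariant, dereferenceable load memory operand
  // below so later passes can treat it like any other uniform load.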
4605 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4606 MI.RemoveOperand(1); // Remove intrinsic ID 4607 4608 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4609 // TODO: Should this use datalayout alignment? 4610 const unsigned MemSize = (Size + 7) / 8; 4611 const Align MemAlign(4); 4612 MachineMemOperand *MMO = MF.getMachineMemOperand( 4613 MachinePointerInfo(), 4614 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4615 MachineMemOperand::MOInvariant, 4616 MemSize, MemAlign); 4617 MI.addMemOperand(MF, MMO); 4618 4619 // There are no 96-bit result scalar loads, but widening to 128-bit should 4620 // always be legal. We may need to restore this to a 96-bit result if it turns 4621 // out this needs to be converted to a vector load during RegBankSelect. 4622 if (!isPowerOf2_32(Size)) { 4623 if (Ty.isVector()) 4624 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4625 else 4626 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4627 } 4628 4629 Observer.changedInstr(MI); 4630 return true; 4631 } 4632 4633 // TODO: Move to selection 4634 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4635 MachineRegisterInfo &MRI, 4636 MachineIRBuilder &B) const { 4637 if (!ST.isTrapHandlerEnabled() || 4638 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4639 return legalizeTrapEndpgm(MI, MRI, B); 4640 4641 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4642 switch (*HsaAbiVer) { 4643 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4644 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4645 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4646 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4647 return ST.supportsGetDoorbellID() ? 4648 legalizeTrapHsa(MI, MRI, B) : 4649 legalizeTrapHsaQueuePtr(MI, MRI, B); 4650 } 4651 } 4652 4653 llvm_unreachable("Unknown trap handler"); 4654 } 4655 4656 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4657 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4658 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4659 MI.eraseFromParent(); 4660 return true; 4661 } 4662 4663 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4664 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4665 // Pass queue pointer to trap handler as input, and insert trap instruction 4666 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4667 Register LiveIn = 4668 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4669 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4670 return false; 4671 4672 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4673 B.buildCopy(SGPR01, LiveIn); 4674 B.buildInstr(AMDGPU::S_TRAP) 4675 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4676 .addReg(SGPR01, RegState::Implicit); 4677 4678 MI.eraseFromParent(); 4679 return true; 4680 } 4681 4682 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4683 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4684 B.buildInstr(AMDGPU::S_TRAP) 4685 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4686 MI.eraseFromParent(); 4687 return true; 4688 } 4689 4690 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4691 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4692 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4693 // accordingly 4694 if (!ST.isTrapHandlerEnabled() || 4695 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4696 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4697 "debugtrap handler not supported", 4698 MI.getDebugLoc(), DS_Warning); 4699 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4700 Ctx.diagnose(NoTrap); 4701 } else { 4702 // Insert debug-trap instruction 4703 B.buildInstr(AMDGPU::S_TRAP) 4704 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4705 } 4706 4707 MI.eraseFromParent(); 4708 return true; 4709 } 4710 4711 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4712 MachineIRBuilder &B) const { 4713 MachineRegisterInfo &MRI = *B.getMRI(); 4714 const LLT S16 = LLT::scalar(16); 4715 const LLT S32 = LLT::scalar(32); 4716 4717 Register DstReg = MI.getOperand(0).getReg(); 4718 Register NodePtr = MI.getOperand(2).getReg(); 4719 Register RayExtent = MI.getOperand(3).getReg(); 4720 Register RayOrigin = MI.getOperand(4).getReg(); 4721 Register RayDir = MI.getOperand(5).getReg(); 4722 Register RayInvDir = MI.getOperand(6).getReg(); 4723 Register TDescr = MI.getOperand(7).getReg(); 4724 4725 if (!ST.hasGFX10_AEncoding()) { 4726 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4727 "intrinsic not supported on subtarget", 4728 MI.getDebugLoc()); 4729 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4730 return false; 4731 } 4732 4733 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4734 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4735 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4736 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4737 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4738 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4739 4740 SmallVector<Register, 12> Ops; 4741 if (Is64) { 4742 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4743 Ops.push_back(Unmerge.getReg(0)); 4744 Ops.push_back(Unmerge.getReg(1)); 4745 } else { 4746 Ops.push_back(NodePtr); 4747 } 4748 Ops.push_back(RayExtent); 4749 4750 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4751 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4752 Ops.push_back(Unmerge.getReg(0)); 4753 Ops.push_back(Unmerge.getReg(1)); 4754 Ops.push_back(Unmerge.getReg(2)); 4755 }; 4756 4757 packLanes(RayOrigin); 4758 if (IsA16) { 4759 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4760 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4761 Register R1 = MRI.createGenericVirtualRegister(S32); 4762 Register R2 = MRI.createGenericVirtualRegister(S32); 4763 Register R3 = MRI.createGenericVirtualRegister(S32); 4764 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4765 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4766 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4767 Ops.push_back(R1); 4768 Ops.push_back(R2); 4769 Ops.push_back(R3); 4770 } else { 4771 packLanes(RayDir); 4772 packLanes(RayInvDir); 4773 } 4774 4775 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4776 .addDef(DstReg) 4777 .addImm(Opcode); 4778 4779 for (Register R : Ops) { 4780 MIB.addUse(R); 4781 } 4782 4783 MIB.addUse(TDescr) 4784 .addImm(IsA16 ? 
1 : 0) 4785 .cloneMemRefs(MI); 4786 4787 MI.eraseFromParent(); 4788 return true; 4789 } 4790 4791 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4792 MachineInstr &MI) const { 4793 MachineIRBuilder &B = Helper.MIRBuilder; 4794 MachineRegisterInfo &MRI = *B.getMRI(); 4795 4796 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4797 auto IntrID = MI.getIntrinsicID(); 4798 switch (IntrID) { 4799 case Intrinsic::amdgcn_if: 4800 case Intrinsic::amdgcn_else: { 4801 MachineInstr *Br = nullptr; 4802 MachineBasicBlock *UncondBrTarget = nullptr; 4803 bool Negated = false; 4804 if (MachineInstr *BrCond = 4805 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4806 const SIRegisterInfo *TRI 4807 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4808 4809 Register Def = MI.getOperand(1).getReg(); 4810 Register Use = MI.getOperand(3).getReg(); 4811 4812 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4813 4814 if (Negated) 4815 std::swap(CondBrTarget, UncondBrTarget); 4816 4817 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4818 if (IntrID == Intrinsic::amdgcn_if) { 4819 B.buildInstr(AMDGPU::SI_IF) 4820 .addDef(Def) 4821 .addUse(Use) 4822 .addMBB(UncondBrTarget); 4823 } else { 4824 B.buildInstr(AMDGPU::SI_ELSE) 4825 .addDef(Def) 4826 .addUse(Use) 4827 .addMBB(UncondBrTarget); 4828 } 4829 4830 if (Br) { 4831 Br->getOperand(0).setMBB(CondBrTarget); 4832 } else { 4833 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4834 // since we're swapping branch targets it needs to be reinserted. 4835 // FIXME: IRTranslator should probably not do this 4836 B.buildBr(*CondBrTarget); 4837 } 4838 4839 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4840 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4841 MI.eraseFromParent(); 4842 BrCond->eraseFromParent(); 4843 return true; 4844 } 4845 4846 return false; 4847 } 4848 case Intrinsic::amdgcn_loop: { 4849 MachineInstr *Br = nullptr; 4850 MachineBasicBlock *UncondBrTarget = nullptr; 4851 bool Negated = false; 4852 if (MachineInstr *BrCond = 4853 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4854 const SIRegisterInfo *TRI 4855 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4856 4857 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4858 Register Reg = MI.getOperand(2).getReg(); 4859 4860 if (Negated) 4861 std::swap(CondBrTarget, UncondBrTarget); 4862 4863 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4864 B.buildInstr(AMDGPU::SI_LOOP) 4865 .addUse(Reg) 4866 .addMBB(UncondBrTarget); 4867 4868 if (Br) 4869 Br->getOperand(0).setMBB(CondBrTarget); 4870 else 4871 B.buildBr(*CondBrTarget); 4872 4873 MI.eraseFromParent(); 4874 BrCond->eraseFromParent(); 4875 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4876 return true; 4877 } 4878 4879 return false; 4880 } 4881 case Intrinsic::amdgcn_kernarg_segment_ptr: 4882 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4883 // This only makes sense to call in a kernel, so just lower to null. 
4884 B.buildConstant(MI.getOperand(0).getReg(), 0); 4885 MI.eraseFromParent(); 4886 return true; 4887 } 4888 4889 return legalizePreloadedArgIntrin( 4890 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4891 case Intrinsic::amdgcn_implicitarg_ptr: 4892 return legalizeImplicitArgPtr(MI, MRI, B); 4893 case Intrinsic::amdgcn_workitem_id_x: 4894 return legalizePreloadedArgIntrin(MI, MRI, B, 4895 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4896 case Intrinsic::amdgcn_workitem_id_y: 4897 return legalizePreloadedArgIntrin(MI, MRI, B, 4898 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4899 case Intrinsic::amdgcn_workitem_id_z: 4900 return legalizePreloadedArgIntrin(MI, MRI, B, 4901 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4902 case Intrinsic::amdgcn_workgroup_id_x: 4903 return legalizePreloadedArgIntrin(MI, MRI, B, 4904 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4905 case Intrinsic::amdgcn_workgroup_id_y: 4906 return legalizePreloadedArgIntrin(MI, MRI, B, 4907 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4908 case Intrinsic::amdgcn_workgroup_id_z: 4909 return legalizePreloadedArgIntrin(MI, MRI, B, 4910 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4911 case Intrinsic::amdgcn_dispatch_ptr: 4912 return legalizePreloadedArgIntrin(MI, MRI, B, 4913 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4914 case Intrinsic::amdgcn_queue_ptr: 4915 return legalizePreloadedArgIntrin(MI, MRI, B, 4916 AMDGPUFunctionArgInfo::QUEUE_PTR); 4917 case Intrinsic::amdgcn_implicit_buffer_ptr: 4918 return legalizePreloadedArgIntrin( 4919 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4920 case Intrinsic::amdgcn_dispatch_id: 4921 return legalizePreloadedArgIntrin(MI, MRI, B, 4922 AMDGPUFunctionArgInfo::DISPATCH_ID); 4923 case Intrinsic::amdgcn_fdiv_fast: 4924 return legalizeFDIVFastIntrin(MI, MRI, B); 4925 case Intrinsic::amdgcn_is_shared: 4926 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4927 case Intrinsic::amdgcn_is_private: 4928 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4929 case Intrinsic::amdgcn_wavefrontsize: { 4930 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4931 MI.eraseFromParent(); 4932 return true; 4933 } 4934 case Intrinsic::amdgcn_s_buffer_load: 4935 return legalizeSBufferLoad(Helper, MI); 4936 case Intrinsic::amdgcn_raw_buffer_store: 4937 case Intrinsic::amdgcn_struct_buffer_store: 4938 return legalizeBufferStore(MI, MRI, B, false, false); 4939 case Intrinsic::amdgcn_raw_buffer_store_format: 4940 case Intrinsic::amdgcn_struct_buffer_store_format: 4941 return legalizeBufferStore(MI, MRI, B, false, true); 4942 case Intrinsic::amdgcn_raw_tbuffer_store: 4943 case Intrinsic::amdgcn_struct_tbuffer_store: 4944 return legalizeBufferStore(MI, MRI, B, true, true); 4945 case Intrinsic::amdgcn_raw_buffer_load: 4946 case Intrinsic::amdgcn_struct_buffer_load: 4947 return legalizeBufferLoad(MI, MRI, B, false, false); 4948 case Intrinsic::amdgcn_raw_buffer_load_format: 4949 case Intrinsic::amdgcn_struct_buffer_load_format: 4950 return legalizeBufferLoad(MI, MRI, B, true, false); 4951 case Intrinsic::amdgcn_raw_tbuffer_load: 4952 case Intrinsic::amdgcn_struct_tbuffer_load: 4953 return legalizeBufferLoad(MI, MRI, B, true, true); 4954 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4955 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4956 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4957 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4958 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4959 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4960 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4961 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4962 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4963 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4964 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4965 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4966 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4967 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4968 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4969 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4970 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4971 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4972 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4973 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4974 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4975 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4976 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4977 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4978 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4979 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4980 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4981 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4982 case Intrinsic::amdgcn_buffer_atomic_fadd: 4983 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4984 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4985 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4986 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4987 return legalizeBufferAtomic(MI, B, IntrID); 4988 case Intrinsic::amdgcn_atomic_inc: 4989 return legalizeAtomicIncDec(MI, B, true); 4990 case Intrinsic::amdgcn_atomic_dec: 4991 return legalizeAtomicIncDec(MI, B, false); 4992 case Intrinsic::trap: 4993 return legalizeTrapIntrinsic(MI, MRI, B); 4994 case Intrinsic::debugtrap: 4995 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4996 case Intrinsic::amdgcn_rsq_clamp: 4997 return legalizeRsqClampIntrinsic(MI, MRI, B); 4998 case Intrinsic::amdgcn_ds_fadd: 4999 case Intrinsic::amdgcn_ds_fmin: 5000 case Intrinsic::amdgcn_ds_fmax: 5001 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 5002 case Intrinsic::amdgcn_image_bvh_intersect_ray: 5003 return legalizeBVHIntrinsic(MI, B); 5004 default: { 5005 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 5006 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 5007 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 5008 return true; 5009 } 5010 } 5011 5012 return true; 5013 } 5014