//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
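/// For example, <3 x s16> (48 bits) is widened to <4 x s16> (64 bits), while
/// <4 x s16> and <3 x s32> are left alone since their sizes are already
/// multiples of 32 bits.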
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 66 return [=](const LegalityQuery &Query) { 67 const LLT Ty = Query.Types[TypeIdx]; 68 if (!Ty.isVector()) 69 return false; 70 71 const LLT EltTy = Ty.getElementType(); 72 const unsigned EltSize = EltTy.getSizeInBits(); 73 return Ty.getNumElements() % 2 != 0 && 74 EltSize > 1 && EltSize < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 return Ty.getSizeInBits() % 32 == 0; 83 }; 84 } 85 86 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getScalarType(); 90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 91 }; 92 } 93 94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 95 return [=](const LegalityQuery &Query) { 96 const LLT Ty = Query.Types[TypeIdx]; 97 const LLT EltTy = Ty.getElementType(); 98 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 99 }; 100 } 101 102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 const LLT EltTy = Ty.getElementType(); 106 unsigned Size = Ty.getSizeInBits(); 107 unsigned Pieces = (Size + 63) / 64; 108 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 109 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 110 }; 111 } 112 113 // Increase the number of vector elements to reach the next multiple of 32-bit 114 // type. 115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 116 return [=](const LegalityQuery &Query) { 117 const LLT Ty = Query.Types[TypeIdx]; 118 119 const LLT EltTy = Ty.getElementType(); 120 const int Size = Ty.getSizeInBits(); 121 const int EltSize = EltTy.getSizeInBits(); 122 const int NextMul32 = (Size + 31) / 32; 123 124 assert(EltSize < 32); 125 126 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 127 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 128 }; 129 } 130 131 static LLT getBitcastRegisterType(const LLT Ty) { 132 const unsigned Size = Ty.getSizeInBits(); 133 134 LLT CoercedTy; 135 if (Size <= 32) { 136 // <2 x s8> -> s16 137 // <4 x s8> -> s32 138 return LLT::scalar(Size); 139 } 140 141 return LLT::scalarOrVector(Size / 32, 32); 142 } 143 144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT Ty = Query.Types[TypeIdx]; 147 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 148 }; 149 } 150 151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 152 return [=](const LegalityQuery &Query) { 153 const LLT Ty = Query.Types[TypeIdx]; 154 unsigned Size = Ty.getSizeInBits(); 155 assert(Size % 32 == 0); 156 return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 157 }; 158 } 159 160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 161 return [=](const LegalityQuery &Query) { 162 const LLT QueryTy = Query.Types[TypeIdx]; 163 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 164 }; 165 } 166 167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 171 }; 
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
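/// For example, a 96-bit load with 128-bit alignment can be widened to a full
/// 128-bit load (assuming the subtarget has no native 96-bit loads), since the
/// extra bytes are known dereferenceable up to the alignment.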
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 371 unsigned AlignInBits, unsigned AddrSpace, 372 unsigned Opcode) { 373 // We don't want to widen cases that are naturally legal. 374 if (isPowerOf2_32(SizeInBits)) 375 return false; 376 377 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 378 // end up widening these for a scalar load during RegBankSelect, since there 379 // aren't 96-bit scalar loads. 380 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 381 return false; 382 383 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 384 return false; 385 386 // A load is known dereferenceable up to the alignment, so it's legal to widen 387 // to it. 388 // 389 // TODO: Could check dereferenceable for less aligned cases. 390 unsigned RoundedSize = NextPowerOf2(SizeInBits); 391 if (AlignInBits < RoundedSize) 392 return false; 393 394 // Do not widen if it would introduce a slow unaligned load. 395 const SITargetLowering *TLI = ST.getTargetLowering(); 396 bool Fast = false; 397 return TLI->allowsMisalignedMemoryAccessesImpl( 398 RoundedSize, AddrSpace, Align(AlignInBits / 8), 399 MachineMemOperand::MOLoad, &Fast) && 400 Fast; 401 } 402 403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 404 unsigned Opcode) { 405 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 406 return false; 407 408 return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 409 Query.MMODescrs[0].AlignInBits, 410 Query.Types[1].getAddressSpace(), Opcode); 411 } 412 413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 414 const GCNTargetMachine &TM) 415 : ST(ST_) { 416 using namespace TargetOpcode; 417 418 auto GetAddrSpacePtr = [&TM](unsigned AS) { 419 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 420 }; 421 422 const LLT S1 = LLT::scalar(1); 423 const LLT S8 = LLT::scalar(8); 424 const LLT S16 = LLT::scalar(16); 425 const LLT S32 = LLT::scalar(32); 426 const LLT S64 = LLT::scalar(64); 427 const LLT S128 = LLT::scalar(128); 428 const LLT S256 = LLT::scalar(256); 429 const LLT S512 = LLT::scalar(512); 430 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 431 432 const LLT V2S8 = LLT::vector(2, 8); 433 const LLT V2S16 = LLT::vector(2, 16); 434 const LLT V4S16 = LLT::vector(4, 16); 435 436 const LLT V2S32 = LLT::vector(2, 32); 437 const LLT V3S32 = LLT::vector(3, 32); 438 const LLT V4S32 = LLT::vector(4, 32); 439 const LLT V5S32 = LLT::vector(5, 32); 440 const LLT V6S32 = LLT::vector(6, 32); 441 const LLT V7S32 = LLT::vector(7, 32); 442 const LLT V8S32 = LLT::vector(8, 32); 443 const LLT V9S32 = LLT::vector(9, 32); 444 const LLT V10S32 = LLT::vector(10, 32); 445 const LLT V11S32 = LLT::vector(11, 32); 446 const LLT V12S32 = LLT::vector(12, 32); 447 const LLT V13S32 = LLT::vector(13, 32); 448 const LLT V14S32 = LLT::vector(14, 32); 449 const LLT V15S32 = LLT::vector(15, 32); 450 const LLT V16S32 = LLT::vector(16, 32); 451 const LLT V32S32 = LLT::vector(32, 32); 452 453 const LLT V2S64 = LLT::vector(2, 64); 454 const LLT V3S64 = LLT::vector(3, 64); 455 const LLT V4S64 = LLT::vector(4, 64); 456 const LLT V5S64 = LLT::vector(5, 64); 457 const LLT V6S64 = LLT::vector(6, 64); 458 const LLT V7S64 = LLT::vector(7, 64); 459 const LLT V8S64 = LLT::vector(8, 64); 460 const LLT V16S64 = LLT::vector(16, 64); 461 462 std::initializer_list<LLT> AllS32Vectors = 463 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 464 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 465 
std::initializer_list<LLT> AllS64Vectors = 466 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 467 468 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 469 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 470 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 471 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 472 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 473 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 474 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 475 476 const LLT CodePtr = FlatPtr; 477 478 const std::initializer_list<LLT> AddrSpaces64 = { 479 GlobalPtr, ConstantPtr, FlatPtr 480 }; 481 482 const std::initializer_list<LLT> AddrSpaces32 = { 483 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 484 }; 485 486 const std::initializer_list<LLT> FPTypesBase = { 487 S32, S64 488 }; 489 490 const std::initializer_list<LLT> FPTypes16 = { 491 S32, S64, S16 492 }; 493 494 const std::initializer_list<LLT> FPTypesPK16 = { 495 S32, S64, S16, V2S16 496 }; 497 498 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 499 500 setAction({G_BRCOND, S1}, Legal); // VCC branches 501 setAction({G_BRCOND, S32}, Legal); // SCC branches 502 503 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 504 // elements for v3s16 505 getActionDefinitionsBuilder(G_PHI) 506 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 507 .legalFor(AllS32Vectors) 508 .legalFor(AllS64Vectors) 509 .legalFor(AddrSpaces64) 510 .legalFor(AddrSpaces32) 511 .legalIf(isPointer(0)) 512 .clampScalar(0, S16, S256) 513 .widenScalarToNextPow2(0, 32) 514 .clampMaxNumElements(0, S32, 16) 515 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 516 .scalarize(0); 517 518 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 519 // Full set of gfx9 features. 520 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 521 .legalFor({S32, S16, V2S16}) 522 .clampScalar(0, S16, S32) 523 .clampMaxNumElements(0, S16, 2) 524 .scalarize(0) 525 .widenScalarToNextPow2(0, 32); 526 527 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 528 .legalFor({S32, S16, V2S16}) // Clamp modifier 529 .minScalarOrElt(0, S16) 530 .clampMaxNumElements(0, S16, 2) 531 .scalarize(0) 532 .widenScalarToNextPow2(0, 32) 533 .lower(); 534 } else if (ST.has16BitInsts()) { 535 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 536 .legalFor({S32, S16}) 537 .clampScalar(0, S16, S32) 538 .scalarize(0) 539 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 540 541 // Technically the saturating operations require clamp bit support, but this 542 // was introduced at the same time as 16-bit operations. 543 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 544 .legalFor({S32, S16}) // Clamp modifier 545 .minScalar(0, S16) 546 .scalarize(0) 547 .widenScalarToNextPow2(0, 16) 548 .lower(); 549 550 // We're just lowering this, but it helps get a better result to try to 551 // coerce to the desired type first. 552 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 553 .minScalar(0, S16) 554 .scalarize(0) 555 .lower(); 556 } else { 557 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 558 .legalFor({S32}) 559 .clampScalar(0, S32, S32) 560 .scalarize(0); 561 562 if (ST.hasIntClamp()) { 563 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 564 .legalFor({S32}) // Clamp modifier. 
565 .scalarize(0) 566 .minScalarOrElt(0, S32) 567 .lower(); 568 } else { 569 // Clamp bit support was added in VI, along with 16-bit operations. 570 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 571 .minScalar(0, S32) 572 .scalarize(0) 573 .lower(); 574 } 575 576 // FIXME: DAG expansion gets better results. The widening uses the smaller 577 // range values and goes for the min/max lowering directly. 578 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 579 .minScalar(0, S32) 580 .scalarize(0) 581 .lower(); 582 } 583 584 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 585 .customFor({S32, S64}) 586 .clampScalar(0, S32, S64) 587 .widenScalarToNextPow2(0, 32) 588 .scalarize(0); 589 590 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 591 .legalFor({S32}) 592 .maxScalarOrElt(0, S32); 593 594 if (ST.hasVOP3PInsts()) { 595 Mulh 596 .clampMaxNumElements(0, S8, 2) 597 .lowerFor({V2S8}); 598 } 599 600 Mulh 601 .scalarize(0) 602 .lower(); 603 604 // Report legal for any types we can handle anywhere. For the cases only legal 605 // on the SALU, RegBankSelect will be able to re-legalize. 606 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 607 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 608 .clampScalar(0, S32, S64) 609 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 610 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 611 .widenScalarToNextPow2(0) 612 .scalarize(0); 613 614 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 615 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 616 .legalFor({{S32, S1}, {S32, S32}}) 617 .minScalar(0, S32) 618 // TODO: .scalarize(0) 619 .lower(); 620 621 getActionDefinitionsBuilder(G_BITCAST) 622 // Don't worry about the size constraint. 623 .legalIf(all(isRegisterType(0), isRegisterType(1))) 624 .lower(); 625 626 627 getActionDefinitionsBuilder(G_CONSTANT) 628 .legalFor({S1, S32, S64, S16, GlobalPtr, 629 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 630 .legalIf(isPointer(0)) 631 .clampScalar(0, S32, S64) 632 .widenScalarToNextPow2(0); 633 634 getActionDefinitionsBuilder(G_FCONSTANT) 635 .legalFor({S32, S64, S16}) 636 .clampScalar(0, S16, S64); 637 638 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 639 .legalIf(isRegisterType(0)) 640 // s1 and s16 are special cases because they have legal operations on 641 // them, but don't really occupy registers in the normal way. 642 .legalFor({S1, S16}) 643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 644 .clampScalarOrElt(0, S32, MaxScalar) 645 .widenScalarToNextPow2(0, 32) 646 .clampMaxNumElements(0, S32, 16); 647 648 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 649 650 // If the amount is divergent, we have to do a wave reduction to get the 651 // maximum value, so this is expanded during RegBankSelect. 
652 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 653 .legalFor({{PrivatePtr, S32}}); 654 655 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 656 .customIf(typeIsNot(0, PrivatePtr)); 657 658 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 659 660 auto &FPOpActions = getActionDefinitionsBuilder( 661 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 662 .legalFor({S32, S64}); 663 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 664 .customFor({S32, S64}); 665 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 666 .customFor({S32, S64}); 667 668 if (ST.has16BitInsts()) { 669 if (ST.hasVOP3PInsts()) 670 FPOpActions.legalFor({S16, V2S16}); 671 else 672 FPOpActions.legalFor({S16}); 673 674 TrigActions.customFor({S16}); 675 FDIVActions.customFor({S16}); 676 } 677 678 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 679 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 680 681 if (ST.hasVOP3PInsts()) { 682 MinNumMaxNum.customFor(FPTypesPK16) 683 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 684 .clampMaxNumElements(0, S16, 2) 685 .clampScalar(0, S16, S64) 686 .scalarize(0); 687 } else if (ST.has16BitInsts()) { 688 MinNumMaxNum.customFor(FPTypes16) 689 .clampScalar(0, S16, S64) 690 .scalarize(0); 691 } else { 692 MinNumMaxNum.customFor(FPTypesBase) 693 .clampScalar(0, S32, S64) 694 .scalarize(0); 695 } 696 697 if (ST.hasVOP3PInsts()) 698 FPOpActions.clampMaxNumElements(0, S16, 2); 699 700 FPOpActions 701 .scalarize(0) 702 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 703 704 TrigActions 705 .scalarize(0) 706 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 707 708 FDIVActions 709 .scalarize(0) 710 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 711 712 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 713 .legalFor(FPTypesPK16) 714 .clampMaxNumElements(0, S16, 2) 715 .scalarize(0) 716 .clampScalar(0, S16, S64); 717 718 if (ST.has16BitInsts()) { 719 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 720 .legalFor({S32, S64, S16}) 721 .scalarize(0) 722 .clampScalar(0, S16, S64); 723 } else { 724 getActionDefinitionsBuilder(G_FSQRT) 725 .legalFor({S32, S64}) 726 .scalarize(0) 727 .clampScalar(0, S32, S64); 728 729 if (ST.hasFractBug()) { 730 getActionDefinitionsBuilder(G_FFLOOR) 731 .customFor({S64}) 732 .legalFor({S32, S64}) 733 .scalarize(0) 734 .clampScalar(0, S32, S64); 735 } else { 736 getActionDefinitionsBuilder(G_FFLOOR) 737 .legalFor({S32, S64}) 738 .scalarize(0) 739 .clampScalar(0, S32, S64); 740 } 741 } 742 743 getActionDefinitionsBuilder(G_FPTRUNC) 744 .legalFor({{S32, S64}, {S16, S32}}) 745 .scalarize(0) 746 .lower(); 747 748 getActionDefinitionsBuilder(G_FPEXT) 749 .legalFor({{S64, S32}, {S32, S16}}) 750 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 751 .scalarize(0); 752 753 getActionDefinitionsBuilder(G_FSUB) 754 // Use actual fsub instruction 755 .legalFor({S32}) 756 // Must use fadd + fneg 757 .lowerFor({S64, S16, V2S16}) 758 .scalarize(0) 759 .clampScalar(0, S32, S64); 760 761 // Whether this is legal depends on the floating point mode for the function. 
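  // The mad/mac instructions do not handle denormals, so the custom lowering
  // (legalizeFMad) keeps G_FMAD only when denormals can be flushed for the
  // type, and otherwise expands it to fmul + fadd.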
762 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 763 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 764 FMad.customFor({S32, S16}); 765 else if (ST.hasMadMacF32Insts()) 766 FMad.customFor({S32}); 767 else if (ST.hasMadF16()) 768 FMad.customFor({S16}); 769 FMad.scalarize(0) 770 .lower(); 771 772 auto &FRem = getActionDefinitionsBuilder(G_FREM); 773 if (ST.has16BitInsts()) { 774 FRem.customFor({S16, S32, S64}); 775 } else { 776 FRem.minScalar(0, S32) 777 .customFor({S32, S64}); 778 } 779 FRem.scalarize(0); 780 781 // TODO: Do we need to clamp maximum bitwidth? 782 getActionDefinitionsBuilder(G_TRUNC) 783 .legalIf(isScalar(0)) 784 .legalFor({{V2S16, V2S32}}) 785 .clampMaxNumElements(0, S16, 2) 786 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 787 // situations (like an invalid implicit use), we don't want to infinite loop 788 // in the legalizer. 789 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 790 .alwaysLegal(); 791 792 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 793 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 794 {S32, S1}, {S64, S1}, {S16, S1}}) 795 .scalarize(0) 796 .clampScalar(0, S32, S64) 797 .widenScalarToNextPow2(1, 32); 798 799 // TODO: Split s1->s64 during regbankselect for VALU. 800 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 801 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 802 .lowerFor({{S32, S64}}) 803 .lowerIf(typeIs(1, S1)) 804 .customFor({{S64, S64}}); 805 if (ST.has16BitInsts()) 806 IToFP.legalFor({{S16, S16}}); 807 IToFP.clampScalar(1, S32, S64) 808 .minScalar(0, S32) 809 .scalarize(0) 810 .widenScalarToNextPow2(1); 811 812 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 813 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 814 .customFor({{S64, S64}}) 815 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 816 if (ST.has16BitInsts()) 817 FPToI.legalFor({{S16, S16}}); 818 else 819 FPToI.minScalar(1, S32); 820 821 FPToI.minScalar(0, S32) 822 .scalarize(0) 823 .lower(); 824 825 // Lower roundeven into G_FRINT 826 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 827 .scalarize(0) 828 .lower(); 829 830 if (ST.has16BitInsts()) { 831 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 832 .legalFor({S16, S32, S64}) 833 .clampScalar(0, S16, S64) 834 .scalarize(0); 835 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 836 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 837 .legalFor({S32, S64}) 838 .clampScalar(0, S32, S64) 839 .scalarize(0); 840 } else { 841 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 842 .legalFor({S32}) 843 .customFor({S64}) 844 .clampScalar(0, S32, S64) 845 .scalarize(0); 846 } 847 848 getActionDefinitionsBuilder(G_PTR_ADD) 849 .legalIf(all(isPointer(0), sameSize(0, 1))) 850 .scalarize(0) 851 .scalarSameSizeAs(1, 0); 852 853 getActionDefinitionsBuilder(G_PTRMASK) 854 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 855 .scalarSameSizeAs(1, 0) 856 .scalarize(0); 857 858 auto &CmpBuilder = 859 getActionDefinitionsBuilder(G_ICMP) 860 // The compare output type differs based on the register bank of the output, 861 // so make both s1 and s32 legal. 862 // 863 // Scalar compares producing output in scc will be promoted to s32, as that 864 // is the allocatable register type that will be needed for the copy from 865 // scc. This will be promoted during RegBankSelect, and we assume something 866 // before that won't try to use s32 result types. 
867 // 868 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 869 // bank. 870 .legalForCartesianProduct( 871 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 872 .legalForCartesianProduct( 873 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 874 if (ST.has16BitInsts()) { 875 CmpBuilder.legalFor({{S1, S16}}); 876 } 877 878 CmpBuilder 879 .widenScalarToNextPow2(1) 880 .clampScalar(1, S32, S64) 881 .scalarize(0) 882 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 883 884 getActionDefinitionsBuilder(G_FCMP) 885 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 886 .widenScalarToNextPow2(1) 887 .clampScalar(1, S32, S64) 888 .scalarize(0); 889 890 // FIXME: fpow has a selection pattern that should move to custom lowering. 891 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 892 if (ST.has16BitInsts()) 893 Exp2Ops.legalFor({S32, S16}); 894 else 895 Exp2Ops.legalFor({S32}); 896 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 897 Exp2Ops.scalarize(0); 898 899 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 900 if (ST.has16BitInsts()) 901 ExpOps.customFor({{S32}, {S16}}); 902 else 903 ExpOps.customFor({S32}); 904 ExpOps.clampScalar(0, MinScalarFPTy, S32) 905 .scalarize(0); 906 907 getActionDefinitionsBuilder(G_FPOWI) 908 .clampScalar(0, MinScalarFPTy, S32) 909 .lower(); 910 911 // The 64-bit versions produce 32-bit results, but only on the SALU. 912 getActionDefinitionsBuilder(G_CTPOP) 913 .legalFor({{S32, S32}, {S32, S64}}) 914 .clampScalar(0, S32, S32) 915 .clampScalar(1, S32, S64) 916 .scalarize(0) 917 .widenScalarToNextPow2(0, 32) 918 .widenScalarToNextPow2(1, 32); 919 920 // The hardware instructions return a different result on 0 than the generic 921 // instructions expect. The hardware produces -1, but these produce the 922 // bitwidth. 923 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 924 .scalarize(0) 925 .clampScalar(0, S32, S32) 926 .clampScalar(1, S32, S64) 927 .widenScalarToNextPow2(0, 32) 928 .widenScalarToNextPow2(1, 32) 929 .lower(); 930 931 // The 64-bit versions produce 32-bit results, but only on the SALU. 932 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 933 .legalFor({{S32, S32}, {S32, S64}}) 934 .clampScalar(0, S32, S32) 935 .clampScalar(1, S32, S64) 936 .scalarize(0) 937 .widenScalarToNextPow2(0, 32) 938 .widenScalarToNextPow2(1, 32); 939 940 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 941 // RegBankSelect. 942 getActionDefinitionsBuilder(G_BITREVERSE) 943 .legalFor({S32, S64}) 944 .clampScalar(0, S32, S64) 945 .scalarize(0) 946 .widenScalarToNextPow2(0); 947 948 if (ST.has16BitInsts()) { 949 getActionDefinitionsBuilder(G_BSWAP) 950 .legalFor({S16, S32, V2S16}) 951 .clampMaxNumElements(0, S16, 2) 952 // FIXME: Fixing non-power-of-2 before clamp is workaround for 953 // narrowScalar limitation. 
954 .widenScalarToNextPow2(0) 955 .clampScalar(0, S16, S32) 956 .scalarize(0); 957 958 if (ST.hasVOP3PInsts()) { 959 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 960 .legalFor({S32, S16, V2S16}) 961 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 962 .clampMaxNumElements(0, S16, 2) 963 .minScalar(0, S16) 964 .widenScalarToNextPow2(0) 965 .scalarize(0) 966 .lower(); 967 } else { 968 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 969 .legalFor({S32, S16}) 970 .widenScalarToNextPow2(0) 971 .minScalar(0, S16) 972 .scalarize(0) 973 .lower(); 974 } 975 } else { 976 // TODO: Should have same legality without v_perm_b32 977 getActionDefinitionsBuilder(G_BSWAP) 978 .legalFor({S32}) 979 .lowerIf(scalarNarrowerThan(0, 32)) 980 // FIXME: Fixing non-power-of-2 before clamp is workaround for 981 // narrowScalar limitation. 982 .widenScalarToNextPow2(0) 983 .maxScalar(0, S32) 984 .scalarize(0) 985 .lower(); 986 987 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 988 .legalFor({S32}) 989 .minScalar(0, S32) 990 .widenScalarToNextPow2(0) 991 .scalarize(0) 992 .lower(); 993 } 994 995 getActionDefinitionsBuilder(G_INTTOPTR) 996 // List the common cases 997 .legalForCartesianProduct(AddrSpaces64, {S64}) 998 .legalForCartesianProduct(AddrSpaces32, {S32}) 999 .scalarize(0) 1000 // Accept any address space as long as the size matches 1001 .legalIf(sameSize(0, 1)) 1002 .widenScalarIf(smallerThan(1, 0), 1003 [](const LegalityQuery &Query) { 1004 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1005 }) 1006 .narrowScalarIf(largerThan(1, 0), 1007 [](const LegalityQuery &Query) { 1008 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1009 }); 1010 1011 getActionDefinitionsBuilder(G_PTRTOINT) 1012 // List the common cases 1013 .legalForCartesianProduct(AddrSpaces64, {S64}) 1014 .legalForCartesianProduct(AddrSpaces32, {S32}) 1015 .scalarize(0) 1016 // Accept any address space as long as the size matches 1017 .legalIf(sameSize(0, 1)) 1018 .widenScalarIf(smallerThan(0, 1), 1019 [](const LegalityQuery &Query) { 1020 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1021 }) 1022 .narrowScalarIf( 1023 largerThan(0, 1), 1024 [](const LegalityQuery &Query) { 1025 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1026 }); 1027 1028 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1029 .scalarize(0) 1030 .custom(); 1031 1032 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1033 bool IsLoad) -> bool { 1034 const LLT DstTy = Query.Types[0]; 1035 1036 // Split vector extloads. 1037 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1038 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1039 1040 if (MemSize < DstTy.getSizeInBits()) 1041 MemSize = std::max(MemSize, AlignBits); 1042 1043 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1044 return true; 1045 1046 const LLT PtrTy = Query.Types[1]; 1047 unsigned AS = PtrTy.getAddressSpace(); 1048 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1049 return true; 1050 1051 // Catch weird sized loads that don't evenly divide into the access sizes 1052 // TODO: May be able to widen depending on alignment etc. 1053 unsigned NumRegs = (MemSize + 31) / 32; 1054 if (NumRegs == 3) { 1055 if (!ST.hasDwordx3LoadStores()) 1056 return true; 1057 } else { 1058 // If the alignment allows, these should have been widened. 
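        // For example, a 160-bit access needs five dwords, which is not a
        // power of two, so it has to be broken up.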
1059 if (!isPowerOf2_32(NumRegs)) 1060 return true; 1061 } 1062 1063 if (AlignBits < MemSize) { 1064 const SITargetLowering *TLI = ST.getTargetLowering(); 1065 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1066 Align(AlignBits / 8)); 1067 } 1068 1069 return false; 1070 }; 1071 1072 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1073 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1074 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1075 1076 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1077 // LDS 1078 // TODO: Unsupported flat for SI. 1079 1080 for (unsigned Op : {G_LOAD, G_STORE}) { 1081 const bool IsStore = Op == G_STORE; 1082 1083 auto &Actions = getActionDefinitionsBuilder(Op); 1084 // Explicitly list some common cases. 1085 // TODO: Does this help compile time at all? 1086 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1087 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1088 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1089 {S64, GlobalPtr, 64, GlobalAlign32}, 1090 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1091 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1092 {S32, GlobalPtr, 8, GlobalAlign8}, 1093 {S32, GlobalPtr, 16, GlobalAlign16}, 1094 1095 {S32, LocalPtr, 32, 32}, 1096 {S64, LocalPtr, 64, 32}, 1097 {V2S32, LocalPtr, 64, 32}, 1098 {S32, LocalPtr, 8, 8}, 1099 {S32, LocalPtr, 16, 16}, 1100 {V2S16, LocalPtr, 32, 32}, 1101 1102 {S32, PrivatePtr, 32, 32}, 1103 {S32, PrivatePtr, 8, 8}, 1104 {S32, PrivatePtr, 16, 16}, 1105 {V2S16, PrivatePtr, 32, 32}, 1106 1107 {S32, ConstantPtr, 32, GlobalAlign32}, 1108 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1109 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1110 {S64, ConstantPtr, 64, GlobalAlign32}, 1111 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1112 Actions.legalIf( 1113 [=](const LegalityQuery &Query) -> bool { 1114 return isLoadStoreLegal(ST, Query, Op); 1115 }); 1116 1117 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1118 // 64-bits. 1119 // 1120 // TODO: Should generalize bitcast action into coerce, which will also cover 1121 // inserting addrspacecasts. 1122 Actions.customIf(typeIs(1, Constant32Ptr)); 1123 1124 // Turn any illegal element vectors into something easier to deal 1125 // with. These will ultimately produce 32-bit scalar shifts to extract the 1126 // parts anyway. 1127 // 1128 // For odd 16-bit element vectors, prefer to split those into pieces with 1129 // 16-bit vector parts. 1130 Actions.bitcastIf( 1131 [=](const LegalityQuery &Query) -> bool { 1132 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1133 Query.MMODescrs[0].SizeInBits); 1134 }, bitcastToRegisterType(0)); 1135 1136 if (!IsStore) { 1137 // Widen suitably aligned loads by loading extra bytes. The standard 1138 // legalization actions can't properly express widening memory operands. 
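      // A custom action is used so the memory size can be changed along with
      // the result type; shouldWidenLoad (above) checks when this widening is
      // safe and profitable.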
1139 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1140 return shouldWidenLoad(ST, Query, G_LOAD); 1141 }); 1142 } 1143 1144 // FIXME: load/store narrowing should be moved to lower action 1145 Actions 1146 .narrowScalarIf( 1147 [=](const LegalityQuery &Query) -> bool { 1148 return !Query.Types[0].isVector() && 1149 needToSplitMemOp(Query, Op == G_LOAD); 1150 }, 1151 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1152 const LLT DstTy = Query.Types[0]; 1153 const LLT PtrTy = Query.Types[1]; 1154 1155 const unsigned DstSize = DstTy.getSizeInBits(); 1156 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1157 1158 // Split extloads. 1159 if (DstSize > MemSize) 1160 return std::make_pair(0, LLT::scalar(MemSize)); 1161 1162 if (!isPowerOf2_32(DstSize)) { 1163 // We're probably decomposing an odd sized store. Try to split 1164 // to the widest type. TODO: Account for alignment. As-is it 1165 // should be OK, since the new parts will be further legalized. 1166 unsigned FloorSize = PowerOf2Floor(DstSize); 1167 return std::make_pair(0, LLT::scalar(FloorSize)); 1168 } 1169 1170 if (DstSize > 32 && (DstSize % 32 != 0)) { 1171 // FIXME: Need a way to specify non-extload of larger size if 1172 // suitably aligned. 1173 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1174 } 1175 1176 unsigned MaxSize = maxSizeForAddrSpace(ST, 1177 PtrTy.getAddressSpace(), 1178 Op == G_LOAD); 1179 if (MemSize > MaxSize) 1180 return std::make_pair(0, LLT::scalar(MaxSize)); 1181 1182 unsigned Align = Query.MMODescrs[0].AlignInBits; 1183 return std::make_pair(0, LLT::scalar(Align)); 1184 }) 1185 .fewerElementsIf( 1186 [=](const LegalityQuery &Query) -> bool { 1187 return Query.Types[0].isVector() && 1188 needToSplitMemOp(Query, Op == G_LOAD); 1189 }, 1190 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1191 const LLT DstTy = Query.Types[0]; 1192 const LLT PtrTy = Query.Types[1]; 1193 1194 LLT EltTy = DstTy.getElementType(); 1195 unsigned MaxSize = maxSizeForAddrSpace(ST, 1196 PtrTy.getAddressSpace(), 1197 Op == G_LOAD); 1198 1199 // FIXME: Handle widened to power of 2 results better. This ends 1200 // up scalarizing. 1201 // FIXME: 3 element stores scalarized on SI 1202 1203 // Split if it's too large for the address space. 1204 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1205 unsigned NumElts = DstTy.getNumElements(); 1206 unsigned EltSize = EltTy.getSizeInBits(); 1207 1208 if (MaxSize % EltSize == 0) { 1209 return std::make_pair( 1210 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1211 } 1212 1213 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1214 1215 // FIXME: Refine when odd breakdowns handled 1216 // The scalars will need to be re-legalized. 1217 if (NumPieces == 1 || NumPieces >= NumElts || 1218 NumElts % NumPieces != 0) 1219 return std::make_pair(0, EltTy); 1220 1221 return std::make_pair(0, 1222 LLT::vector(NumElts / NumPieces, EltTy)); 1223 } 1224 1225 // FIXME: We could probably handle weird extending loads better. 1226 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1227 if (DstTy.getSizeInBits() > MemSize) 1228 return std::make_pair(0, EltTy); 1229 1230 unsigned EltSize = EltTy.getSizeInBits(); 1231 unsigned DstSize = DstTy.getSizeInBits(); 1232 if (!isPowerOf2_32(DstSize)) { 1233 // We're probably decomposing an odd sized store. Try to split 1234 // to the widest type. TODO: Account for alignment. As-is it 1235 // should be OK, since the new parts will be further legalized. 
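            // For example, an odd sized <3 x s32> store is reduced to
            // <2 x s32> here, and the leftover piece is legalized separately.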
1236 unsigned FloorSize = PowerOf2Floor(DstSize); 1237 return std::make_pair( 1238 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1239 } 1240 1241 // Need to split because of alignment. 1242 unsigned Align = Query.MMODescrs[0].AlignInBits; 1243 if (EltSize > Align && 1244 (EltSize / Align < DstTy.getNumElements())) { 1245 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1246 } 1247 1248 // May need relegalization for the scalars. 1249 return std::make_pair(0, EltTy); 1250 }) 1251 .lowerIfMemSizeNotPow2() 1252 .minScalar(0, S32); 1253 1254 if (IsStore) 1255 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1256 1257 Actions 1258 .widenScalarToNextPow2(0) 1259 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1260 .lower(); 1261 } 1262 1263 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1264 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1265 {S32, GlobalPtr, 16, 2 * 8}, 1266 {S32, LocalPtr, 8, 8}, 1267 {S32, LocalPtr, 16, 16}, 1268 {S32, PrivatePtr, 8, 8}, 1269 {S32, PrivatePtr, 16, 16}, 1270 {S32, ConstantPtr, 8, 8}, 1271 {S32, ConstantPtr, 16, 2 * 8}}); 1272 if (ST.hasFlatAddressSpace()) { 1273 ExtLoads.legalForTypesWithMemDesc( 1274 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1275 } 1276 1277 ExtLoads.clampScalar(0, S32, S32) 1278 .widenScalarToNextPow2(0) 1279 .unsupportedIfMemSizeNotPow2() 1280 .lower(); 1281 1282 auto &Atomics = getActionDefinitionsBuilder( 1283 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1284 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1285 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1286 G_ATOMICRMW_UMIN}) 1287 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1288 {S64, GlobalPtr}, {S64, LocalPtr}, 1289 {S32, RegionPtr}, {S64, RegionPtr}}); 1290 if (ST.hasFlatAddressSpace()) { 1291 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1292 } 1293 1294 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1295 if (ST.hasLDSFPAtomics()) { 1296 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1297 if (ST.hasGFX90AInsts()) 1298 Atomic.legalFor({{S64, LocalPtr}}); 1299 } 1300 if (ST.hasAtomicFaddInsts()) 1301 Atomic.legalFor({{S32, GlobalPtr}}); 1302 1303 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1304 // demarshalling 1305 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1306 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1307 {S32, FlatPtr}, {S64, FlatPtr}}) 1308 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1309 {S32, RegionPtr}, {S64, RegionPtr}}); 1310 // TODO: Pointer types, any 32-bit or 64-bit vector 1311 1312 // Condition should be s32 for scalar, s1 for vector. 1313 getActionDefinitionsBuilder(G_SELECT) 1314 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1315 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1316 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1317 .clampScalar(0, S16, S64) 1318 .scalarize(1) 1319 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1320 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1321 .clampMaxNumElements(0, S32, 2) 1322 .clampMaxNumElements(0, LocalPtr, 2) 1323 .clampMaxNumElements(0, PrivatePtr, 2) 1324 .scalarize(0) 1325 .widenScalarToNextPow2(0) 1326 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1327 1328 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1329 // be more flexible with the shift amount type. 
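  // (The low 4, 5, and 6 bits correspond to 16-, 32-, and 64-bit shifts,
  // respectively.)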
1330 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1331 .legalFor({{S32, S32}, {S64, S32}}); 1332 if (ST.has16BitInsts()) { 1333 if (ST.hasVOP3PInsts()) { 1334 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1335 .clampMaxNumElements(0, S16, 2); 1336 } else 1337 Shifts.legalFor({{S16, S16}}); 1338 1339 // TODO: Support 16-bit shift amounts for all types 1340 Shifts.widenScalarIf( 1341 [=](const LegalityQuery &Query) { 1342 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1343 // 32-bit amount. 1344 const LLT ValTy = Query.Types[0]; 1345 const LLT AmountTy = Query.Types[1]; 1346 return ValTy.getSizeInBits() <= 16 && 1347 AmountTy.getSizeInBits() < 16; 1348 }, changeTo(1, S16)); 1349 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1350 Shifts.clampScalar(1, S32, S32); 1351 Shifts.clampScalar(0, S16, S64); 1352 Shifts.widenScalarToNextPow2(0, 16); 1353 1354 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1355 .minScalar(0, S16) 1356 .scalarize(0) 1357 .lower(); 1358 } else { 1359 // Make sure we legalize the shift amount type first, as the general 1360 // expansion for the shifted type will produce much worse code if it hasn't 1361 // been truncated already. 1362 Shifts.clampScalar(1, S32, S32); 1363 Shifts.clampScalar(0, S32, S64); 1364 Shifts.widenScalarToNextPow2(0, 32); 1365 1366 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1367 .minScalar(0, S32) 1368 .scalarize(0) 1369 .lower(); 1370 } 1371 Shifts.scalarize(0); 1372 1373 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1374 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1375 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1376 unsigned IdxTypeIdx = 2; 1377 1378 getActionDefinitionsBuilder(Op) 1379 .customIf([=](const LegalityQuery &Query) { 1380 const LLT EltTy = Query.Types[EltTypeIdx]; 1381 const LLT VecTy = Query.Types[VecTypeIdx]; 1382 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1383 const unsigned EltSize = EltTy.getSizeInBits(); 1384 return (EltSize == 32 || EltSize == 64) && 1385 VecTy.getSizeInBits() % 32 == 0 && 1386 VecTy.getSizeInBits() <= MaxRegisterSize && 1387 IdxTy.getSizeInBits() == 32; 1388 }) 1389 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1390 bitcastToVectorElement32(VecTypeIdx)) 1391 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1392 .bitcastIf( 1393 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1394 [=](const LegalityQuery &Query) { 1395 // For > 64-bit element types, try to turn this into a 64-bit 1396 // element vector since we may be able to do better indexing 1397 // if this is scalar. If not, fall back to 32. 1398 const LLT EltTy = Query.Types[EltTypeIdx]; 1399 const LLT VecTy = Query.Types[VecTypeIdx]; 1400 const unsigned DstEltSize = EltTy.getSizeInBits(); 1401 const unsigned VecSize = VecTy.getSizeInBits(); 1402 1403 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1404 return std::make_pair( 1405 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1406 }) 1407 .clampScalar(EltTypeIdx, S32, S64) 1408 .clampScalar(VecTypeIdx, S32, S64) 1409 .clampScalar(IdxTypeIdx, S32, S32) 1410 .clampMaxNumElements(VecTypeIdx, S32, 32) 1411 // TODO: Clamp elements for 64-bit vectors? 1412 // It should only be necessary with variable indexes. 
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ?
1 : 0; 1503 1504 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1505 const LLT Ty = Query.Types[TypeIdx]; 1506 if (Ty.isVector()) { 1507 const LLT &EltTy = Ty.getElementType(); 1508 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1509 return true; 1510 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1511 return true; 1512 } 1513 return false; 1514 }; 1515 1516 auto &Builder = getActionDefinitionsBuilder(Op) 1517 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1518 .lowerFor({{S16, V2S16}}) 1519 .lowerIf([=](const LegalityQuery &Query) { 1520 const LLT BigTy = Query.Types[BigTyIdx]; 1521 return BigTy.getSizeInBits() == 32; 1522 }) 1523 // Try to widen to s16 first for small types. 1524 // TODO: Only do this on targets with legal s16 shifts 1525 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1526 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1527 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1528 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1529 elementTypeIs(1, S16)), 1530 changeTo(1, V2S16)) 1531 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1532 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1533 // valid. 1534 .clampScalar(LitTyIdx, S32, S512) 1535 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1536 // Break up vectors with weird elements into scalars 1537 .fewerElementsIf( 1538 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1539 scalarize(0)) 1540 .fewerElementsIf( 1541 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1542 scalarize(1)) 1543 .clampScalar(BigTyIdx, S32, MaxScalar); 1544 1545 if (Op == G_MERGE_VALUES) { 1546 Builder.widenScalarIf( 1547 // TODO: Use 16-bit shifts if legal for 8-bit values? 1548 [=](const LegalityQuery &Query) { 1549 const LLT Ty = Query.Types[LitTyIdx]; 1550 return Ty.getSizeInBits() < 32; 1551 }, 1552 changeTo(LitTyIdx, S32)); 1553 } 1554 1555 Builder.widenScalarIf( 1556 [=](const LegalityQuery &Query) { 1557 const LLT Ty = Query.Types[BigTyIdx]; 1558 return !isPowerOf2_32(Ty.getSizeInBits()) && 1559 Ty.getSizeInBits() % 16 != 0; 1560 }, 1561 [=](const LegalityQuery &Query) { 1562 // Pick the next power of 2, or a multiple of 64 over 128. 1563 // Whichever is smaller. 1564 const LLT &Ty = Query.Types[BigTyIdx]; 1565 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1566 if (NewSizeInBits >= 256) { 1567 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1568 if (RoundedTo < NewSizeInBits) 1569 NewSizeInBits = RoundedTo; 1570 } 1571 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1572 }) 1573 // Any vectors left are the wrong size. Scalarize them. 1574 .scalarize(0) 1575 .scalarize(1); 1576 } 1577 1578 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1579 // RegBankSelect. 1580 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1581 .legalFor({{S32}, {S64}}); 1582 1583 if (ST.hasVOP3PInsts()) { 1584 SextInReg.lowerFor({{V2S16}}) 1585 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1586 // get more vector shift opportunities, since we'll get those when 1587 // expanded. 1588 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1589 } else if (ST.has16BitInsts()) { 1590 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1591 } else { 1592 // Prefer to promote to s32 before lowering if we don't have 16-bit 1593 // shifts. 
    // This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return
legalizeUDIV_UREM(MI, MRI, B); 1706 case TargetOpcode::G_SDIV: 1707 case TargetOpcode::G_SREM: 1708 return legalizeSDIV_SREM(MI, MRI, B); 1709 case TargetOpcode::G_ATOMIC_CMPXCHG: 1710 return legalizeAtomicCmpXChg(MI, MRI, B); 1711 case TargetOpcode::G_FLOG: 1712 return legalizeFlog(MI, B, numbers::ln2f); 1713 case TargetOpcode::G_FLOG10: 1714 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1715 case TargetOpcode::G_FEXP: 1716 return legalizeFExp(MI, B); 1717 case TargetOpcode::G_FPOW: 1718 return legalizeFPow(MI, B); 1719 case TargetOpcode::G_FFLOOR: 1720 return legalizeFFloor(MI, MRI, B); 1721 case TargetOpcode::G_BUILD_VECTOR: 1722 return legalizeBuildVector(MI, MRI, B); 1723 default: 1724 return false; 1725 } 1726 1727 llvm_unreachable("expected switch to return"); 1728 } 1729 1730 Register AMDGPULegalizerInfo::getSegmentAperture( 1731 unsigned AS, 1732 MachineRegisterInfo &MRI, 1733 MachineIRBuilder &B) const { 1734 MachineFunction &MF = B.getMF(); 1735 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1736 const LLT S32 = LLT::scalar(32); 1737 1738 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1739 1740 if (ST.hasApertureRegs()) { 1741 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1742 // getreg. 1743 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1744 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1745 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1746 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1747 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1748 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1749 unsigned Encoding = 1750 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1751 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1752 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1753 1754 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1755 1756 B.buildInstr(AMDGPU::S_GETREG_B32) 1757 .addDef(GetReg) 1758 .addImm(Encoding); 1759 MRI.setType(GetReg, S32); 1760 1761 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1762 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1763 } 1764 1765 Register QueuePtr = MRI.createGenericVirtualRegister( 1766 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1767 1768 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1769 return Register(); 1770 1771 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1772 // private_segment_aperture_base_hi. 1773 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1774 1775 // TODO: can we be smarter about machine pointer info? 
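// A rough sketch of the sequence built below for this queue-pointer path
// (register names are illustrative only, not actual output):
//   %addr:_(p4)      = G_PTR_ADD %queueptr, StructOffset
//   %aperture:_(s32) = G_LOAD %addr :: (dereferenceable invariant load 4)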
1776 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1777 MachineMemOperand *MMO = MF.getMachineMemOperand( 1778 PtrInfo, 1779 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1780 MachineMemOperand::MOInvariant, 1781 4, commonAlignment(Align(64), StructOffset)); 1782 1783 Register LoadAddr; 1784 1785 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1786 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1787 } 1788 1789 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1790 MachineInstr &MI, MachineRegisterInfo &MRI, 1791 MachineIRBuilder &B) const { 1792 MachineFunction &MF = B.getMF(); 1793 1794 const LLT S32 = LLT::scalar(32); 1795 Register Dst = MI.getOperand(0).getReg(); 1796 Register Src = MI.getOperand(1).getReg(); 1797 1798 LLT DstTy = MRI.getType(Dst); 1799 LLT SrcTy = MRI.getType(Src); 1800 unsigned DestAS = DstTy.getAddressSpace(); 1801 unsigned SrcAS = SrcTy.getAddressSpace(); 1802 1803 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1804 // vector element. 1805 assert(!DstTy.isVector()); 1806 1807 const AMDGPUTargetMachine &TM 1808 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1809 1810 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1811 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1812 return true; 1813 } 1814 1815 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1816 // Truncate. 1817 B.buildExtract(Dst, Src, 0); 1818 MI.eraseFromParent(); 1819 return true; 1820 } 1821 1822 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1823 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1824 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1825 1826 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1827 // another. Merge operands are required to be the same type, but creating an 1828 // extra ptrtoint would be kind of pointless. 1829 auto HighAddr = B.buildConstant( 1830 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1831 B.buildMerge(Dst, {Src, HighAddr}); 1832 MI.eraseFromParent(); 1833 return true; 1834 } 1835 1836 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1837 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1838 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1839 unsigned NullVal = TM.getNullPointerValue(DestAS); 1840 1841 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1842 auto FlatNull = B.buildConstant(SrcTy, 0); 1843 1844 // Extract low 32-bits of the pointer. 1845 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1846 1847 auto CmpRes = 1848 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1849 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1850 1851 MI.eraseFromParent(); 1852 return true; 1853 } 1854 1855 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1856 return false; 1857 1858 if (!ST.hasFlatAddressSpace()) 1859 return false; 1860 1861 auto SegmentNull = 1862 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1863 auto FlatNull = 1864 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1865 1866 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1867 if (!ApertureReg.isValid()) 1868 return false; 1869 1870 auto CmpRes = 1871 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1872 1873 // Coerce the type of the low half of the result so we can use merge_values. 
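// Roughly, the local/private -> flat expansion built here is (sketch with
// illustrative register names):
//   %lo:_(s32)       = G_PTRTOINT %src
//   %flat            = G_MERGE_VALUES %lo, %aperture
//   %isnonnull:_(s1) = G_ICMP ne %src, <segment null>
//   %dst             = G_SELECT %isnonnull, %flat, <flat null>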
1874 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1875 1876 // TODO: Should we allow mismatched types but matching sizes in merges to 1877 // avoid the ptrtoint? 1878 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1879 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1880 1881 MI.eraseFromParent(); 1882 return true; 1883 } 1884 1885 bool AMDGPULegalizerInfo::legalizeFrint( 1886 MachineInstr &MI, MachineRegisterInfo &MRI, 1887 MachineIRBuilder &B) const { 1888 Register Src = MI.getOperand(1).getReg(); 1889 LLT Ty = MRI.getType(Src); 1890 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1891 1892 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1893 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1894 1895 auto C1 = B.buildFConstant(Ty, C1Val); 1896 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1897 1898 // TODO: Should this propagate fast-math-flags? 1899 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1900 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1901 1902 auto C2 = B.buildFConstant(Ty, C2Val); 1903 auto Fabs = B.buildFAbs(Ty, Src); 1904 1905 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1906 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1907 MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeFceil( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 1915 const LLT S1 = LLT::scalar(1); 1916 const LLT S64 = LLT::scalar(64); 1917 1918 Register Src = MI.getOperand(1).getReg(); 1919 assert(MRI.getType(Src) == S64); 1920 1921 // result = trunc(src) 1922 // if (src > 0.0 && src != result) 1923 // result += 1.0 1924 1925 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1926 1927 const auto Zero = B.buildFConstant(S64, 0.0); 1928 const auto One = B.buildFConstant(S64, 1.0); 1929 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1930 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1931 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1932 auto Add = B.buildSelect(S64, And, One, Zero); 1933 1934 // TODO: Should this propagate fast-math-flags? 
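// Worked example of the sequence above: for ceil(2.3), Trunc = 2.0, both
// compares hold, so the select yields 1.0 and the result is 3.0. For
// ceil(-2.3), Trunc = -2.0, the > 0.0 check fails, 0.0 is added, and -2.0
// is returned.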
1935 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1936 return true; 1937 } 1938 1939 bool AMDGPULegalizerInfo::legalizeFrem( 1940 MachineInstr &MI, MachineRegisterInfo &MRI, 1941 MachineIRBuilder &B) const { 1942 Register DstReg = MI.getOperand(0).getReg(); 1943 Register Src0Reg = MI.getOperand(1).getReg(); 1944 Register Src1Reg = MI.getOperand(2).getReg(); 1945 auto Flags = MI.getFlags(); 1946 LLT Ty = MRI.getType(DstReg); 1947 1948 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1949 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1950 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1951 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1952 MI.eraseFromParent(); 1953 return true; 1954 } 1955 1956 static MachineInstrBuilder extractF64Exponent(Register Hi, 1957 MachineIRBuilder &B) { 1958 const unsigned FractBits = 52; 1959 const unsigned ExpBits = 11; 1960 LLT S32 = LLT::scalar(32); 1961 1962 auto Const0 = B.buildConstant(S32, FractBits - 32); 1963 auto Const1 = B.buildConstant(S32, ExpBits); 1964 1965 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1966 .addUse(Hi) 1967 .addUse(Const0.getReg(0)) 1968 .addUse(Const1.getReg(0)); 1969 1970 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1971 } 1972 1973 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1974 MachineInstr &MI, MachineRegisterInfo &MRI, 1975 MachineIRBuilder &B) const { 1976 const LLT S1 = LLT::scalar(1); 1977 const LLT S32 = LLT::scalar(32); 1978 const LLT S64 = LLT::scalar(64); 1979 1980 Register Src = MI.getOperand(1).getReg(); 1981 assert(MRI.getType(Src) == S64); 1982 1983 // TODO: Should this use extract since the low half is unused? 1984 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1985 Register Hi = Unmerge.getReg(1); 1986 1987 // Extract the upper half, since this is where we will find the sign and 1988 // exponent. 1989 auto Exp = extractF64Exponent(Hi, B); 1990 1991 const unsigned FractBits = 52; 1992 1993 // Extract the sign bit. 1994 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1995 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1996 1997 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1998 1999 const auto Zero32 = B.buildConstant(S32, 0); 2000 2001 // Extend back to 64-bits. 2002 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2003 2004 auto Shr = B.buildAShr(S64, FractMask, Exp); 2005 auto Not = B.buildNot(S64, Shr); 2006 auto Tmp0 = B.buildAnd(S64, Src, Not); 2007 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2008 2009 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2010 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2011 2012 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2013 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2014 MI.eraseFromParent(); 2015 return true; 2016 } 2017 2018 bool AMDGPULegalizerInfo::legalizeITOFP( 2019 MachineInstr &MI, MachineRegisterInfo &MRI, 2020 MachineIRBuilder &B, bool Signed) const { 2021 2022 Register Dst = MI.getOperand(0).getReg(); 2023 Register Src = MI.getOperand(1).getReg(); 2024 2025 const LLT S64 = LLT::scalar(64); 2026 const LLT S32 = LLT::scalar(32); 2027 2028 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2029 2030 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2031 2032 auto CvtHi = Signed ? 
2033 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2034 B.buildUITOFP(S64, Unmerge.getReg(1)); 2035 2036 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2037 2038 auto ThirtyTwo = B.buildConstant(S32, 32); 2039 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2040 .addUse(CvtHi.getReg(0)) 2041 .addUse(ThirtyTwo.getReg(0)); 2042 2043 // TODO: Should this propagate fast-math-flags? 2044 B.buildFAdd(Dst, LdExp, CvtLo); 2045 MI.eraseFromParent(); 2046 return true; 2047 } 2048 2049 // TODO: Copied from DAG implementation. Verify logic and document how this 2050 // actually works. 2051 bool AMDGPULegalizerInfo::legalizeFPTOI( 2052 MachineInstr &MI, MachineRegisterInfo &MRI, 2053 MachineIRBuilder &B, bool Signed) const { 2054 2055 Register Dst = MI.getOperand(0).getReg(); 2056 Register Src = MI.getOperand(1).getReg(); 2057 2058 const LLT S64 = LLT::scalar(64); 2059 const LLT S32 = LLT::scalar(32); 2060 2061 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2062 2063 unsigned Flags = MI.getFlags(); 2064 2065 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2066 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2067 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2068 2069 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2070 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2071 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2072 2073 auto Hi = Signed ? 2074 B.buildFPTOSI(S32, FloorMul) : 2075 B.buildFPTOUI(S32, FloorMul); 2076 auto Lo = B.buildFPTOUI(S32, Fma); 2077 2078 B.buildMerge(Dst, { Lo, Hi }); 2079 MI.eraseFromParent(); 2080 2081 return true; 2082 } 2083 2084 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2085 MachineInstr &MI) const { 2086 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2087 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2088 2089 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2090 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2091 2092 // With ieee_mode disabled, the instructions have the correct behavior 2093 // already for G_FMINNUM/G_FMAXNUM 2094 if (!MFI->getMode().IEEE) 2095 return !IsIEEEOp; 2096 2097 if (IsIEEEOp) 2098 return true; 2099 2100 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2101 } 2102 2103 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2104 MachineInstr &MI, MachineRegisterInfo &MRI, 2105 MachineIRBuilder &B) const { 2106 // TODO: Should move some of this into LegalizerHelper. 2107 2108 // TODO: Promote dynamic indexing of s16 to s32 2109 2110 // FIXME: Artifact combiner probably should have replaced the truncated 2111 // constant before this, so we shouldn't need 2112 // getConstantVRegValWithLookThrough. 2113 Optional<ValueAndVReg> MaybeIdxVal = 2114 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2115 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2116 return true; 2117 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2118 2119 Register Dst = MI.getOperand(0).getReg(); 2120 Register Vec = MI.getOperand(1).getReg(); 2121 2122 LLT VecTy = MRI.getType(Vec); 2123 LLT EltTy = VecTy.getElementType(); 2124 assert(EltTy == MRI.getType(Dst)); 2125 2126 if (IdxVal < VecTy.getNumElements()) 2127 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2128 else 2129 B.buildUndef(Dst); 2130 2131 MI.eraseFromParent(); 2132 return true; 2133 } 2134 2135 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2136 MachineInstr &MI, MachineRegisterInfo &MRI, 2137 MachineIRBuilder &B) const { 2138 // TODO: Should move some of this into LegalizerHelper. 2139 2140 // TODO: Promote dynamic indexing of s16 to s32 2141 2142 // FIXME: Artifact combiner probably should have replaced the truncated 2143 // constant before this, so we shouldn't need 2144 // getConstantVRegValWithLookThrough. 2145 Optional<ValueAndVReg> MaybeIdxVal = 2146 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2147 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2148 return true; 2149 2150 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2151 Register Dst = MI.getOperand(0).getReg(); 2152 Register Vec = MI.getOperand(1).getReg(); 2153 Register Ins = MI.getOperand(2).getReg(); 2154 2155 LLT VecTy = MRI.getType(Vec); 2156 LLT EltTy = VecTy.getElementType(); 2157 assert(EltTy == MRI.getType(Ins)); 2158 2159 if (IdxVal < VecTy.getNumElements()) 2160 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2161 else 2162 B.buildUndef(Dst); 2163 2164 MI.eraseFromParent(); 2165 return true; 2166 } 2167 2168 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2169 MachineInstr &MI, MachineRegisterInfo &MRI, 2170 MachineIRBuilder &B) const { 2171 const LLT V2S16 = LLT::vector(2, 16); 2172 2173 Register Dst = MI.getOperand(0).getReg(); 2174 Register Src0 = MI.getOperand(1).getReg(); 2175 LLT DstTy = MRI.getType(Dst); 2176 LLT SrcTy = MRI.getType(Src0); 2177 2178 if (SrcTy == V2S16 && DstTy == V2S16 && 2179 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2180 return true; 2181 2182 MachineIRBuilder HelperBuilder(MI); 2183 GISelObserverWrapper DummyObserver; 2184 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2185 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2186 } 2187 2188 bool AMDGPULegalizerInfo::legalizeSinCos( 2189 MachineInstr &MI, MachineRegisterInfo &MRI, 2190 MachineIRBuilder &B) const { 2191 2192 Register DstReg = MI.getOperand(0).getReg(); 2193 Register SrcReg = MI.getOperand(1).getReg(); 2194 LLT Ty = MRI.getType(DstReg); 2195 unsigned Flags = MI.getFlags(); 2196 2197 Register TrigVal; 2198 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2199 if (ST.hasTrigReducedRange()) { 2200 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2201 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2202 .addUse(MulVal.getReg(0)) 2203 .setMIFlags(Flags).getReg(0); 2204 } else 2205 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2206 2207 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2208 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2209 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2210 .addUse(TrigVal) 2211 .setMIFlags(Flags); 2212 MI.eraseFromParent(); 2213 return true; 2214 } 2215 2216 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2217 MachineIRBuilder &B, 2218 const GlobalValue *GV, 2219 int64_t Offset, 2220 unsigned GAFlags) const { 2221 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2222 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2223 // to the following code sequence: 2224 // 2225 // For constant address space: 2226 // s_getpc_b64 s[0:1] 2227 // s_add_u32 s0, s0, $symbol 2228 // s_addc_u32 s1, s1, 0 2229 // 2230 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2231 // a fixup or relocation is emitted to replace $symbol with a literal 2232 // constant, which is a pc-relative offset from the encoding of the $symbol 2233 // operand to the global variable. 2234 // 2235 // For global address space: 2236 // s_getpc_b64 s[0:1] 2237 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2238 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2239 // 2240 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2241 // fixups or relocations are emitted to replace $symbol@*@lo and 2242 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2243 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2244 // operand to the global variable. 2245 // 2246 // What we want here is an offset from the value returned by s_getpc 2247 // (which is the address of the s_add_u32 instruction) to the global 2248 // variable, but since the encoding of $symbol starts 4 bytes after the start 2249 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2250 // small. This requires us to add 4 to the global variable offset in order to 2251 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2252 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2253 // instruction. 2254 2255 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2256 2257 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg :
2258 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2259
2260 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2261 .addDef(PCReg);
2262
2263 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2264 if (GAFlags == SIInstrInfo::MO_NONE)
2265 MIB.addImm(0);
2266 else
2267 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2268
2269 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2270
2271 if (PtrTy.getSizeInBits() == 32)
2272 B.buildExtract(DstReg, PCReg, 0);
2273 return true;
2274 }
2275
2276 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2277 MachineInstr &MI, MachineRegisterInfo &MRI,
2278 MachineIRBuilder &B) const {
2279 Register DstReg = MI.getOperand(0).getReg();
2280 LLT Ty = MRI.getType(DstReg);
2281 unsigned AS = Ty.getAddressSpace();
2282
2283 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2284 MachineFunction &MF = B.getMF();
2285 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2286
2287 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2288 if (!MFI->isModuleEntryFunction()) {
2289 const Function &Fn = MF.getFunction();
2290 DiagnosticInfoUnsupported BadLDSDecl(
2291 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2292 DS_Warning);
2293 Fn.getContext().diagnose(BadLDSDecl);
2294
2295 // We currently don't have a way to correctly allocate LDS objects that
2296 // aren't directly associated with a kernel. We do force inlining of
2297 // functions that use local objects. However, if these dead functions are
2298 // not eliminated, we don't want a compile time error. Just emit a warning
2299 // and a trap, since there should be no callable path here.
2300 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2301 B.buildUndef(DstReg);
2302 MI.eraseFromParent();
2303 return true;
2304 }
2305
2306 // TODO: We could emit code to handle the initialization somewhere.
2307 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2308 const SITargetLowering *TLI = ST.getTargetLowering();
2309 if (!TLI->shouldUseLDSConstAddress(GV)) {
2310 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2311 return true; // Leave in place.
2312 }
2313
2314 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2315 Type *Ty = GV->getValueType();
2316 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
2317 // zero-sized type in other languages) to declare dynamic shared
2318 // memory whose size is not known at compile time. Such arrays are
2319 // allocated by the runtime and placed directly after the statically
2320 // allocated ones, so they all share the same offset.
2321 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2322 // Adjust alignment for that dynamic shared memory array.
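// Illustrative example of the source-level case handled here (hypothetical
// HIP-style input): `extern __shared__ int s[];`. The zero-sized global is
// replaced below by roughly
//   %staticsize:_(s32) = llvm.amdgcn.groupstaticsize
//   %dst:_(p3)         = G_INTTOPTR %staticsize
// so the dynamic array begins right after the statically allocated LDS.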
2323 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2324 LLT S32 = LLT::scalar(32); 2325 auto Sz = 2326 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2327 B.buildIntToPtr(DstReg, Sz); 2328 MI.eraseFromParent(); 2329 return true; 2330 } 2331 } 2332 2333 B.buildConstant( 2334 DstReg, 2335 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2336 MI.eraseFromParent(); 2337 return true; 2338 } 2339 2340 const Function &Fn = MF.getFunction(); 2341 DiagnosticInfoUnsupported BadInit( 2342 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2343 Fn.getContext().diagnose(BadInit); 2344 return true; 2345 } 2346 2347 const SITargetLowering *TLI = ST.getTargetLowering(); 2348 2349 if (TLI->shouldEmitFixup(GV)) { 2350 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2351 MI.eraseFromParent(); 2352 return true; 2353 } 2354 2355 if (TLI->shouldEmitPCReloc(GV)) { 2356 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2357 MI.eraseFromParent(); 2358 return true; 2359 } 2360 2361 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2362 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2363 2364 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2365 MachinePointerInfo::getGOT(MF), 2366 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2367 MachineMemOperand::MOInvariant, 2368 8 /*Size*/, Align(8)); 2369 2370 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2371 2372 if (Ty.getSizeInBits() == 32) { 2373 // Truncate if this is a 32-bit constant adrdess. 2374 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2375 B.buildExtract(DstReg, Load, 0); 2376 } else 2377 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2378 2379 MI.eraseFromParent(); 2380 return true; 2381 } 2382 2383 static LLT widenToNextPowerOf2(LLT Ty) { 2384 if (Ty.isVector()) 2385 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2386 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2387 } 2388 2389 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2390 MachineInstr &MI) const { 2391 MachineIRBuilder &B = Helper.MIRBuilder; 2392 MachineRegisterInfo &MRI = *B.getMRI(); 2393 GISelChangeObserver &Observer = Helper.Observer; 2394 2395 Register PtrReg = MI.getOperand(1).getReg(); 2396 LLT PtrTy = MRI.getType(PtrReg); 2397 unsigned AddrSpace = PtrTy.getAddressSpace(); 2398 2399 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2400 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2401 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2402 Observer.changingInstr(MI); 2403 MI.getOperand(1).setReg(Cast.getReg(0)); 2404 Observer.changedInstr(MI); 2405 return true; 2406 } 2407 2408 Register ValReg = MI.getOperand(0).getReg(); 2409 LLT ValTy = MRI.getType(ValReg); 2410 2411 MachineMemOperand *MMO = *MI.memoperands_begin(); 2412 const unsigned ValSize = ValTy.getSizeInBits(); 2413 const unsigned MemSize = 8 * MMO->getSize(); 2414 const Align MemAlign = MMO->getAlign(); 2415 const unsigned AlignInBits = 8 * MemAlign.value(); 2416 2417 // Widen non-power-of-2 loads to the alignment if needed 2418 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2419 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2420 2421 // This was already the correct extending load result type, so just adjust 2422 // the memory type. 
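// For example (a sketch, assuming shouldWidenLoad accepts it): a 6-byte
// extending load
//   %val:_(s64) = G_LOAD %ptr :: (load 6)
// already produces the widened value type, so only the memory size is
// rounded up from 6 to 8 bytes.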
2423 if (WideMemSize == ValSize) {
2424 MachineFunction &MF = B.getMF();
2425
2426 MachineMemOperand *WideMMO =
2427 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2428 Observer.changingInstr(MI);
2429 MI.setMemRefs(MF, {WideMMO});
2430 Observer.changedInstr(MI);
2431 return true;
2432 }
2433
2434 // Don't bother handling an edge case that should probably never be produced.
2435 if (ValSize > WideMemSize)
2436 return false;
2437
2438 LLT WideTy = widenToNextPowerOf2(ValTy);
2439
2440 Register WideLoad;
2441 if (!WideTy.isVector()) {
2442 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2443 B.buildTrunc(ValReg, WideLoad).getReg(0);
2444 } else {
2445 // Extract the subvector.
2446
2447 if (isRegisterType(ValTy)) {
2448 // If this is a case where G_EXTRACT is legal, use it.
2449 // (e.g. <3 x s32> -> <4 x s32>)
2450 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2451 B.buildExtract(ValReg, WideLoad, 0);
2452 } else {
2453 // For cases where the widened type isn't a nice register value, unmerge
2454 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2455 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2456 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2457 B.setInsertPt(B.getMBB(), MI.getIterator());
2458 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2459 }
2460 }
2461
2462 MI.eraseFromParent();
2463 return true;
2464 }
2465
2466 return false;
2467 }
2468
2469 bool AMDGPULegalizerInfo::legalizeFMad(
2470 MachineInstr &MI, MachineRegisterInfo &MRI,
2471 MachineIRBuilder &B) const {
2472 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2473 assert(Ty.isScalar());
2474
2475 MachineFunction &MF = B.getMF();
2476 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2477
2478 // TODO: Always legal with future ftz flag.
2479 // FIXME: Do we need just output?
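// When G_FMAD is not kept legal here, LegalizerHelper::lowerFMad splits it,
// roughly:
//   %t:_(sN) = G_FMUL %a, %b
//   %d:_(sN) = G_FADD %t, %c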
2480 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2481 return true; 2482 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2483 return true; 2484 2485 MachineIRBuilder HelperBuilder(MI); 2486 GISelObserverWrapper DummyObserver; 2487 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2488 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2489 } 2490 2491 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2492 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2493 Register DstReg = MI.getOperand(0).getReg(); 2494 Register PtrReg = MI.getOperand(1).getReg(); 2495 Register CmpVal = MI.getOperand(2).getReg(); 2496 Register NewVal = MI.getOperand(3).getReg(); 2497 2498 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2499 "this should not have been custom lowered"); 2500 2501 LLT ValTy = MRI.getType(CmpVal); 2502 LLT VecTy = LLT::vector(2, ValTy); 2503 2504 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2505 2506 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2507 .addDef(DstReg) 2508 .addUse(PtrReg) 2509 .addUse(PackedVal) 2510 .setMemRefs(MI.memoperands()); 2511 2512 MI.eraseFromParent(); 2513 return true; 2514 } 2515 2516 bool AMDGPULegalizerInfo::legalizeFlog( 2517 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2518 Register Dst = MI.getOperand(0).getReg(); 2519 Register Src = MI.getOperand(1).getReg(); 2520 LLT Ty = B.getMRI()->getType(Dst); 2521 unsigned Flags = MI.getFlags(); 2522 2523 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2524 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2525 2526 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2527 MI.eraseFromParent(); 2528 return true; 2529 } 2530 2531 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2532 MachineIRBuilder &B) const { 2533 Register Dst = MI.getOperand(0).getReg(); 2534 Register Src = MI.getOperand(1).getReg(); 2535 unsigned Flags = MI.getFlags(); 2536 LLT Ty = B.getMRI()->getType(Dst); 2537 2538 auto K = B.buildFConstant(Ty, numbers::log2e); 2539 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2540 B.buildFExp2(Dst, Mul, Flags); 2541 MI.eraseFromParent(); 2542 return true; 2543 } 2544 2545 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2546 MachineIRBuilder &B) const { 2547 Register Dst = MI.getOperand(0).getReg(); 2548 Register Src0 = MI.getOperand(1).getReg(); 2549 Register Src1 = MI.getOperand(2).getReg(); 2550 unsigned Flags = MI.getFlags(); 2551 LLT Ty = B.getMRI()->getType(Dst); 2552 const LLT S16 = LLT::scalar(16); 2553 const LLT S32 = LLT::scalar(32); 2554 2555 if (Ty == S32) { 2556 auto Log = B.buildFLog2(S32, Src0, Flags); 2557 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2558 .addUse(Log.getReg(0)) 2559 .addUse(Src1) 2560 .setMIFlags(Flags); 2561 B.buildFExp2(Dst, Mul, Flags); 2562 } else if (Ty == S16) { 2563 // There's no f16 fmul_legacy, so we need to convert for it. 
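// Sketch of the f16 expansion built below:
//   %log:_(s16)  = G_FLOG2 %src0
//   %ext0:_(s32) = G_FPEXT %log
//   %ext1:_(s32) = G_FPEXT %src1
//   %mul:_(s32)  = llvm.amdgcn.fmul.legacy %ext0, %ext1
//   %dst:_(s16)  = G_FEXP2 (G_FPTRUNC %mul)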
2564 auto Log = B.buildFLog2(S16, Src0, Flags); 2565 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2566 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2567 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2568 .addUse(Ext0.getReg(0)) 2569 .addUse(Ext1.getReg(0)) 2570 .setMIFlags(Flags); 2571 2572 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2573 } else 2574 return false; 2575 2576 MI.eraseFromParent(); 2577 return true; 2578 } 2579 2580 // Find a source register, ignoring any possible source modifiers. 2581 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2582 Register ModSrc = OrigSrc; 2583 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2584 ModSrc = SrcFNeg->getOperand(1).getReg(); 2585 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2586 ModSrc = SrcFAbs->getOperand(1).getReg(); 2587 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2588 ModSrc = SrcFAbs->getOperand(1).getReg(); 2589 return ModSrc; 2590 } 2591 2592 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2593 MachineRegisterInfo &MRI, 2594 MachineIRBuilder &B) const { 2595 2596 const LLT S1 = LLT::scalar(1); 2597 const LLT S64 = LLT::scalar(64); 2598 Register Dst = MI.getOperand(0).getReg(); 2599 Register OrigSrc = MI.getOperand(1).getReg(); 2600 unsigned Flags = MI.getFlags(); 2601 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2602 "this should not have been custom lowered"); 2603 2604 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2605 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2606 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2607 // V_FRACT bug is: 2608 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2609 // 2610 // Convert floor(x) to (x - fract(x)) 2611 2612 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2613 .addUse(OrigSrc) 2614 .setMIFlags(Flags); 2615 2616 // Give source modifier matching some assistance before obscuring a foldable 2617 // pattern. 2618 2619 // TODO: We can avoid the neg on the fract? The input sign to fract 2620 // shouldn't matter? 2621 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2622 2623 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2624 2625 Register Min = MRI.createGenericVirtualRegister(S64); 2626 2627 // We don't need to concern ourselves with the snan handling difference, so 2628 // use the one which will directly select. 2629 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2630 if (MFI->getMode().IEEE) 2631 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2632 else 2633 B.buildFMinNum(Min, Fract, Const, Flags); 2634 2635 Register CorrectedFract = Min; 2636 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2637 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2638 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2639 } 2640 2641 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2642 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2643 2644 MI.eraseFromParent(); 2645 return true; 2646 } 2647 2648 // Turn an illegal packed v2s16 build vector into bit operations. 2649 // TODO: This should probably be a bitcast action in LegalizerHelper. 
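// e.g. (sketch):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo:_(s16), %hi:_(s16)
// is rewritten below as
//   %m:_(s32)       = G_MERGE_VALUES %lo, %hi
//   %v:_(<2 x s16>) = G_BITCAST %m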
2650 bool AMDGPULegalizerInfo::legalizeBuildVector( 2651 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2652 Register Dst = MI.getOperand(0).getReg(); 2653 const LLT S32 = LLT::scalar(32); 2654 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2655 2656 Register Src0 = MI.getOperand(1).getReg(); 2657 Register Src1 = MI.getOperand(2).getReg(); 2658 assert(MRI.getType(Src0) == LLT::scalar(16)); 2659 2660 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2661 B.buildBitcast(Dst, Merge); 2662 2663 MI.eraseFromParent(); 2664 return true; 2665 } 2666 2667 // Check that this is a G_XOR x, -1 2668 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2669 if (MI.getOpcode() != TargetOpcode::G_XOR) 2670 return false; 2671 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2672 return ConstVal && *ConstVal == -1; 2673 } 2674 2675 // Return the use branch instruction, otherwise null if the usage is invalid. 2676 static MachineInstr * 2677 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2678 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2679 Register CondDef = MI.getOperand(0).getReg(); 2680 if (!MRI.hasOneNonDBGUse(CondDef)) 2681 return nullptr; 2682 2683 MachineBasicBlock *Parent = MI.getParent(); 2684 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2685 2686 if (isNot(MRI, *UseMI)) { 2687 Register NegatedCond = UseMI->getOperand(0).getReg(); 2688 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2689 return nullptr; 2690 2691 // We're deleting the def of this value, so we need to remove it. 2692 UseMI->eraseFromParent(); 2693 2694 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2695 Negated = true; 2696 } 2697 2698 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2699 return nullptr; 2700 2701 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2702 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2703 if (Next == Parent->end()) { 2704 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2705 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2706 return nullptr; 2707 UncondBrTarget = &*NextMBB; 2708 } else { 2709 if (Next->getOpcode() != AMDGPU::G_BR) 2710 return nullptr; 2711 Br = &*Next; 2712 UncondBrTarget = Br->getOperand(0).getMBB(); 2713 } 2714 2715 return UseMI; 2716 } 2717 2718 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2719 const ArgDescriptor *Arg, 2720 const TargetRegisterClass *ArgRC, 2721 LLT ArgTy) const { 2722 MCRegister SrcReg = Arg->getRegister(); 2723 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2724 assert(DstReg.isVirtual() && "Virtual register expected"); 2725 2726 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2727 ArgTy); 2728 if (Arg->isMasked()) { 2729 // TODO: Should we try to emit this once in the entry block? 
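// Sketch for a masked argument (the mask value is an assumption for
// illustration, e.g. a 10-bit ID packed at bit 20, Mask = 0x3ff << 20):
// Shift would be 20 and the result below is effectively
//   (LiveIn >> 20) & 0x3ff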
2730 const LLT S32 = LLT::scalar(32); 2731 const unsigned Mask = Arg->getMask(); 2732 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2733 2734 Register AndMaskSrc = LiveIn; 2735 2736 if (Shift != 0) { 2737 auto ShiftAmt = B.buildConstant(S32, Shift); 2738 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2739 } 2740 2741 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2742 } else { 2743 B.buildCopy(DstReg, LiveIn); 2744 } 2745 2746 return true; 2747 } 2748 2749 bool AMDGPULegalizerInfo::loadInputValue( 2750 Register DstReg, MachineIRBuilder &B, 2751 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2752 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2753 const ArgDescriptor *Arg; 2754 const TargetRegisterClass *ArgRC; 2755 LLT ArgTy; 2756 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2757 2758 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2759 return false; // TODO: Handle these 2760 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2761 } 2762 2763 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2764 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2765 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2766 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2767 return false; 2768 2769 MI.eraseFromParent(); 2770 return true; 2771 } 2772 2773 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2774 MachineRegisterInfo &MRI, 2775 MachineIRBuilder &B) const { 2776 Register Dst = MI.getOperand(0).getReg(); 2777 LLT DstTy = MRI.getType(Dst); 2778 LLT S16 = LLT::scalar(16); 2779 LLT S32 = LLT::scalar(32); 2780 LLT S64 = LLT::scalar(64); 2781 2782 if (DstTy == S16) 2783 return legalizeFDIV16(MI, MRI, B); 2784 if (DstTy == S32) 2785 return legalizeFDIV32(MI, MRI, B); 2786 if (DstTy == S64) 2787 return legalizeFDIV64(MI, MRI, B); 2788 2789 return false; 2790 } 2791 2792 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2793 Register DstReg, 2794 Register X, 2795 Register Y, 2796 bool IsDiv) const { 2797 const LLT S1 = LLT::scalar(1); 2798 const LLT S32 = LLT::scalar(32); 2799 2800 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2801 // algorithm used here. 2802 2803 // Initial estimate of inv(y). 2804 auto FloatY = B.buildUITOFP(S32, Y); 2805 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2806 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2807 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2808 auto Z = B.buildFPTOUI(S32, ScaledY); 2809 2810 // One round of UNR. 2811 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2812 auto NegYZ = B.buildMul(S32, NegY, Z); 2813 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2814 2815 // Quotient/remainder estimate. 2816 auto Q = B.buildUMulH(S32, X, Z); 2817 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2818 2819 // First quotient/remainder refinement. 2820 auto One = B.buildConstant(S32, 1); 2821 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2822 if (IsDiv) 2823 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2824 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2825 2826 // Second quotient/remainder refinement. 
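// Each of these refinement steps is the branchless form of:
//   if (R >= Y) { Q += 1; R -= Y; }
// with the compare feeding the selects.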
2827 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2828 if (IsDiv) 2829 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2830 else 2831 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2832 } 2833 2834 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2835 MachineRegisterInfo &MRI, 2836 MachineIRBuilder &B) const { 2837 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2838 Register DstReg = MI.getOperand(0).getReg(); 2839 Register Num = MI.getOperand(1).getReg(); 2840 Register Den = MI.getOperand(2).getReg(); 2841 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2842 MI.eraseFromParent(); 2843 return true; 2844 } 2845 2846 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2847 // 2848 // Return lo, hi of result 2849 // 2850 // %cvt.lo = G_UITOFP Val.lo 2851 // %cvt.hi = G_UITOFP Val.hi 2852 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2853 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2854 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2855 // %mul2 = G_FMUL %mul1, 2**(-32) 2856 // %trunc = G_INTRINSIC_TRUNC %mul2 2857 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2858 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2859 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2860 Register Val) { 2861 const LLT S32 = LLT::scalar(32); 2862 auto Unmerge = B.buildUnmerge(S32, Val); 2863 2864 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2865 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2866 2867 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2868 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2869 2870 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2871 auto Mul1 = 2872 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2873 2874 // 2**(-32) 2875 auto Mul2 = 2876 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2877 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2878 2879 // -(2**32) 2880 auto Mad2 = B.buildFMAD(S32, Trunc, 2881 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2882 2883 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2884 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2885 2886 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2887 } 2888 2889 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2890 Register DstReg, 2891 Register Numer, 2892 Register Denom, 2893 bool IsDiv) const { 2894 const LLT S32 = LLT::scalar(32); 2895 const LLT S64 = LLT::scalar(64); 2896 const LLT S1 = LLT::scalar(1); 2897 Register RcpLo, RcpHi; 2898 2899 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2900 2901 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2902 2903 auto Zero64 = B.buildConstant(S64, 0); 2904 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2905 2906 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2907 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2908 2909 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2910 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2911 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2912 2913 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2914 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2915 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2916 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2917 2918 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2919 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2920 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2921 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2922 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2923 2924 auto Zero32 = 
B.buildConstant(S32, 0); 2925 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2926 auto Add2_HiC = 2927 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2928 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2929 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2930 2931 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2932 Register NumerLo = UnmergeNumer.getReg(0); 2933 Register NumerHi = UnmergeNumer.getReg(1); 2934 2935 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2936 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2937 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2938 Register Mul3_Lo = UnmergeMul3.getReg(0); 2939 Register Mul3_Hi = UnmergeMul3.getReg(1); 2940 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2941 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2942 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2943 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2944 2945 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2946 Register DenomLo = UnmergeDenom.getReg(0); 2947 Register DenomHi = UnmergeDenom.getReg(1); 2948 2949 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2950 auto C1 = B.buildSExt(S32, CmpHi); 2951 2952 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2953 auto C2 = B.buildSExt(S32, CmpLo); 2954 2955 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2956 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2957 2958 // TODO: Here and below portions of the code can be enclosed into if/endif. 2959 // Currently control flow is unconditional and we have 4 selects after 2960 // potential endif to substitute PHIs. 2961 2962 // if C3 != 0 ... 2963 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2964 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2965 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2966 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2967 2968 auto One64 = B.buildConstant(S64, 1); 2969 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2970 2971 auto C4 = 2972 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2973 auto C5 = 2974 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2975 auto C6 = B.buildSelect( 2976 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2977 2978 // if (C6 != 0) 2979 auto Add4 = B.buildAdd(S64, Add3, One64); 2980 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2981 2982 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2983 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2984 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2985 2986 // endif C6 2987 // endif C3 2988 2989 if (IsDiv) { 2990 auto Sel1 = B.buildSelect( 2991 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2992 B.buildSelect(DstReg, 2993 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2994 } else { 2995 auto Sel2 = B.buildSelect( 2996 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2997 B.buildSelect(DstReg, 2998 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2999 } 3000 } 3001 3002 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 3003 MachineRegisterInfo &MRI, 3004 MachineIRBuilder &B) const { 3005 const LLT S64 = LLT::scalar(64); 3006 const LLT S32 = LLT::scalar(32); 3007 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 3008 Register DstReg = MI.getOperand(0).getReg(); 3009 Register Num 
= MI.getOperand(1).getReg(); 3010 Register Den = MI.getOperand(2).getReg(); 3011 LLT Ty = MRI.getType(DstReg); 3012 3013 if (Ty == S32) 3014 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 3015 else if (Ty == S64) 3016 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 3017 else 3018 return false; 3019 3020 MI.eraseFromParent(); 3021 return true; 3022 3023 } 3024 3025 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 3026 MachineRegisterInfo &MRI, 3027 MachineIRBuilder &B) const { 3028 const LLT S64 = LLT::scalar(64); 3029 const LLT S32 = LLT::scalar(32); 3030 3031 Register DstReg = MI.getOperand(0).getReg(); 3032 const LLT Ty = MRI.getType(DstReg); 3033 if (Ty != S32 && Ty != S64) 3034 return false; 3035 3036 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3037 3038 Register LHS = MI.getOperand(1).getReg(); 3039 Register RHS = MI.getOperand(2).getReg(); 3040 3041 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3042 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3043 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3044 3045 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3046 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3047 3048 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3049 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3050 3051 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3052 if (Ty == S32) 3053 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3054 else 3055 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3056 3057 Register Sign; 3058 if (IsDiv) 3059 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3060 else 3061 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3062 3063 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 3064 B.buildSub(DstReg, UDivRem, Sign); 3065 3066 MI.eraseFromParent(); 3067 return true; 3068 } 3069 3070 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3071 MachineRegisterInfo &MRI, 3072 MachineIRBuilder &B) const { 3073 Register Res = MI.getOperand(0).getReg(); 3074 Register LHS = MI.getOperand(1).getReg(); 3075 Register RHS = MI.getOperand(2).getReg(); 3076 uint16_t Flags = MI.getFlags(); 3077 LLT ResTy = MRI.getType(Res); 3078 3079 const MachineFunction &MF = B.getMF(); 3080 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3081 MI.getFlag(MachineInstr::FmAfn); 3082 3083 if (!AllowInaccurateRcp) 3084 return false; 3085 3086 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3087 // 1 / x -> RCP(x) 3088 if (CLHS->isExactlyValue(1.0)) { 3089 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3090 .addUse(RHS) 3091 .setMIFlags(Flags); 3092 3093 MI.eraseFromParent(); 3094 return true; 3095 } 3096 3097 // -1 / x -> RCP( FNEG(x) ) 3098 if (CLHS->isExactlyValue(-1.0)) { 3099 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3100 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3101 .addUse(FNeg.getReg(0)) 3102 .setMIFlags(Flags); 3103 3104 MI.eraseFromParent(); 3105 return true; 3106 } 3107 } 3108 3109 // x / y -> x * (1.0 / y) 3110 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3111 .addUse(RHS) 3112 .setMIFlags(Flags); 3113 B.buildFMul(Res, LHS, RCP, Flags); 3114 3115 MI.eraseFromParent(); 3116 return true; 3117 } 3118 3119 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3120 MachineRegisterInfo &MRI, 3121 MachineIRBuilder &B) const { 3122 Register Res = MI.getOperand(0).getReg(); 3123 Register X = MI.getOperand(1).getReg(); 3124 Register Y = MI.getOperand(2).getReg(); 3125 uint16_t Flags = MI.getFlags(); 3126 
LLT ResTy = MRI.getType(Res); 3127 3128 const MachineFunction &MF = B.getMF(); 3129 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3130 MI.getFlag(MachineInstr::FmAfn); 3131 3132 if (!AllowInaccurateRcp) 3133 return false; 3134 3135 auto NegY = B.buildFNeg(ResTy, Y); 3136 auto One = B.buildFConstant(ResTy, 1.0); 3137 3138 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3139 .addUse(Y) 3140 .setMIFlags(Flags); 3141 3142 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3143 R = B.buildFMA(ResTy, Tmp0, R, R); 3144 3145 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3146 R = B.buildFMA(ResTy, Tmp1, R, R); 3147 3148 auto Ret = B.buildFMul(ResTy, X, R); 3149 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3150 3151 B.buildFMA(Res, Tmp2, R, Ret); 3152 MI.eraseFromParent(); 3153 return true; 3154 } 3155 3156 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3157 MachineRegisterInfo &MRI, 3158 MachineIRBuilder &B) const { 3159 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3160 return true; 3161 3162 Register Res = MI.getOperand(0).getReg(); 3163 Register LHS = MI.getOperand(1).getReg(); 3164 Register RHS = MI.getOperand(2).getReg(); 3165 3166 uint16_t Flags = MI.getFlags(); 3167 3168 LLT S16 = LLT::scalar(16); 3169 LLT S32 = LLT::scalar(32); 3170 3171 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3172 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3173 3174 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3175 .addUse(RHSExt.getReg(0)) 3176 .setMIFlags(Flags); 3177 3178 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3179 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3180 3181 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3182 .addUse(RDst.getReg(0)) 3183 .addUse(RHS) 3184 .addUse(LHS) 3185 .setMIFlags(Flags); 3186 3187 MI.eraseFromParent(); 3188 return true; 3189 } 3190 3191 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3192 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3193 static void toggleSPDenormMode(bool Enable, 3194 MachineIRBuilder &B, 3195 const GCNSubtarget &ST, 3196 AMDGPU::SIModeRegisterDefaults Mode) { 3197 // Set SP denorm mode to this value. 3198 unsigned SPDenormMode = 3199 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3200 3201 if (ST.hasDenormModeInst()) { 3202 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3203 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3204 3205 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3206 B.buildInstr(AMDGPU::S_DENORM_MODE) 3207 .addImm(NewDenormModeValue); 3208 3209 } else { 3210 // Select FP32 bit field in mode register. 
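// The field encoded below corresponds to hwreg(HW_REG_MODE, 4, 2): the two
// FP32 denorm-mode bits sit at offset 4 of the MODE register, and the
// WIDTH_M1 value of 1 encodes a width of 2.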
3211 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3212 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3213 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3214 3215 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3216 .addImm(SPDenormMode) 3217 .addImm(SPDenormModeBitField); 3218 } 3219 } 3220 3221 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3222 MachineRegisterInfo &MRI, 3223 MachineIRBuilder &B) const { 3224 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3225 return true; 3226 3227 Register Res = MI.getOperand(0).getReg(); 3228 Register LHS = MI.getOperand(1).getReg(); 3229 Register RHS = MI.getOperand(2).getReg(); 3230 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3231 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3232 3233 uint16_t Flags = MI.getFlags(); 3234 3235 LLT S32 = LLT::scalar(32); 3236 LLT S1 = LLT::scalar(1); 3237 3238 auto One = B.buildFConstant(S32, 1.0f); 3239 3240 auto DenominatorScaled = 3241 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3242 .addUse(LHS) 3243 .addUse(RHS) 3244 .addImm(0) 3245 .setMIFlags(Flags); 3246 auto NumeratorScaled = 3247 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3248 .addUse(LHS) 3249 .addUse(RHS) 3250 .addImm(1) 3251 .setMIFlags(Flags); 3252 3253 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3254 .addUse(DenominatorScaled.getReg(0)) 3255 .setMIFlags(Flags); 3256 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3257 3258 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3259 // aren't modeled as reading it. 3260 if (!Mode.allFP32Denormals()) 3261 toggleSPDenormMode(true, B, ST, Mode); 3262 3263 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3264 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3265 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3266 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3267 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3268 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3269 3270 if (!Mode.allFP32Denormals()) 3271 toggleSPDenormMode(false, B, ST, Mode); 3272 3273 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3274 .addUse(Fma4.getReg(0)) 3275 .addUse(Fma1.getReg(0)) 3276 .addUse(Fma3.getReg(0)) 3277 .addUse(NumeratorScaled.getReg(1)) 3278 .setMIFlags(Flags); 3279 3280 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3281 .addUse(Fmas.getReg(0)) 3282 .addUse(RHS) 3283 .addUse(LHS) 3284 .setMIFlags(Flags); 3285 3286 MI.eraseFromParent(); 3287 return true; 3288 } 3289 3290 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3291 MachineRegisterInfo &MRI, 3292 MachineIRBuilder &B) const { 3293 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3294 return true; 3295 3296 Register Res = MI.getOperand(0).getReg(); 3297 Register LHS = MI.getOperand(1).getReg(); 3298 Register RHS = MI.getOperand(2).getReg(); 3299 3300 uint16_t Flags = MI.getFlags(); 3301 3302 LLT S64 = LLT::scalar(64); 3303 LLT S1 = LLT::scalar(1); 3304 3305 auto One = B.buildFConstant(S64, 1.0); 3306 3307 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3308 .addUse(LHS) 3309 .addUse(RHS) 3310 .addImm(0) 3311 .setMIFlags(Flags); 3312 3313 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3314 3315 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3316 .addUse(DivScale0.getReg(0)) 3317 .setMIFlags(Flags); 3318 3319 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3320 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3321 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3322 3323 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3324 .addUse(LHS) 3325 .addUse(RHS) 3326 .addImm(1) 3327 .setMIFlags(Flags); 3328 3329 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3330 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3331 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3332 3333 Register Scale; 3334 if (!ST.hasUsableDivScaleConditionOutput()) { 3335 // Workaround a hardware bug on SI where the condition output from div_scale 3336 // is not usable. 3337 3338 LLT S32 = LLT::scalar(32); 3339 3340 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3341 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3342 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3343 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3344 3345 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3346 Scale1Unmerge.getReg(1)); 3347 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3348 Scale0Unmerge.getReg(1)); 3349 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3350 } else { 3351 Scale = DivScale1.getReg(1); 3352 } 3353 3354 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3355 .addUse(Fma4.getReg(0)) 3356 .addUse(Fma3.getReg(0)) 3357 .addUse(Mul.getReg(0)) 3358 .addUse(Scale) 3359 .setMIFlags(Flags); 3360 3361 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3362 .addUse(Fmas.getReg(0)) 3363 .addUse(RHS) 3364 .addUse(LHS) 3365 .setMIFlags(Flags); 3366 3367 MI.eraseFromParent(); 3368 return true; 3369 } 3370 3371 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3372 MachineRegisterInfo &MRI, 3373 MachineIRBuilder &B) const { 3374 Register Res = MI.getOperand(0).getReg(); 3375 Register LHS = MI.getOperand(2).getReg(); 3376 Register RHS = MI.getOperand(3).getReg(); 3377 uint16_t Flags = MI.getFlags(); 3378 3379 LLT S32 = LLT::scalar(32); 3380 LLT S1 = LLT::scalar(1); 3381 3382 auto Abs = B.buildFAbs(S32, RHS, Flags); 3383 const APFloat C0Val(1.0f); 3384 3385 auto C0 = B.buildConstant(S32, 0x6f800000); 3386 auto C1 = B.buildConstant(S32, 0x2f800000); 3387 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3388 3389 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3390 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3391 3392 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3393 3394 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3395 .addUse(Mul0.getReg(0)) 3396 .setMIFlags(Flags); 3397 3398 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3399 3400 B.buildFMul(Res, Sel, Mul1, Flags); 3401 3402 MI.eraseFromParent(); 3403 return true; 3404 } 3405 3406 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3407 // FIXME: Why do we handle this one but not other removed instructions? 3408 // 3409 // Reciprocal square root. The clamp prevents infinite results, clamping 3410 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3411 // +-max_float. 
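// The expansion below computes amdgcn.rsq and then clamps the result into the
// finite range: fminnum against +largest-finite and fmaxnum against
// -largest-finite of the result type, using the IEEE variants of min/max when
// the function runs in IEEE mode.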
3412 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3413 MachineRegisterInfo &MRI, 3414 MachineIRBuilder &B) const { 3415 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3416 return true; 3417 3418 Register Dst = MI.getOperand(0).getReg(); 3419 Register Src = MI.getOperand(2).getReg(); 3420 auto Flags = MI.getFlags(); 3421 3422 LLT Ty = MRI.getType(Dst); 3423 3424 const fltSemantics *FltSemantics; 3425 if (Ty == LLT::scalar(32)) 3426 FltSemantics = &APFloat::IEEEsingle(); 3427 else if (Ty == LLT::scalar(64)) 3428 FltSemantics = &APFloat::IEEEdouble(); 3429 else 3430 return false; 3431 3432 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3433 .addUse(Src) 3434 .setMIFlags(Flags); 3435 3436 // We don't need to concern ourselves with the snan handling difference, since 3437 // the rsq has already quieted the value (or not); use the variant which will directly select. 3438 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3439 const bool UseIEEE = MFI->getMode().IEEE; 3440 3441 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3442 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3443 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3444 3445 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3446 3447 if (UseIEEE) 3448 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3449 else 3450 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3451 MI.eraseFromParent(); 3452 return true; 3453 } 3454 3455 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3456 switch (IID) { 3457 case Intrinsic::amdgcn_ds_fadd: 3458 return AMDGPU::G_ATOMICRMW_FADD; 3459 case Intrinsic::amdgcn_ds_fmin: 3460 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3461 case Intrinsic::amdgcn_ds_fmax: 3462 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3463 default: 3464 llvm_unreachable("not a DS FP intrinsic"); 3465 } 3466 } 3467 3468 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3469 MachineInstr &MI, 3470 Intrinsic::ID IID) const { 3471 GISelChangeObserver &Observer = Helper.Observer; 3472 Observer.changingInstr(MI); 3473 3474 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3475 3476 // The remaining operands were used to set fields in the MemOperand on 3477 // construction. 3478 for (int I = 6; I > 3; --I) 3479 MI.RemoveOperand(I); 3480 3481 MI.RemoveOperand(1); // Remove the intrinsic ID.
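// What remains is the plain atomic form: a result def, the pointer, and the
// data value, with the original memory operand still attached to the
// instruction.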
3482 Observer.changedInstr(MI); 3483 return true; 3484 } 3485 3486 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3487 MachineRegisterInfo &MRI, 3488 MachineIRBuilder &B) const { 3489 uint64_t Offset = 3490 ST.getTargetLowering()->getImplicitParameterOffset( 3491 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3492 LLT DstTy = MRI.getType(DstReg); 3493 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3494 3495 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3496 if (!loadInputValue(KernargPtrReg, B, 3497 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3498 return false; 3499 3500 // FIXME: This should be nuw 3501 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3502 return true; 3503 } 3504 3505 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3506 MachineRegisterInfo &MRI, 3507 MachineIRBuilder &B) const { 3508 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3509 if (!MFI->isEntryFunction()) { 3510 return legalizePreloadedArgIntrin(MI, MRI, B, 3511 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3512 } 3513 3514 Register DstReg = MI.getOperand(0).getReg(); 3515 if (!getImplicitArgPtr(DstReg, MRI, B)) 3516 return false; 3517 3518 MI.eraseFromParent(); 3519 return true; 3520 } 3521 3522 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3523 MachineRegisterInfo &MRI, 3524 MachineIRBuilder &B, 3525 unsigned AddrSpace) const { 3526 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3527 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3528 Register Hi32 = Unmerge.getReg(1); 3529 3530 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3531 MI.eraseFromParent(); 3532 return true; 3533 } 3534 3535 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3536 // offset (the offset that is included in bounds checking and swizzling, to be 3537 // split between the instruction's voffset and immoffset fields) and soffset 3538 // (the offset that is excluded from bounds checking and swizzling, to go in 3539 // the instruction's soffset field). This function takes the first kind of 3540 // offset and figures out how to split it between voffset and immoffset. 3541 std::tuple<Register, unsigned, unsigned> 3542 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3543 Register OrigOffset) const { 3544 const unsigned MaxImm = 4095; 3545 Register BaseReg; 3546 unsigned TotalConstOffset; 3547 const LLT S32 = LLT::scalar(32); 3548 MachineRegisterInfo &MRI = *B.getMRI(); 3549 3550 std::tie(BaseReg, TotalConstOffset) = 3551 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3552 3553 unsigned ImmOffset = TotalConstOffset; 3554 3555 // If BaseReg is a pointer, convert it to int. 3556 if (MRI.getType(BaseReg).isPointer()) 3557 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3558 3559 // If the immediate value is too big for the immoffset field, put the value 3560 // and -4096 into the immoffset field so that the value that is copied/added 3561 // for the voffset field is a multiple of 4096, and it stands more chance 3562 // of being CSEd with the copy/add for another similar load/store. 3563 // However, do not do that rounding down to a multiple of 4096 if that is a 3564 // negative number, as it appears to be illegal to have a negative offset 3565 // in the vgpr, even if adding the immediate offset makes it positive. 
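// For example (illustrative values only): a constant offset of 0x11FFF is
// split into Overflow = 0x11000, which is added into the voffset register
// below, and ImmOffset = 0xFFF, which fits in the 12-bit immediate field.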
3566 unsigned Overflow = ImmOffset & ~MaxImm; 3567 ImmOffset -= Overflow; 3568 if ((int32_t)Overflow < 0) { 3569 Overflow += ImmOffset; 3570 ImmOffset = 0; 3571 } 3572 3573 if (Overflow != 0) { 3574 if (!BaseReg) { 3575 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3576 } else { 3577 auto OverflowVal = B.buildConstant(S32, Overflow); 3578 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3579 } 3580 } 3581 3582 if (!BaseReg) 3583 BaseReg = B.buildConstant(S32, 0).getReg(0); 3584 3585 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3586 } 3587 3588 /// Handle register layout difference for f16 images for some subtargets. 3589 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3590 MachineRegisterInfo &MRI, 3591 Register Reg, 3592 bool ImageStore) const { 3593 const LLT S16 = LLT::scalar(16); 3594 const LLT S32 = LLT::scalar(32); 3595 LLT StoreVT = MRI.getType(Reg); 3596 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3597 3598 if (ST.hasUnpackedD16VMem()) { 3599 auto Unmerge = B.buildUnmerge(S16, Reg); 3600 3601 SmallVector<Register, 4> WideRegs; 3602 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3603 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3604 3605 int NumElts = StoreVT.getNumElements(); 3606 3607 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3608 } 3609 3610 if (ImageStore && ST.hasImageStoreD16Bug()) { 3611 if (StoreVT.getNumElements() == 2) { 3612 SmallVector<Register, 4> PackedRegs; 3613 Reg = B.buildBitcast(S32, Reg).getReg(0); 3614 PackedRegs.push_back(Reg); 3615 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3616 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3617 } 3618 3619 if (StoreVT.getNumElements() == 3) { 3620 SmallVector<Register, 4> PackedRegs; 3621 auto Unmerge = B.buildUnmerge(S16, Reg); 3622 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3623 PackedRegs.push_back(Unmerge.getReg(I)); 3624 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3625 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3626 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3627 } 3628 3629 if (StoreVT.getNumElements() == 4) { 3630 SmallVector<Register, 4> PackedRegs; 3631 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3632 auto Unmerge = B.buildUnmerge(S32, Reg); 3633 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3634 PackedRegs.push_back(Unmerge.getReg(I)); 3635 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3636 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3637 } 3638 3639 llvm_unreachable("invalid data type"); 3640 } 3641 3642 return Reg; 3643 } 3644 3645 Register AMDGPULegalizerInfo::fixStoreSourceType( 3646 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3647 MachineRegisterInfo *MRI = B.getMRI(); 3648 LLT Ty = MRI->getType(VData); 3649 3650 const LLT S16 = LLT::scalar(16); 3651 3652 // Fixup illegal register types for i8 stores. 
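// s8 and s16 sources are stored from the low bits of a 32-bit register, so
// widen them with an any-extend here; the byte/short store variants only read
// the low bits of the source register.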
3653 if (Ty == LLT::scalar(8) || Ty == S16) { 3654 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3655 return AnyExt; 3656 } 3657 3658 if (Ty.isVector()) { 3659 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3660 if (IsFormat) 3661 return handleD16VData(B, *MRI, VData); 3662 } 3663 } 3664 3665 return VData; 3666 } 3667 3668 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3669 MachineRegisterInfo &MRI, 3670 MachineIRBuilder &B, 3671 bool IsTyped, 3672 bool IsFormat) const { 3673 Register VData = MI.getOperand(1).getReg(); 3674 LLT Ty = MRI.getType(VData); 3675 LLT EltTy = Ty.getScalarType(); 3676 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3677 const LLT S32 = LLT::scalar(32); 3678 3679 VData = fixStoreSourceType(B, VData, IsFormat); 3680 Register RSrc = MI.getOperand(2).getReg(); 3681 3682 MachineMemOperand *MMO = *MI.memoperands_begin(); 3683 const int MemSize = MMO->getSize(); 3684 3685 unsigned ImmOffset; 3686 unsigned TotalOffset; 3687 3688 // The typed intrinsics add an immediate after the registers. 3689 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3690 3691 // The struct intrinsic variants add one additional operand over raw. 3692 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3693 Register VIndex; 3694 int OpOffset = 0; 3695 if (HasVIndex) { 3696 VIndex = MI.getOperand(3).getReg(); 3697 OpOffset = 1; 3698 } 3699 3700 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3701 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3702 3703 unsigned Format = 0; 3704 if (IsTyped) { 3705 Format = MI.getOperand(5 + OpOffset).getImm(); 3706 ++OpOffset; 3707 } 3708 3709 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3710 3711 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3712 if (TotalOffset != 0) 3713 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3714 3715 unsigned Opc; 3716 if (IsTyped) { 3717 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3718 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3719 } else if (IsFormat) { 3720 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3721 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3722 } else { 3723 switch (MemSize) { 3724 case 1: 3725 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3726 break; 3727 case 2: 3728 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3729 break; 3730 default: 3731 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3732 break; 3733 } 3734 } 3735 3736 if (!VIndex) 3737 VIndex = B.buildConstant(S32, 0).getReg(0); 3738 3739 auto MIB = B.buildInstr(Opc) 3740 .addUse(VData) // vdata 3741 .addUse(RSrc) // rsrc 3742 .addUse(VIndex) // vindex 3743 .addUse(VOffset) // voffset 3744 .addUse(SOffset) // soffset 3745 .addImm(ImmOffset); // offset(imm) 3746 3747 if (IsTyped) 3748 MIB.addImm(Format); 3749 3750 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3751 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3752 .addMemOperand(MMO); 3753 3754 MI.eraseFromParent(); 3755 return true; 3756 } 3757 3758 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3759 MachineRegisterInfo &MRI, 3760 MachineIRBuilder &B, 3761 bool IsFormat, 3762 bool IsTyped) const { 3763 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
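// Operand layout handled below (after the result and the intrinsic ID):
// rsrc, [vindex for the struct variants], voffset, soffset,
// [format for the typed variants], auxiliary data (cachepolicy/swizzle).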
3764 MachineMemOperand *MMO = *MI.memoperands_begin(); 3765 const int MemSize = MMO->getSize(); 3766 const LLT S32 = LLT::scalar(32); 3767 3768 Register Dst = MI.getOperand(0).getReg(); 3769 Register RSrc = MI.getOperand(2).getReg(); 3770 3771 // The typed intrinsics add an immediate after the registers. 3772 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3773 3774 // The struct intrinsic variants add one additional operand over raw. 3775 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3776 Register VIndex; 3777 int OpOffset = 0; 3778 if (HasVIndex) { 3779 VIndex = MI.getOperand(3).getReg(); 3780 OpOffset = 1; 3781 } 3782 3783 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3784 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3785 3786 unsigned Format = 0; 3787 if (IsTyped) { 3788 Format = MI.getOperand(5 + OpOffset).getImm(); 3789 ++OpOffset; 3790 } 3791 3792 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3793 unsigned ImmOffset; 3794 unsigned TotalOffset; 3795 3796 LLT Ty = MRI.getType(Dst); 3797 LLT EltTy = Ty.getScalarType(); 3798 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3799 const bool Unpacked = ST.hasUnpackedD16VMem(); 3800 3801 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3802 if (TotalOffset != 0) 3803 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3804 3805 unsigned Opc; 3806 3807 if (IsTyped) { 3808 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3809 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3810 } else if (IsFormat) { 3811 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3812 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3813 } else { 3814 switch (MemSize) { 3815 case 1: 3816 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3817 break; 3818 case 2: 3819 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3820 break; 3821 default: 3822 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3823 break; 3824 } 3825 } 3826 3827 Register LoadDstReg; 3828 3829 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3830 LLT UnpackedTy = Ty.changeElementSize(32); 3831 3832 if (IsExtLoad) 3833 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3834 else if (Unpacked && IsD16 && Ty.isVector()) 3835 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3836 else 3837 LoadDstReg = Dst; 3838 3839 if (!VIndex) 3840 VIndex = B.buildConstant(S32, 0).getReg(0); 3841 3842 auto MIB = B.buildInstr(Opc) 3843 .addDef(LoadDstReg) // vdata 3844 .addUse(RSrc) // rsrc 3845 .addUse(VIndex) // vindex 3846 .addUse(VOffset) // voffset 3847 .addUse(SOffset) // soffset 3848 .addImm(ImmOffset); // offset(imm) 3849 3850 if (IsTyped) 3851 MIB.addImm(Format); 3852 3853 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3854 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3855 .addMemOperand(MMO); 3856 3857 if (LoadDstReg != Dst) { 3858 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3859 3860 // The load result was widened; convert it back to the original result type.
3861 if (IsExtLoad) 3862 B.buildTrunc(Dst, LoadDstReg); 3863 else { 3864 // Repack to original 16-bit vector result 3865 // FIXME: G_TRUNC should work, but legalization currently fails 3866 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3867 SmallVector<Register, 4> Repack; 3868 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3869 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3870 B.buildMerge(Dst, Repack); 3871 } 3872 } 3873 3874 MI.eraseFromParent(); 3875 return true; 3876 } 3877 3878 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3879 MachineIRBuilder &B, 3880 bool IsInc) const { 3881 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3882 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3883 B.buildInstr(Opc) 3884 .addDef(MI.getOperand(0).getReg()) 3885 .addUse(MI.getOperand(2).getReg()) 3886 .addUse(MI.getOperand(3).getReg()) 3887 .cloneMemRefs(MI); 3888 MI.eraseFromParent(); 3889 return true; 3890 } 3891 3892 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3893 switch (IntrID) { 3894 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3895 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3896 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3897 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3898 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3899 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3900 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3901 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3903 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3904 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3906 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3907 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3908 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3909 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3910 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3911 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3912 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3913 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3914 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3915 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3916 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3918 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3919 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3921 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3922 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3923 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3924 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3925 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3926 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3927 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3928 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3930 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3931 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3933 case Intrinsic::amdgcn_buffer_atomic_fadd: 3934 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3935 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3936 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3937 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 3938 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 3939 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 3940 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 3941 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 3942 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 3943 default: 3944 llvm_unreachable("unhandled atomic opcode"); 3945 } 3946 } 3947 3948 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3949 MachineIRBuilder &B, 3950 Intrinsic::ID IID) const { 3951 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3952 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3953 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3954 3955 Register Dst; 3956 3957 int OpOffset = 0; 3958 if (HasReturn) { 3959 // A few FP atomics do not support return values. 3960 Dst = MI.getOperand(0).getReg(); 3961 } else { 3962 OpOffset = -1; 3963 } 3964 3965 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3966 Register CmpVal; 3967 3968 if (IsCmpSwap) { 3969 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3970 ++OpOffset; 3971 } 3972 3973 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3974 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3975 3976 // The struct intrinsic variants add one additional operand over raw. 3977 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3978 Register VIndex; 3979 if (HasVIndex) { 3980 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3981 ++OpOffset; 3982 } 3983 3984 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3985 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3986 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3987 3988 MachineMemOperand *MMO = *MI.memoperands_begin(); 3989 3990 unsigned ImmOffset; 3991 unsigned TotalOffset; 3992 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3993 if (TotalOffset != 0) 3994 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3995 3996 if (!VIndex) 3997 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3998 3999 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4000 4001 if (HasReturn) 4002 MIB.addDef(Dst); 4003 4004 MIB.addUse(VData); // vdata 4005 4006 if (IsCmpSwap) 4007 MIB.addReg(CmpVal); 4008 4009 MIB.addUse(RSrc) // rsrc 4010 .addUse(VIndex) // vindex 4011 .addUse(VOffset) // voffset 4012 .addUse(SOffset) // soffset 4013 .addImm(ImmOffset) // offset(imm) 4014 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4015 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4016 .addMemOperand(MMO); 4017 4018 MI.eraseFromParent(); 4019 return true; 4020 } 4021 4022 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 4023 /// vector with s16 typed elements. 4024 static void packImageA16AddressToDwords( 4025 MachineIRBuilder &B, MachineInstr &MI, 4026 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 4027 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 4028 const LLT S16 = LLT::scalar(16); 4029 const LLT V2S16 = LLT::vector(2, 16); 4030 4031 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4032 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4033 if (!SrcOp.isReg()) 4034 continue; // _L to _LZ may have eliminated this. 4035 4036 Register AddrReg = SrcOp.getReg(); 4037 4038 if (I < Intr->GradientStart) { 4039 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4040 PackedAddrs.push_back(AddrReg); 4041 } else { 4042 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4043 // derivatives dx/dh and dx/dv are packed with undef. 
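// For a 2D gradient, for example, (ds/dh, dt/dh) and (ds/dv, dt/dv) each pack
// into a single dword; in 1D the lone derivative in each direction is padded
// with undef in the high half.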
4044 if (((I + 1) >= EndIdx) || 4045 ((Intr->NumGradients / 2) % 2 == 1 && 4046 (I == static_cast<unsigned>(Intr->GradientStart + 4047 (Intr->NumGradients / 2) - 1) || 4048 I == static_cast<unsigned>(Intr->GradientStart + 4049 Intr->NumGradients - 1))) || 4050 // Check for _L to _LZ optimization 4051 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4052 PackedAddrs.push_back( 4053 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4054 .getReg(0)); 4055 } else { 4056 PackedAddrs.push_back( 4057 B.buildBuildVector( 4058 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4059 .getReg(0)); 4060 ++I; 4061 } 4062 } 4063 } 4064 } 4065 4066 /// Convert from separate vaddr components to a single vector address register, 4067 /// and replace the remaining operands with $noreg. 4068 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4069 int DimIdx, int NumVAddrs) { 4070 const LLT S32 = LLT::scalar(32); 4071 4072 SmallVector<Register, 8> AddrRegs; 4073 for (int I = 0; I != NumVAddrs; ++I) { 4074 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4075 if (SrcOp.isReg()) { 4076 AddrRegs.push_back(SrcOp.getReg()); 4077 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4078 } 4079 } 4080 4081 int NumAddrRegs = AddrRegs.size(); 4082 if (NumAddrRegs != 1) { 4083 // Round up to 8 elements for v5-v7 4084 // FIXME: Missing intermediate sized register classes and instructions. 4085 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 4086 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4087 auto Undef = B.buildUndef(S32); 4088 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4089 NumAddrRegs = RoundedNumRegs; 4090 } 4091 4092 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 4093 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4094 } 4095 4096 for (int I = 1; I != NumVAddrs; ++I) { 4097 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4098 if (SrcOp.isReg()) 4099 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4100 } 4101 } 4102 4103 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4104 /// 4105 /// Depending on the subtarget, loads and stores with 16-bit element data need 4106 /// to be rewritten to use the low half of 32-bit registers, or directly use a 4107 /// packed layout. 16-bit addresses should also sometimes be packed into 32-bit 4108 /// registers. 4109 /// 4110 /// We don't want to directly select image instructions just yet, but also want 4111 /// to expose all register repacking to the legalizer/combiners. We also don't 4112 /// want a selected instruction entering RegBankSelect. In order to avoid 4113 /// defining a multitude of intermediate image instructions, directly hack on 4114 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 4115 /// now unnecessary arguments with $noreg. 4116 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4117 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4118 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4119 4120 const unsigned NumDefs = MI.getNumExplicitDefs(); 4121 const unsigned ArgOffset = NumDefs + 1; 4122 bool IsTFE = NumDefs == 2; 4123 // We are only processing the operands of d16 image operations on subtargets 4124 // that use the unpacked register layout, or need to repack the TFE result. 4125 4126 // TODO: Do we need to guard against already legalized intrinsics?
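// Roughly: rewrite to the generic image load/store pseudo, fold _L to _LZ and
// _mip when the lod is a known zero, repack 16-bit addresses/gradients, and
// then fix up the d16 and TFE data register layout.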
4127 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4128 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4129 4130 MachineRegisterInfo *MRI = B.getMRI(); 4131 const LLT S32 = LLT::scalar(32); 4132 const LLT S16 = LLT::scalar(16); 4133 const LLT V2S16 = LLT::vector(2, 16); 4134 4135 unsigned DMask = 0; 4136 4137 // Check for 16-bit addresses and pack if so. 4138 LLT GradTy = 4139 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4140 LLT AddrTy = 4141 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4142 const bool IsG16 = GradTy == S16; 4143 const bool IsA16 = AddrTy == S16; 4144 4145 int DMaskLanes = 0; 4146 if (!BaseOpcode->Atomic) { 4147 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4148 if (BaseOpcode->Gather4) { 4149 DMaskLanes = 4; 4150 } else if (DMask != 0) { 4151 DMaskLanes = countPopulation(DMask); 4152 } else if (!IsTFE && !BaseOpcode->Store) { 4153 // If dmask is 0, this is a no-op load. This can be eliminated. 4154 B.buildUndef(MI.getOperand(0)); 4155 MI.eraseFromParent(); 4156 return true; 4157 } 4158 } 4159 4160 Observer.changingInstr(MI); 4161 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4162 4163 unsigned NewOpcode = NumDefs == 0 ? 4164 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4165 4166 // Track that we legalized this 4167 MI.setDesc(B.getTII().get(NewOpcode)); 4168 4169 // Expecting to get an error flag since TFC is on - and dmask is 0. Force 4170 // dmask to be at least 1, otherwise the instruction will fail. 4171 if (IsTFE && DMask == 0) { 4172 DMask = 0x1; 4173 DMaskLanes = 1; 4174 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4175 } 4176 4177 if (BaseOpcode->Atomic) { 4178 Register VData0 = MI.getOperand(2).getReg(); 4179 LLT Ty = MRI->getType(VData0); 4180 4181 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4182 if (Ty.isVector()) 4183 return false; 4184 4185 if (BaseOpcode->AtomicX2) { 4186 Register VData1 = MI.getOperand(3).getReg(); 4187 // The two values are packed in one register. 4188 LLT PackedTy = LLT::vector(2, Ty); 4189 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4190 MI.getOperand(2).setReg(Concat.getReg(0)); 4191 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4192 } 4193 } 4194 4195 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4196 4197 // Optimize _L to _LZ when the lod is zero or negative. 4198 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4199 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4200 const ConstantFP *ConstantLod; 4201 4202 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4203 m_GFCst(ConstantLod))) { 4204 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4205 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4206 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4207 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4208 Intr->Dim); 4209 4210 // The starting indexes should remain in the same place.
4211 --CorrectedNumVAddrs; 4212 4213 MI.getOperand(MI.getNumExplicitDefs()) 4214 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4215 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4216 Intr = NewImageDimIntr; 4217 } 4218 } 4219 } 4220 4221 // Optimize _mip away when 'lod' is zero. 4222 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4223 int64_t ConstantLod; 4224 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4225 m_ICst(ConstantLod))) { 4226 if (ConstantLod == 0) { 4227 // TODO: Change intrinsic opcode and remove operand instead of replacing 4228 // it with 0, as the _L to _LZ handling is done above. 4229 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4230 --CorrectedNumVAddrs; 4231 } 4232 } 4233 } 4234 4235 // Rewrite the addressing register layout before doing anything else. 4236 if (IsA16 || IsG16) { 4237 if (IsA16) { 4238 // Target must support the feature and gradients need to be 16-bit too. 4239 if (!ST.hasA16() || !IsG16) 4240 return false; 4241 } else if (!ST.hasG16()) 4242 return false; 4243 4244 if (Intr->NumVAddrs > 1) { 4245 SmallVector<Register, 4> PackedRegs; 4246 // Don't compress addresses for G16 4247 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4248 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4249 PackEndIdx); 4250 4251 if (!IsA16) { 4252 // Add uncompressed address 4253 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4254 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4255 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4256 PackedRegs.push_back(AddrReg); 4257 } 4258 } 4259 4260 // See also below in the non-a16 branch 4261 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4262 4263 if (!UseNSA && PackedRegs.size() > 1) { 4264 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4265 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4266 PackedRegs[0] = Concat.getReg(0); 4267 PackedRegs.resize(1); 4268 } 4269 4270 const unsigned NumPacked = PackedRegs.size(); 4271 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4272 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4273 if (!SrcOp.isReg()) { 4274 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4275 continue; 4276 } 4277 4278 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4279 4280 if (I - Intr->VAddrStart < NumPacked) 4281 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4282 else 4283 SrcOp.setReg(AMDGPU::NoRegister); 4284 } 4285 } 4286 } else { 4287 // If the register allocator cannot place the address registers contiguously 4288 // without introducing moves, then using the non-sequential address encoding 4289 // is always preferable, since it saves VALU instructions and is usually a 4290 // wash in terms of code size or even better. 4291 // 4292 // However, we currently have no way of hinting to the register allocator 4293 // that MIMG addresses should be placed contiguously when it is possible to 4294 // do so, so force non-NSA for the common 2-address case as a heuristic. 4295 // 4296 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4297 // allocation when possible.
4298 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4299 4300 if (!UseNSA && Intr->NumVAddrs > 1) 4301 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4302 Intr->NumVAddrs); 4303 } 4304 4305 int Flags = 0; 4306 if (IsA16) 4307 Flags |= 1; 4308 if (IsG16) 4309 Flags |= 2; 4310 MI.addOperand(MachineOperand::CreateImm(Flags)); 4311 4312 if (BaseOpcode->Store) { // No TFE for stores? 4313 // TODO: Handle dmask trim 4314 Register VData = MI.getOperand(1).getReg(); 4315 LLT Ty = MRI->getType(VData); 4316 if (!Ty.isVector() || Ty.getElementType() != S16) 4317 return true; 4318 4319 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4320 if (RepackedReg != VData) { 4321 MI.getOperand(1).setReg(RepackedReg); 4322 } 4323 4324 return true; 4325 } 4326 4327 Register DstReg = MI.getOperand(0).getReg(); 4328 LLT Ty = MRI->getType(DstReg); 4329 const LLT EltTy = Ty.getScalarType(); 4330 const bool IsD16 = Ty.getScalarType() == S16; 4331 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4332 4333 // Confirm that the return type is large enough for the dmask specified. 4334 if (NumElts < DMaskLanes) 4335 return false; 4336 4337 if (NumElts > 4 || DMaskLanes > 4) 4338 return false; 4339 4340 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4341 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4342 4343 // The raw dword aligned data component of the load. The only legal cases 4344 // where this matters should be when using the packed D16 format, for 4345 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4346 LLT RoundedTy; 4347 4348 // S32 vector to cover all data, plus the TFE result element. 4349 LLT TFETy; 4350 4351 // Register type to use for each loaded component. Will be S32 or V2S16. 4352 LLT RegTy; 4353 4354 if (IsD16 && ST.hasUnpackedD16VMem()) { 4355 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4356 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4357 RegTy = S32; 4358 } else { 4359 unsigned EltSize = EltTy.getSizeInBits(); 4360 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4361 unsigned RoundedSize = 32 * RoundedElts; 4362 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4363 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4364 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4365 } 4366 4367 // The return type does not need adjustment. 4368 // TODO: Should we change s16 case to s32 or <2 x s16>? 4369 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4370 return true; 4371 4372 Register Dst1Reg; 4373 4374 // Insert after the instruction. 4375 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4376 4377 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4378 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4379 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4380 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4381 4382 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4383 4384 MI.getOperand(0).setReg(NewResultReg); 4385 4386 // In the IR, TFE is supposed to be used with a 2-element struct return 4387 // type. The instruction really returns these two values in one contiguous 4388 // register, with one additional dword beyond the loaded data. Rewrite the 4389 // return type to use a single register result. 4390 4391 if (IsTFE) { 4392 Dst1Reg = MI.getOperand(1).getReg(); 4393 if (MRI->getType(Dst1Reg) != S32) 4394 return false; 4395 4396 // TODO: Make sure the TFE operand bit is set.
4397 MI.RemoveOperand(1); 4398 4399 // Handle the easy case that requires no repack instructions. 4400 if (Ty == S32) { 4401 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4402 return true; 4403 } 4404 } 4405 4406 // Now figure out how to copy the new result register back into the old 4407 // result. 4408 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4409 4410 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4411 4412 if (ResultNumRegs == 1) { 4413 assert(!IsTFE); 4414 ResultRegs[0] = NewResultReg; 4415 } else { 4416 // We have to repack into a new vector of some kind. 4417 for (int I = 0; I != NumDataRegs; ++I) 4418 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4419 B.buildUnmerge(ResultRegs, NewResultReg); 4420 4421 // Drop the final TFE element to get the data part. The TFE result is 4422 // directly written to the right place already. 4423 if (IsTFE) 4424 ResultRegs.resize(NumDataRegs); 4425 } 4426 4427 // For an s16 scalar result, we form an s32 result with a truncate regardless 4428 // of packed vs. unpacked. 4429 if (IsD16 && !Ty.isVector()) { 4430 B.buildTrunc(DstReg, ResultRegs[0]); 4431 return true; 4432 } 4433 4434 // Avoid a build/concat_vector of 1 entry. 4435 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4436 B.buildBitcast(DstReg, ResultRegs[0]); 4437 return true; 4438 } 4439 4440 assert(Ty.isVector()); 4441 4442 if (IsD16) { 4443 // For packed D16 results with TFE enabled, all the data components are 4444 // S32. Cast back to the expected type. 4445 // 4446 // TODO: We don't really need to use load s32 elements. We would only need one 4447 // cast for the TFE result if a multiple of v2s16 was used. 4448 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4449 for (Register &Reg : ResultRegs) 4450 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4451 } else if (ST.hasUnpackedD16VMem()) { 4452 for (Register &Reg : ResultRegs) 4453 Reg = B.buildTrunc(S16, Reg).getReg(0); 4454 } 4455 } 4456 4457 auto padWithUndef = [&](LLT Ty, int NumElts) { 4458 if (NumElts == 0) 4459 return; 4460 Register Undef = B.buildUndef(Ty).getReg(0); 4461 for (int I = 0; I != NumElts; ++I) 4462 ResultRegs.push_back(Undef); 4463 }; 4464 4465 // Pad out any elements eliminated due to the dmask. 4466 LLT ResTy = MRI->getType(ResultRegs[0]); 4467 if (!ResTy.isVector()) { 4468 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4469 B.buildBuildVector(DstReg, ResultRegs); 4470 return true; 4471 } 4472 4473 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4474 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4475 4476 // Deal with the one annoying legal case. 
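// A <3 x s16> result covers one and a half dwords: pad the repacked registers,
// concatenate to <6 x s16>, and unmerge, discarding the extra <3 x s16> half.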
4477 const LLT V3S16 = LLT::vector(3, 16); 4478 if (Ty == V3S16) { 4479 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4480 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4481 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4482 return true; 4483 } 4484 4485 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4486 B.buildConcatVectors(DstReg, ResultRegs); 4487 return true; 4488 } 4489 4490 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4491 LegalizerHelper &Helper, MachineInstr &MI) const { 4492 MachineIRBuilder &B = Helper.MIRBuilder; 4493 GISelChangeObserver &Observer = Helper.Observer; 4494 4495 Register Dst = MI.getOperand(0).getReg(); 4496 LLT Ty = B.getMRI()->getType(Dst); 4497 unsigned Size = Ty.getSizeInBits(); 4498 MachineFunction &MF = B.getMF(); 4499 4500 Observer.changingInstr(MI); 4501 4502 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4503 Ty = getBitcastRegisterType(Ty); 4504 Helper.bitcastDst(MI, Ty, 0); 4505 Dst = MI.getOperand(0).getReg(); 4506 B.setInsertPt(B.getMBB(), MI); 4507 } 4508 4509 // FIXME: We don't really need this intermediate instruction. The intrinsic 4510 // should be fixed to have a memory operand. Since it's readnone, we're not 4511 // allowed to add one. 4512 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4513 MI.RemoveOperand(1); // Remove intrinsic ID 4514 4515 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4516 // TODO: Should this use datalayout alignment? 4517 const unsigned MemSize = (Size + 7) / 8; 4518 const Align MemAlign(4); 4519 MachineMemOperand *MMO = MF.getMachineMemOperand( 4520 MachinePointerInfo(), 4521 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4522 MachineMemOperand::MOInvariant, 4523 MemSize, MemAlign); 4524 MI.addMemOperand(MF, MMO); 4525 4526 // There are no 96-bit result scalar loads, but widening to 128-bit should 4527 // always be legal. We may need to restore this to a 96-bit result if it turns 4528 // out this needs to be converted to a vector load during RegBankSelect. 4529 if (!isPowerOf2_32(Size)) { 4530 if (Ty.isVector()) 4531 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4532 else 4533 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4534 } 4535 4536 Observer.changedInstr(MI); 4537 return true; 4538 } 4539 4540 // TODO: Move to selection 4541 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4542 MachineRegisterInfo &MRI, 4543 MachineIRBuilder &B) const { 4544 if (!ST.isTrapHandlerEnabled() || 4545 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4546 return legalizeTrapEndpgm(MI, MRI, B); 4547 4548 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4549 switch (*HsaAbiVer) { 4550 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4551 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4552 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4553 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4554 return ST.supportsGetDoorbellID() ? 
4555 legalizeTrapHsa(MI, MRI, B) : 4556 legalizeTrapHsaQueuePtr(MI, MRI, B); 4557 } 4558 } 4559 4560 llvm_unreachable("Unknown trap handler"); 4561 } 4562 4563 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4564 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4565 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4566 MI.eraseFromParent(); 4567 return true; 4568 } 4569 4570 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4571 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4572 // Pass queue pointer to trap handler as input, and insert trap instruction 4573 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4574 Register LiveIn = 4575 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4576 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4577 return false; 4578 4579 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4580 B.buildCopy(SGPR01, LiveIn); 4581 B.buildInstr(AMDGPU::S_TRAP) 4582 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4583 .addReg(SGPR01, RegState::Implicit); 4584 4585 MI.eraseFromParent(); 4586 return true; 4587 } 4588 4589 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4590 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4591 B.buildInstr(AMDGPU::S_TRAP) 4592 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4593 MI.eraseFromParent(); 4594 return true; 4595 } 4596 4597 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4598 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4599 // Is non-HSA path or trap-handler disabled? then, report a warning 4600 // accordingly 4601 if (!ST.isTrapHandlerEnabled() || 4602 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4603 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4604 "debugtrap handler not supported", 4605 MI.getDebugLoc(), DS_Warning); 4606 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4607 Ctx.diagnose(NoTrap); 4608 } else { 4609 // Insert debug-trap instruction 4610 B.buildInstr(AMDGPU::S_TRAP) 4611 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4612 } 4613 4614 MI.eraseFromParent(); 4615 return true; 4616 } 4617 4618 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4619 MachineIRBuilder &B) const { 4620 MachineRegisterInfo &MRI = *B.getMRI(); 4621 const LLT S16 = LLT::scalar(16); 4622 const LLT S32 = LLT::scalar(32); 4623 4624 Register DstReg = MI.getOperand(0).getReg(); 4625 Register NodePtr = MI.getOperand(2).getReg(); 4626 Register RayExtent = MI.getOperand(3).getReg(); 4627 Register RayOrigin = MI.getOperand(4).getReg(); 4628 Register RayDir = MI.getOperand(5).getReg(); 4629 Register RayInvDir = MI.getOperand(6).getReg(); 4630 Register TDescr = MI.getOperand(7).getReg(); 4631 4632 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4633 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4634 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4635 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4636 : Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4637 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4638 4639 SmallVector<Register, 12> Ops; 4640 if (Is64) { 4641 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4642 Ops.push_back(Unmerge.getReg(0)); 4643 Ops.push_back(Unmerge.getReg(1)); 4644 } else { 4645 Ops.push_back(NodePtr); 4646 } 4647 Ops.push_back(RayExtent); 4648 4649 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4650 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4651 Ops.push_back(Unmerge.getReg(0)); 4652 Ops.push_back(Unmerge.getReg(1)); 4653 Ops.push_back(Unmerge.getReg(2)); 4654 }; 4655 4656 packLanes(RayOrigin); 4657 if (IsA16) { 4658 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4659 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4660 Register R1 = MRI.createGenericVirtualRegister(S32); 4661 Register R2 = MRI.createGenericVirtualRegister(S32); 4662 Register R3 = MRI.createGenericVirtualRegister(S32); 4663 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4664 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4665 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4666 Ops.push_back(R1); 4667 Ops.push_back(R2); 4668 Ops.push_back(R3); 4669 } else { 4670 packLanes(RayDir); 4671 packLanes(RayInvDir); 4672 } 4673 4674 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4675 .addDef(DstReg) 4676 .addImm(Opcode); 4677 4678 for (Register R : Ops) { 4679 MIB.addUse(R); 4680 } 4681 4682 MIB.addUse(TDescr) 4683 .addImm(IsA16 ? 1 : 0) 4684 .cloneMemRefs(MI); 4685 4686 MI.eraseFromParent(); 4687 return true; 4688 } 4689 4690 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4691 MachineInstr &MI) const { 4692 MachineIRBuilder &B = Helper.MIRBuilder; 4693 MachineRegisterInfo &MRI = *B.getMRI(); 4694 4695 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4696 auto IntrID = MI.getIntrinsicID(); 4697 switch (IntrID) { 4698 case Intrinsic::amdgcn_if: 4699 case Intrinsic::amdgcn_else: { 4700 MachineInstr *Br = nullptr; 4701 MachineBasicBlock *UncondBrTarget = nullptr; 4702 bool Negated = false; 4703 if (MachineInstr *BrCond = 4704 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4705 const SIRegisterInfo *TRI 4706 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4707 4708 Register Def = MI.getOperand(1).getReg(); 4709 Register Use = MI.getOperand(3).getReg(); 4710 4711 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4712 4713 if (Negated) 4714 std::swap(CondBrTarget, UncondBrTarget); 4715 4716 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4717 if (IntrID == Intrinsic::amdgcn_if) { 4718 B.buildInstr(AMDGPU::SI_IF) 4719 .addDef(Def) 4720 .addUse(Use) 4721 .addMBB(UncondBrTarget); 4722 } else { 4723 B.buildInstr(AMDGPU::SI_ELSE) 4724 .addDef(Def) 4725 .addUse(Use) 4726 .addMBB(UncondBrTarget); 4727 } 4728 4729 if (Br) { 4730 Br->getOperand(0).setMBB(CondBrTarget); 4731 } else { 4732 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4733 // since we're swapping branch targets it needs to be reinserted. 
4734 // FIXME: IRTranslator should probably not do this 4735 B.buildBr(*CondBrTarget); 4736 } 4737 4738 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4739 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4740 MI.eraseFromParent(); 4741 BrCond->eraseFromParent(); 4742 return true; 4743 } 4744 4745 return false; 4746 } 4747 case Intrinsic::amdgcn_loop: { 4748 MachineInstr *Br = nullptr; 4749 MachineBasicBlock *UncondBrTarget = nullptr; 4750 bool Negated = false; 4751 if (MachineInstr *BrCond = 4752 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4753 const SIRegisterInfo *TRI 4754 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4755 4756 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4757 Register Reg = MI.getOperand(2).getReg(); 4758 4759 if (Negated) 4760 std::swap(CondBrTarget, UncondBrTarget); 4761 4762 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4763 B.buildInstr(AMDGPU::SI_LOOP) 4764 .addUse(Reg) 4765 .addMBB(UncondBrTarget); 4766 4767 if (Br) 4768 Br->getOperand(0).setMBB(CondBrTarget); 4769 else 4770 B.buildBr(*CondBrTarget); 4771 4772 MI.eraseFromParent(); 4773 BrCond->eraseFromParent(); 4774 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4775 return true; 4776 } 4777 4778 return false; 4779 } 4780 case Intrinsic::amdgcn_kernarg_segment_ptr: 4781 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4782 // This only makes sense to call in a kernel, so just lower to null. 4783 B.buildConstant(MI.getOperand(0).getReg(), 0); 4784 MI.eraseFromParent(); 4785 return true; 4786 } 4787 4788 return legalizePreloadedArgIntrin( 4789 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4790 case Intrinsic::amdgcn_implicitarg_ptr: 4791 return legalizeImplicitArgPtr(MI, MRI, B); 4792 case Intrinsic::amdgcn_workitem_id_x: 4793 return legalizePreloadedArgIntrin(MI, MRI, B, 4794 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4795 case Intrinsic::amdgcn_workitem_id_y: 4796 return legalizePreloadedArgIntrin(MI, MRI, B, 4797 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4798 case Intrinsic::amdgcn_workitem_id_z: 4799 return legalizePreloadedArgIntrin(MI, MRI, B, 4800 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4801 case Intrinsic::amdgcn_workgroup_id_x: 4802 return legalizePreloadedArgIntrin(MI, MRI, B, 4803 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4804 case Intrinsic::amdgcn_workgroup_id_y: 4805 return legalizePreloadedArgIntrin(MI, MRI, B, 4806 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4807 case Intrinsic::amdgcn_workgroup_id_z: 4808 return legalizePreloadedArgIntrin(MI, MRI, B, 4809 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4810 case Intrinsic::amdgcn_dispatch_ptr: 4811 return legalizePreloadedArgIntrin(MI, MRI, B, 4812 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4813 case Intrinsic::amdgcn_queue_ptr: 4814 return legalizePreloadedArgIntrin(MI, MRI, B, 4815 AMDGPUFunctionArgInfo::QUEUE_PTR); 4816 case Intrinsic::amdgcn_implicit_buffer_ptr: 4817 return legalizePreloadedArgIntrin( 4818 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4819 case Intrinsic::amdgcn_dispatch_id: 4820 return legalizePreloadedArgIntrin(MI, MRI, B, 4821 AMDGPUFunctionArgInfo::DISPATCH_ID); 4822 case Intrinsic::amdgcn_fdiv_fast: 4823 return legalizeFDIVFastIntrin(MI, MRI, B); 4824 case Intrinsic::amdgcn_is_shared: 4825 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4826 case Intrinsic::amdgcn_is_private: 4827 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4828 case Intrinsic::amdgcn_wavefrontsize: { 4829 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4830 MI.eraseFromParent(); 4831 return true; 4832 } 4833 case Intrinsic::amdgcn_s_buffer_load: 4834 return legalizeSBufferLoad(Helper, MI); 4835 case Intrinsic::amdgcn_raw_buffer_store: 4836 case Intrinsic::amdgcn_struct_buffer_store: 4837 return legalizeBufferStore(MI, MRI, B, false, false); 4838 case Intrinsic::amdgcn_raw_buffer_store_format: 4839 case Intrinsic::amdgcn_struct_buffer_store_format: 4840 return legalizeBufferStore(MI, MRI, B, false, true); 4841 case Intrinsic::amdgcn_raw_tbuffer_store: 4842 case Intrinsic::amdgcn_struct_tbuffer_store: 4843 return legalizeBufferStore(MI, MRI, B, true, true); 4844 case Intrinsic::amdgcn_raw_buffer_load: 4845 case Intrinsic::amdgcn_struct_buffer_load: 4846 return legalizeBufferLoad(MI, MRI, B, false, false); 4847 case Intrinsic::amdgcn_raw_buffer_load_format: 4848 case Intrinsic::amdgcn_struct_buffer_load_format: 4849 return legalizeBufferLoad(MI, MRI, B, true, false); 4850 case Intrinsic::amdgcn_raw_tbuffer_load: 4851 case Intrinsic::amdgcn_struct_tbuffer_load: 4852 return legalizeBufferLoad(MI, MRI, B, true, true); 4853 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4854 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4855 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4856 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4857 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4858 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4859 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4860 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4861 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4862 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4863 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4864 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4865 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4866 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4867 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4868 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4869 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4870 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4871 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4872 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4873 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4874 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4875 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4876 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4877 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4878 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4879 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4880 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4881 case Intrinsic::amdgcn_buffer_atomic_fadd: 4882 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4883 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4884 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4885 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4886 return legalizeBufferAtomic(MI, B, IntrID); 4887 case Intrinsic::amdgcn_atomic_inc: 4888 return legalizeAtomicIncDec(MI, B, true); 4889 case Intrinsic::amdgcn_atomic_dec: 4890 return legalizeAtomicIncDec(MI, B, false); 4891 case Intrinsic::trap: 4892 return legalizeTrapIntrinsic(MI, MRI, B); 4893 case Intrinsic::debugtrap: 4894 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4895 case Intrinsic::amdgcn_rsq_clamp: 4896 return legalizeRsqClampIntrinsic(MI, MRI, B); 4897 case Intrinsic::amdgcn_ds_fadd: 4898 case Intrinsic::amdgcn_ds_fmin: 4899 case Intrinsic::amdgcn_ds_fmax: 4900 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4901 case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4902 return legalizeBVHIntrinsic(MI, B); 4903 default: { 4904 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4905 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4906 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4907 return true; 4908 } 4909 } 4910 4911 return true; 4912 } 4913