//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
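/// (<3 x s16> is 48 bits; padding it to <4 x s16> gives 64 bits, a clean
/// multiple of the 32-bit register size.)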
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
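    // 128 bits (a dwordx4 access) is assumed here to be the widest single flat
    // operation, so use that as the conservative default.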
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
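/// For example, a 96-bit load with 128-bit alignment may be widened to a
/// 128-bit load on subtargets without 96-bit memory instructions.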
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .lowerIfMemSizeNotPow2()
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
      .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, LitTyIdx);
        },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) {
          return notValidElt(Query, BigTyIdx);
        },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts.
    // This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
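    // Only reached for 64-bit floor on subtargets with the fract bug (see the
    // G_FFLOOR legality rules above).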
return legalizeFFloor(MI, MRI, B); 1698 case TargetOpcode::G_BUILD_VECTOR: 1699 return legalizeBuildVector(MI, MRI, B); 1700 default: 1701 return false; 1702 } 1703 1704 llvm_unreachable("expected switch to return"); 1705 } 1706 1707 Register AMDGPULegalizerInfo::getSegmentAperture( 1708 unsigned AS, 1709 MachineRegisterInfo &MRI, 1710 MachineIRBuilder &B) const { 1711 MachineFunction &MF = B.getMF(); 1712 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1713 const LLT S32 = LLT::scalar(32); 1714 1715 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1716 1717 if (ST.hasApertureRegs()) { 1718 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1719 // getreg. 1720 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1721 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1722 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1723 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1724 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1725 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1726 unsigned Encoding = 1727 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1728 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1729 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1730 1731 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1732 1733 B.buildInstr(AMDGPU::S_GETREG_B32) 1734 .addDef(GetReg) 1735 .addImm(Encoding); 1736 MRI.setType(GetReg, S32); 1737 1738 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1739 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1740 } 1741 1742 Register QueuePtr = MRI.createGenericVirtualRegister( 1743 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1744 1745 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1746 return Register(); 1747 1748 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1749 // private_segment_aperture_base_hi. 1750 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1751 1752 // TODO: can we be smarter about machine pointer info? 1753 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1754 MachineMemOperand *MMO = MF.getMachineMemOperand( 1755 PtrInfo, 1756 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1757 MachineMemOperand::MOInvariant, 1758 4, commonAlignment(Align(64), StructOffset)); 1759 1760 Register LoadAddr; 1761 1762 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1763 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1764 } 1765 1766 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1767 MachineInstr &MI, MachineRegisterInfo &MRI, 1768 MachineIRBuilder &B) const { 1769 MachineFunction &MF = B.getMF(); 1770 1771 const LLT S32 = LLT::scalar(32); 1772 Register Dst = MI.getOperand(0).getReg(); 1773 Register Src = MI.getOperand(1).getReg(); 1774 1775 LLT DstTy = MRI.getType(Dst); 1776 LLT SrcTy = MRI.getType(Src); 1777 unsigned DestAS = DstTy.getAddressSpace(); 1778 unsigned SrcAS = SrcTy.getAddressSpace(); 1779 1780 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1781 // vector element. 1782 assert(!DstTy.isVector()); 1783 1784 const AMDGPUTargetMachine &TM 1785 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1786 1787 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1788 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1789 return true; 1790 } 1791 1792 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1793 // Truncate. 
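// The 32-bit constant address is just the low 32 bits of the wider pointer,
// so a G_EXTRACT at bit offset 0 performs the truncation.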
1794 B.buildExtract(Dst, Src, 0); 1795 MI.eraseFromParent(); 1796 return true; 1797 } 1798 1799 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1800 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1801 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1802 1803 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1804 // another. Merge operands are required to be the same type, but creating an 1805 // extra ptrtoint would be kind of pointless. 1806 auto HighAddr = B.buildConstant( 1807 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1808 B.buildMerge(Dst, {Src, HighAddr}); 1809 MI.eraseFromParent(); 1810 return true; 1811 } 1812 1813 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1814 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1815 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1816 unsigned NullVal = TM.getNullPointerValue(DestAS); 1817 1818 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1819 auto FlatNull = B.buildConstant(SrcTy, 0); 1820 1821 // Extract low 32-bits of the pointer. 1822 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1823 1824 auto CmpRes = 1825 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1826 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1827 1828 MI.eraseFromParent(); 1829 return true; 1830 } 1831 1832 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1833 return false; 1834 1835 if (!ST.hasFlatAddressSpace()) 1836 return false; 1837 1838 auto SegmentNull = 1839 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1840 auto FlatNull = 1841 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1842 1843 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1844 if (!ApertureReg.isValid()) 1845 return false; 1846 1847 auto CmpRes = 1848 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1849 1850 // Coerce the type of the low half of the result so we can use merge_values. 1851 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1852 1853 // TODO: Should we allow mismatched types but matching sizes in merges to 1854 // avoid the ptrtoint? 1855 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1856 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1857 1858 MI.eraseFromParent(); 1859 return true; 1860 } 1861 1862 bool AMDGPULegalizerInfo::legalizeFrint( 1863 MachineInstr &MI, MachineRegisterInfo &MRI, 1864 MachineIRBuilder &B) const { 1865 Register Src = MI.getOperand(1).getReg(); 1866 LLT Ty = MRI.getType(Src); 1867 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1868 1869 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1870 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1871 1872 auto C1 = B.buildFConstant(Ty, C1Val); 1873 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1874 1875 // TODO: Should this propagate fast-math-flags? 
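// Standard round-to-integer trick: for |x| < 2^52, x +/- 2^52 lands in a
// range where every f64 is an integer, so the add/sub pair below rounds x
// using the current rounding mode (copysign keeps the bias on the same side
// as x). Inputs with |x| > 0x1.fffffffffffffp+51 are already integral and
// are passed through unchanged by the final select.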
1876 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1877 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1878 1879 auto C2 = B.buildFConstant(Ty, C2Val); 1880 auto Fabs = B.buildFAbs(Ty, Src); 1881 1882 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1883 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1884 MI.eraseFromParent(); 1885 return true; 1886 } 1887 1888 bool AMDGPULegalizerInfo::legalizeFceil( 1889 MachineInstr &MI, MachineRegisterInfo &MRI, 1890 MachineIRBuilder &B) const { 1891 1892 const LLT S1 = LLT::scalar(1); 1893 const LLT S64 = LLT::scalar(64); 1894 1895 Register Src = MI.getOperand(1).getReg(); 1896 assert(MRI.getType(Src) == S64); 1897 1898 // result = trunc(src) 1899 // if (src > 0.0 && src != result) 1900 // result += 1.0 1901 1902 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1903 1904 const auto Zero = B.buildFConstant(S64, 0.0); 1905 const auto One = B.buildFConstant(S64, 1.0); 1906 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1907 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1908 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1909 auto Add = B.buildSelect(S64, And, One, Zero); 1910 1911 // TODO: Should this propagate fast-math-flags? 1912 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1913 return true; 1914 } 1915 1916 bool AMDGPULegalizerInfo::legalizeFrem( 1917 MachineInstr &MI, MachineRegisterInfo &MRI, 1918 MachineIRBuilder &B) const { 1919 Register DstReg = MI.getOperand(0).getReg(); 1920 Register Src0Reg = MI.getOperand(1).getReg(); 1921 Register Src1Reg = MI.getOperand(2).getReg(); 1922 auto Flags = MI.getFlags(); 1923 LLT Ty = MRI.getType(DstReg); 1924 1925 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1926 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1927 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1928 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1929 MI.eraseFromParent(); 1930 return true; 1931 } 1932 1933 static MachineInstrBuilder extractF64Exponent(Register Hi, 1934 MachineIRBuilder &B) { 1935 const unsigned FractBits = 52; 1936 const unsigned ExpBits = 11; 1937 LLT S32 = LLT::scalar(32); 1938 1939 auto Const0 = B.buildConstant(S32, FractBits - 32); 1940 auto Const1 = B.buildConstant(S32, ExpBits); 1941 1942 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1943 .addUse(Hi) 1944 .addUse(Const0.getReg(0)) 1945 .addUse(Const1.getReg(0)); 1946 1947 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1948 } 1949 1950 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1951 MachineInstr &MI, MachineRegisterInfo &MRI, 1952 MachineIRBuilder &B) const { 1953 const LLT S1 = LLT::scalar(1); 1954 const LLT S32 = LLT::scalar(32); 1955 const LLT S64 = LLT::scalar(64); 1956 1957 Register Src = MI.getOperand(1).getReg(); 1958 assert(MRI.getType(Src) == S64); 1959 1960 // TODO: Should this use extract since the low half is unused? 1961 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1962 Register Hi = Unmerge.getReg(1); 1963 1964 // Extract the upper half, since this is where we will find the sign and 1965 // exponent. 1966 auto Exp = extractF64Exponent(Hi, B); 1967 1968 const unsigned FractBits = 52; 1969 1970 // Extract the sign bit. 1971 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1972 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1973 1974 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1975 1976 const auto Zero32 = B.buildConstant(S32, 0); 1977 1978 // Extend back to 64-bits. 
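// Truncation is done with integer ops on the f64 bit pattern: FractMask >> Exp
// leaves set bits exactly where the fractional part of the value lives, and
// the And below clears them. If Exp < 0 then |x| < 1 and the result is +/-0
// (just the sign bit merged with a zero low half); if Exp > 51 the value has
// no fractional bits and is returned as-is.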
1979 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1980 1981 auto Shr = B.buildAShr(S64, FractMask, Exp); 1982 auto Not = B.buildNot(S64, Shr); 1983 auto Tmp0 = B.buildAnd(S64, Src, Not); 1984 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1985 1986 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1987 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1988 1989 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1990 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1991 MI.eraseFromParent(); 1992 return true; 1993 } 1994 1995 bool AMDGPULegalizerInfo::legalizeITOFP( 1996 MachineInstr &MI, MachineRegisterInfo &MRI, 1997 MachineIRBuilder &B, bool Signed) const { 1998 1999 Register Dst = MI.getOperand(0).getReg(); 2000 Register Src = MI.getOperand(1).getReg(); 2001 2002 const LLT S64 = LLT::scalar(64); 2003 const LLT S32 = LLT::scalar(32); 2004 2005 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2006 2007 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2008 2009 auto CvtHi = Signed ? 2010 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2011 B.buildUITOFP(S64, Unmerge.getReg(1)); 2012 2013 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2014 2015 auto ThirtyTwo = B.buildConstant(S32, 32); 2016 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2017 .addUse(CvtHi.getReg(0)) 2018 .addUse(ThirtyTwo.getReg(0)); 2019 2020 // TODO: Should this propagate fast-math-flags? 2021 B.buildFAdd(Dst, LdExp, CvtLo); 2022 MI.eraseFromParent(); 2023 return true; 2024 } 2025 2026 // TODO: Copied from DAG implementation. Verify logic and document how this 2027 // actually works. 2028 bool AMDGPULegalizerInfo::legalizeFPTOI( 2029 MachineInstr &MI, MachineRegisterInfo &MRI, 2030 MachineIRBuilder &B, bool Signed) const { 2031 2032 Register Dst = MI.getOperand(0).getReg(); 2033 Register Src = MI.getOperand(1).getReg(); 2034 2035 const LLT S64 = LLT::scalar(64); 2036 const LLT S32 = LLT::scalar(32); 2037 2038 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2039 2040 unsigned Flags = MI.getFlags(); 2041 2042 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2043 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2044 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2045 2046 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2047 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2048 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2049 2050 auto Hi = Signed ? 
2051 B.buildFPTOSI(S32, FloorMul) : 2052 B.buildFPTOUI(S32, FloorMul); 2053 auto Lo = B.buildFPTOUI(S32, Fma); 2054 2055 B.buildMerge(Dst, { Lo, Hi }); 2056 MI.eraseFromParent(); 2057 2058 return true; 2059 } 2060 2061 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2062 MachineInstr &MI) const { 2063 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2064 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2065 2066 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2067 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2068 2069 // With ieee_mode disabled, the instructions have the correct behavior 2070 // already for G_FMINNUM/G_FMAXNUM 2071 if (!MFI->getMode().IEEE) 2072 return !IsIEEEOp; 2073 2074 if (IsIEEEOp) 2075 return true; 2076 2077 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2078 } 2079 2080 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2081 MachineInstr &MI, MachineRegisterInfo &MRI, 2082 MachineIRBuilder &B) const { 2083 // TODO: Should move some of this into LegalizerHelper. 2084 2085 // TODO: Promote dynamic indexing of s16 to s32 2086 2087 // FIXME: Artifact combiner probably should have replaced the truncated 2088 // constant before this, so we shouldn't need 2089 // getConstantVRegValWithLookThrough. 2090 Optional<ValueAndVReg> MaybeIdxVal = 2091 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2092 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2093 return true; 2094 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2095 2096 Register Dst = MI.getOperand(0).getReg(); 2097 Register Vec = MI.getOperand(1).getReg(); 2098 2099 LLT VecTy = MRI.getType(Vec); 2100 LLT EltTy = VecTy.getElementType(); 2101 assert(EltTy == MRI.getType(Dst)); 2102 2103 if (IdxVal < VecTy.getNumElements()) 2104 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2105 else 2106 B.buildUndef(Dst); 2107 2108 MI.eraseFromParent(); 2109 return true; 2110 } 2111 2112 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2113 MachineInstr &MI, MachineRegisterInfo &MRI, 2114 MachineIRBuilder &B) const { 2115 // TODO: Should move some of this into LegalizerHelper. 2116 2117 // TODO: Promote dynamic indexing of s16 to s32 2118 2119 // FIXME: Artifact combiner probably should have replaced the truncated 2120 // constant before this, so we shouldn't need 2121 // getConstantVRegValWithLookThrough. 2122 Optional<ValueAndVReg> MaybeIdxVal = 2123 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2124 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2125 return true; 2126 2127 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2128 Register Dst = MI.getOperand(0).getReg(); 2129 Register Vec = MI.getOperand(1).getReg(); 2130 Register Ins = MI.getOperand(2).getReg(); 2131 2132 LLT VecTy = MRI.getType(Vec); 2133 LLT EltTy = VecTy.getElementType(); 2134 assert(EltTy == MRI.getType(Ins)); 2135 2136 if (IdxVal < VecTy.getNumElements()) 2137 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2138 else 2139 B.buildUndef(Dst); 2140 2141 MI.eraseFromParent(); 2142 return true; 2143 } 2144 2145 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2146 MachineInstr &MI, MachineRegisterInfo &MRI, 2147 MachineIRBuilder &B) const { 2148 const LLT V2S16 = LLT::vector(2, 16); 2149 2150 Register Dst = MI.getOperand(0).getReg(); 2151 Register Src0 = MI.getOperand(1).getReg(); 2152 LLT DstTy = MRI.getType(Dst); 2153 LLT SrcTy = MRI.getType(Src0); 2154 2155 if (SrcTy == V2S16 && DstTy == V2S16 && 2156 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2157 return true; 2158 2159 MachineIRBuilder HelperBuilder(MI); 2160 GISelObserverWrapper DummyObserver; 2161 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2162 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2163 } 2164 2165 bool AMDGPULegalizerInfo::legalizeSinCos( 2166 MachineInstr &MI, MachineRegisterInfo &MRI, 2167 MachineIRBuilder &B) const { 2168 2169 Register DstReg = MI.getOperand(0).getReg(); 2170 Register SrcReg = MI.getOperand(1).getReg(); 2171 LLT Ty = MRI.getType(DstReg); 2172 unsigned Flags = MI.getFlags(); 2173 2174 Register TrigVal; 2175 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2176 if (ST.hasTrigReducedRange()) { 2177 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2178 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2179 .addUse(MulVal.getReg(0)) 2180 .setMIFlags(Flags).getReg(0); 2181 } else 2182 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2183 2184 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2185 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2186 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2187 .addUse(TrigVal) 2188 .setMIFlags(Flags); 2189 MI.eraseFromParent(); 2190 return true; 2191 } 2192 2193 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2194 MachineIRBuilder &B, 2195 const GlobalValue *GV, 2196 int64_t Offset, 2197 unsigned GAFlags) const { 2198 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2199 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2200 // to the following code sequence: 2201 // 2202 // For constant address space: 2203 // s_getpc_b64 s[0:1] 2204 // s_add_u32 s0, s0, $symbol 2205 // s_addc_u32 s1, s1, 0 2206 // 2207 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2208 // a fixup or relocation is emitted to replace $symbol with a literal 2209 // constant, which is a pc-relative offset from the encoding of the $symbol 2210 // operand to the global variable. 
2211 // 2212 // For global address space: 2213 // s_getpc_b64 s[0:1] 2214 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2215 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2216 // 2217 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2218 // fixups or relocations are emitted to replace $symbol@*@lo and 2219 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2220 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2221 // operand to the global variable. 2222 // 2223 // What we want here is an offset from the value returned by s_getpc 2224 // (which is the address of the s_add_u32 instruction) to the global 2225 // variable, but since the encoding of $symbol starts 4 bytes after the start 2226 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2227 // small. This requires us to add 4 to the global variable offset in order to 2228 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2229 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2230 // instruction. 2231 2232 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2233 2234 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2235 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2236 2237 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2238 .addDef(PCReg); 2239 2240 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2241 if (GAFlags == SIInstrInfo::MO_NONE) 2242 MIB.addImm(0); 2243 else 2244 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2245 2246 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2247 2248 if (PtrTy.getSizeInBits() == 32) 2249 B.buildExtract(DstReg, PCReg, 0); 2250 return true; 2251 } 2252 2253 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2254 MachineInstr &MI, MachineRegisterInfo &MRI, 2255 MachineIRBuilder &B) const { 2256 Register DstReg = MI.getOperand(0).getReg(); 2257 LLT Ty = MRI.getType(DstReg); 2258 unsigned AS = Ty.getAddressSpace(); 2259 2260 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2261 MachineFunction &MF = B.getMF(); 2262 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2263 2264 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2265 if (!MFI->isModuleEntryFunction()) { 2266 const Function &Fn = MF.getFunction(); 2267 DiagnosticInfoUnsupported BadLDSDecl( 2268 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2269 DS_Warning); 2270 Fn.getContext().diagnose(BadLDSDecl); 2271 2272 // We currently don't have a way to correctly allocate LDS objects that 2273 // aren't directly associated with a kernel. We do force inlining of 2274 // functions that use local objects. However, if these dead functions are 2275 // not eliminated, we don't want a compile time error. Just emit a warning 2276 // and a trap, since there should be no callable path here. 2277 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2278 B.buildUndef(DstReg); 2279 MI.eraseFromParent(); 2280 return true; 2281 } 2282 2283 // TODO: We could emit code to handle the initialization somewhere. 
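// LDS (and region) globals are lowered to a constant byte offset within the
// group segment, assigned by allocateLDSGlobal(). The exceptions handled
// below are globals that should instead be addressed through an ABS32
// relocation, and externally visible zero-sized LDS arrays (dynamic shared
// memory), whose address is the total static LDS size reported by
// llvm.amdgcn.groupstaticsize.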
2284 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2285 const SITargetLowering *TLI = ST.getTargetLowering(); 2286 if (!TLI->shouldUseLDSConstAddress(GV)) { 2287 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2288 return true; // Leave in place; 2289 } 2290 2291 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2292 Type *Ty = GV->getValueType(); 2293 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2294 // zero-sized type in other languages to declare the dynamic shared 2295 // memory which size is not known at the compile time. They will be 2296 // allocated by the runtime and placed directly after the static 2297 // allocated ones. They all share the same offset. 2298 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2299 // Adjust alignment for that dynamic shared memory array. 2300 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2301 LLT S32 = LLT::scalar(32); 2302 auto Sz = 2303 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2304 B.buildIntToPtr(DstReg, Sz); 2305 MI.eraseFromParent(); 2306 return true; 2307 } 2308 } 2309 2310 B.buildConstant( 2311 DstReg, 2312 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2313 MI.eraseFromParent(); 2314 return true; 2315 } 2316 2317 const Function &Fn = MF.getFunction(); 2318 DiagnosticInfoUnsupported BadInit( 2319 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2320 Fn.getContext().diagnose(BadInit); 2321 return true; 2322 } 2323 2324 const SITargetLowering *TLI = ST.getTargetLowering(); 2325 2326 if (TLI->shouldEmitFixup(GV)) { 2327 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2328 MI.eraseFromParent(); 2329 return true; 2330 } 2331 2332 if (TLI->shouldEmitPCReloc(GV)) { 2333 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2334 MI.eraseFromParent(); 2335 return true; 2336 } 2337 2338 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2339 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2340 2341 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2342 MachinePointerInfo::getGOT(MF), 2343 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2344 MachineMemOperand::MOInvariant, 2345 8 /*Size*/, Align(8)); 2346 2347 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2348 2349 if (Ty.getSizeInBits() == 32) { 2350 // Truncate if this is a 32-bit constant adrdess. 
2351 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2352 B.buildExtract(DstReg, Load, 0); 2353 } else 2354 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2355 2356 MI.eraseFromParent(); 2357 return true; 2358 } 2359 2360 static LLT widenToNextPowerOf2(LLT Ty) { 2361 if (Ty.isVector()) 2362 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2363 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2364 } 2365 2366 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2367 MachineInstr &MI) const { 2368 MachineIRBuilder &B = Helper.MIRBuilder; 2369 MachineRegisterInfo &MRI = *B.getMRI(); 2370 GISelChangeObserver &Observer = Helper.Observer; 2371 2372 Register PtrReg = MI.getOperand(1).getReg(); 2373 LLT PtrTy = MRI.getType(PtrReg); 2374 unsigned AddrSpace = PtrTy.getAddressSpace(); 2375 2376 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2377 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2378 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2379 Observer.changingInstr(MI); 2380 MI.getOperand(1).setReg(Cast.getReg(0)); 2381 Observer.changedInstr(MI); 2382 return true; 2383 } 2384 2385 Register ValReg = MI.getOperand(0).getReg(); 2386 LLT ValTy = MRI.getType(ValReg); 2387 2388 MachineMemOperand *MMO = *MI.memoperands_begin(); 2389 const unsigned ValSize = ValTy.getSizeInBits(); 2390 const unsigned MemSize = 8 * MMO->getSize(); 2391 const Align MemAlign = MMO->getAlign(); 2392 const unsigned AlignInBits = 8 * MemAlign.value(); 2393 2394 // Widen non-power-of-2 loads to the alignment if needed 2395 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2396 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2397 2398 // This was already the correct extending load result type, so just adjust 2399 // the memory type. 2400 if (WideMemSize == ValSize) { 2401 MachineFunction &MF = B.getMF(); 2402 2403 MachineMemOperand *WideMMO = 2404 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2405 Observer.changingInstr(MI); 2406 MI.setMemRefs(MF, {WideMMO}); 2407 Observer.changedInstr(MI); 2408 return true; 2409 } 2410 2411 // Don't bother handling edge case that should probably never be produced. 2412 if (ValSize > WideMemSize) 2413 return false; 2414 2415 LLT WideTy = widenToNextPowerOf2(ValTy); 2416 2417 Register WideLoad; 2418 if (!WideTy.isVector()) { 2419 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2420 B.buildTrunc(ValReg, WideLoad).getReg(0); 2421 } else { 2422 // Extract the subvector. 2423 2424 if (isRegisterType(ValTy)) { 2425 // If this a case where G_EXTRACT is legal, use it. 2426 // (e.g. <3 x s32> -> <4 x s32>) 2427 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2428 B.buildExtract(ValReg, WideLoad, 0); 2429 } else { 2430 // For cases where the widened type isn't a nice register value, unmerge 2431 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2432 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2433 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2434 B.setInsertPt(B.getMBB(), MI.getIterator()); 2435 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2436 } 2437 } 2438 2439 MI.eraseFromParent(); 2440 return true; 2441 } 2442 2443 return false; 2444 } 2445 2446 bool AMDGPULegalizerInfo::legalizeFMad( 2447 MachineInstr &MI, MachineRegisterInfo &MRI, 2448 MachineIRBuilder &B) const { 2449 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2450 assert(Ty.isScalar()); 2451 2452 MachineFunction &MF = B.getMF(); 2453 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2454 2455 // TODO: Always legal with future ftz flag. 2456 // FIXME: Do we need just output? 2457 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2458 return true; 2459 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2460 return true; 2461 2462 MachineIRBuilder HelperBuilder(MI); 2463 GISelObserverWrapper DummyObserver; 2464 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2465 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2466 } 2467 2468 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2469 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2470 Register DstReg = MI.getOperand(0).getReg(); 2471 Register PtrReg = MI.getOperand(1).getReg(); 2472 Register CmpVal = MI.getOperand(2).getReg(); 2473 Register NewVal = MI.getOperand(3).getReg(); 2474 2475 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2476 "this should not have been custom lowered"); 2477 2478 LLT ValTy = MRI.getType(CmpVal); 2479 LLT VecTy = LLT::vector(2, ValTy); 2480 2481 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2482 2483 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2484 .addDef(DstReg) 2485 .addUse(PtrReg) 2486 .addUse(PackedVal) 2487 .setMemRefs(MI.memoperands()); 2488 2489 MI.eraseFromParent(); 2490 return true; 2491 } 2492 2493 bool AMDGPULegalizerInfo::legalizeFlog( 2494 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2495 Register Dst = MI.getOperand(0).getReg(); 2496 Register Src = MI.getOperand(1).getReg(); 2497 LLT Ty = B.getMRI()->getType(Dst); 2498 unsigned Flags = MI.getFlags(); 2499 2500 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2501 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2502 2503 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2504 MI.eraseFromParent(); 2505 return true; 2506 } 2507 2508 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2509 MachineIRBuilder &B) const { 2510 Register Dst = MI.getOperand(0).getReg(); 2511 Register Src = MI.getOperand(1).getReg(); 2512 unsigned Flags = MI.getFlags(); 2513 LLT Ty = B.getMRI()->getType(Dst); 2514 2515 auto K = B.buildFConstant(Ty, numbers::log2e); 2516 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2517 B.buildFExp2(Dst, Mul, Flags); 2518 MI.eraseFromParent(); 2519 return true; 2520 } 2521 2522 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2523 MachineIRBuilder &B) const { 2524 Register Dst = MI.getOperand(0).getReg(); 2525 Register Src0 = MI.getOperand(1).getReg(); 2526 Register Src1 = MI.getOperand(2).getReg(); 2527 unsigned Flags = MI.getFlags(); 2528 LLT Ty = B.getMRI()->getType(Dst); 2529 const LLT S16 = LLT::scalar(16); 2530 const LLT S32 = LLT::scalar(32); 2531 2532 if (Ty == S32) { 2533 auto Log = B.buildFLog2(S32, Src0, Flags); 2534 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2535 .addUse(Log.getReg(0)) 2536 .addUse(Src1) 2537 .setMIFlags(Flags); 2538 B.buildFExp2(Dst, Mul, Flags); 2539 } else if (Ty == S16) { 2540 // There's no f16 fmul_legacy, so we need to convert for it. 2541 auto Log = B.buildFLog2(S16, Src0, Flags); 2542 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2543 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2544 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2545 .addUse(Ext0.getReg(0)) 2546 .addUse(Ext1.getReg(0)) 2547 .setMIFlags(Flags); 2548 2549 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2550 } else 2551 return false; 2552 2553 MI.eraseFromParent(); 2554 return true; 2555 } 2556 2557 // Find a source register, ignoring any possible source modifiers. 2558 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2559 Register ModSrc = OrigSrc; 2560 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2561 ModSrc = SrcFNeg->getOperand(1).getReg(); 2562 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2563 ModSrc = SrcFAbs->getOperand(1).getReg(); 2564 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2565 ModSrc = SrcFAbs->getOperand(1).getReg(); 2566 return ModSrc; 2567 } 2568 2569 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2570 MachineRegisterInfo &MRI, 2571 MachineIRBuilder &B) const { 2572 2573 const LLT S1 = LLT::scalar(1); 2574 const LLT S64 = LLT::scalar(64); 2575 Register Dst = MI.getOperand(0).getReg(); 2576 Register OrigSrc = MI.getOperand(1).getReg(); 2577 unsigned Flags = MI.getFlags(); 2578 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2579 "this should not have been custom lowered"); 2580 2581 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2582 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2583 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2584 // V_FRACT bug is: 2585 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2586 // 2587 // Convert floor(x) to (x - fract(x)) 2588 2589 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2590 .addUse(OrigSrc) 2591 .setMIFlags(Flags); 2592 2593 // Give source modifier matching some assistance before obscuring a foldable 2594 // pattern. 2595 2596 // TODO: We can avoid the neg on the fract? The input sign to fract 2597 // shouldn't matter? 2598 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2599 2600 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2601 2602 Register Min = MRI.createGenericVirtualRegister(S64); 2603 2604 // We don't need to concern ourselves with the snan handling difference, so 2605 // use the one which will directly select. 
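// The clamp constant above, 0x3fefffffffffffff, is the largest f64 strictly
// less than 1.0 (1.0 - 2^-53); min(fract(x), Const) below keeps the result
// of the V_FRACT workaround below 1.0.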
2606 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2607 if (MFI->getMode().IEEE) 2608 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2609 else 2610 B.buildFMinNum(Min, Fract, Const, Flags); 2611 2612 Register CorrectedFract = Min; 2613 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2614 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2615 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2616 } 2617 2618 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2619 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2620 2621 MI.eraseFromParent(); 2622 return true; 2623 } 2624 2625 // Turn an illegal packed v2s16 build vector into bit operations. 2626 // TODO: This should probably be a bitcast action in LegalizerHelper. 2627 bool AMDGPULegalizerInfo::legalizeBuildVector( 2628 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2629 Register Dst = MI.getOperand(0).getReg(); 2630 const LLT S32 = LLT::scalar(32); 2631 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2632 2633 Register Src0 = MI.getOperand(1).getReg(); 2634 Register Src1 = MI.getOperand(2).getReg(); 2635 assert(MRI.getType(Src0) == LLT::scalar(16)); 2636 2637 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2638 B.buildBitcast(Dst, Merge); 2639 2640 MI.eraseFromParent(); 2641 return true; 2642 } 2643 2644 // Check that this is a G_XOR x, -1 2645 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2646 if (MI.getOpcode() != TargetOpcode::G_XOR) 2647 return false; 2648 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2649 return ConstVal && *ConstVal == -1; 2650 } 2651 2652 // Return the use branch instruction, otherwise null if the usage is invalid. 2653 static MachineInstr * 2654 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2655 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2656 Register CondDef = MI.getOperand(0).getReg(); 2657 if (!MRI.hasOneNonDBGUse(CondDef)) 2658 return nullptr; 2659 2660 MachineBasicBlock *Parent = MI.getParent(); 2661 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2662 2663 if (isNot(MRI, *UseMI)) { 2664 Register NegatedCond = UseMI->getOperand(0).getReg(); 2665 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2666 return nullptr; 2667 2668 // We're deleting the def of this value, so we need to remove it. 2669 UseMI->eraseFromParent(); 2670 2671 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2672 Negated = true; 2673 } 2674 2675 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2676 return nullptr; 2677 2678 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2679 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2680 if (Next == Parent->end()) { 2681 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2682 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2683 return nullptr; 2684 UncondBrTarget = &*NextMBB; 2685 } else { 2686 if (Next->getOpcode() != AMDGPU::G_BR) 2687 return nullptr; 2688 Br = &*Next; 2689 UncondBrTarget = Br->getOperand(0).getMBB(); 2690 } 2691 2692 return UseMI; 2693 } 2694 2695 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2696 const ArgDescriptor *Arg, 2697 const TargetRegisterClass *ArgRC, 2698 LLT ArgTy) const { 2699 MCRegister SrcReg = Arg->getRegister(); 2700 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2701 assert(DstReg.isVirtual() && "Virtual register expected"); 2702 2703 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2704 ArgTy); 2705 if (Arg->isMasked()) { 2706 // TODO: Should we try to emit this once in the entry block? 2707 const LLT S32 = LLT::scalar(32); 2708 const unsigned Mask = Arg->getMask(); 2709 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2710 2711 Register AndMaskSrc = LiveIn; 2712 2713 if (Shift != 0) { 2714 auto ShiftAmt = B.buildConstant(S32, Shift); 2715 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2716 } 2717 2718 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2719 } else { 2720 B.buildCopy(DstReg, LiveIn); 2721 } 2722 2723 return true; 2724 } 2725 2726 bool AMDGPULegalizerInfo::loadInputValue( 2727 Register DstReg, MachineIRBuilder &B, 2728 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2729 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2730 const ArgDescriptor *Arg; 2731 const TargetRegisterClass *ArgRC; 2732 LLT ArgTy; 2733 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2734 2735 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2736 return false; // TODO: Handle these 2737 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2738 } 2739 2740 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2741 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2742 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2743 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2744 return false; 2745 2746 MI.eraseFromParent(); 2747 return true; 2748 } 2749 2750 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2751 MachineRegisterInfo &MRI, 2752 MachineIRBuilder &B) const { 2753 Register Dst = MI.getOperand(0).getReg(); 2754 LLT DstTy = MRI.getType(Dst); 2755 LLT S16 = LLT::scalar(16); 2756 LLT S32 = LLT::scalar(32); 2757 LLT S64 = LLT::scalar(64); 2758 2759 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2760 return true; 2761 2762 if (DstTy == S16) 2763 return legalizeFDIV16(MI, MRI, B); 2764 if (DstTy == S32) 2765 return legalizeFDIV32(MI, MRI, B); 2766 if (DstTy == S64) 2767 return legalizeFDIV64(MI, MRI, B); 2768 2769 return false; 2770 } 2771 2772 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2773 Register DstReg, 2774 Register X, 2775 Register Y, 2776 bool IsDiv) const { 2777 const LLT S1 = LLT::scalar(1); 2778 const LLT S32 = LLT::scalar(32); 2779 2780 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2781 // algorithm used here. 2782 2783 // Initial estimate of inv(y). 2784 auto FloatY = B.buildUITOFP(S32, Y); 2785 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2786 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2787 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2788 auto Z = B.buildFPTOUI(S32, ScaledY); 2789 2790 // One round of UNR. 
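// (unsigned Newton-Raphson): Z approximates 2^32 / Y, NegYZ is roughly the
// scaled error 2^32 - Y * Z (mod 2^32), and Z + umulh(Z, NegYZ) is one
// refinement step toward the exact reciprocal.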
2791 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2792 auto NegYZ = B.buildMul(S32, NegY, Z); 2793 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2794 2795 // Quotient/remainder estimate. 2796 auto Q = B.buildUMulH(S32, X, Z); 2797 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2798 2799 // First quotient/remainder refinement. 2800 auto One = B.buildConstant(S32, 1); 2801 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2802 if (IsDiv) 2803 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2804 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2805 2806 // Second quotient/remainder refinement. 2807 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2808 if (IsDiv) 2809 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2810 else 2811 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2812 } 2813 2814 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2815 MachineRegisterInfo &MRI, 2816 MachineIRBuilder &B) const { 2817 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2818 Register DstReg = MI.getOperand(0).getReg(); 2819 Register Num = MI.getOperand(1).getReg(); 2820 Register Den = MI.getOperand(2).getReg(); 2821 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2822 MI.eraseFromParent(); 2823 return true; 2824 } 2825 2826 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2827 // 2828 // Return lo, hi of result 2829 // 2830 // %cvt.lo = G_UITOFP Val.lo 2831 // %cvt.hi = G_UITOFP Val.hi 2832 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2833 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2834 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2835 // %mul2 = G_FMUL %mul1, 2**(-32) 2836 // %trunc = G_INTRINSIC_TRUNC %mul2 2837 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2838 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2839 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2840 Register Val) { 2841 const LLT S32 = LLT::scalar(32); 2842 auto Unmerge = B.buildUnmerge(S32, Val); 2843 2844 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2845 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2846 2847 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2848 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2849 2850 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2851 auto Mul1 = 2852 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2853 2854 // 2**(-32) 2855 auto Mul2 = 2856 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2857 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2858 2859 // -(2**32) 2860 auto Mad2 = B.buildFMAD(S32, Trunc, 2861 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2862 2863 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2864 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2865 2866 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2867 } 2868 2869 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2870 Register DstReg, 2871 Register Numer, 2872 Register Denom, 2873 bool IsDiv) const { 2874 const LLT S32 = LLT::scalar(32); 2875 const LLT S64 = LLT::scalar(64); 2876 const LLT S1 = LLT::scalar(1); 2877 Register RcpLo, RcpHi; 2878 2879 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2880 2881 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2882 2883 auto Zero64 = B.buildConstant(S64, 0); 2884 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2885 2886 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2887 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2888 2889 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2890 Register 
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2891 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2892 2893 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2894 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2895 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2896 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2897 2898 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2899 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2900 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2901 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2902 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2903 2904 auto Zero32 = B.buildConstant(S32, 0); 2905 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2906 auto Add2_HiC = 2907 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2908 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2909 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2910 2911 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2912 Register NumerLo = UnmergeNumer.getReg(0); 2913 Register NumerHi = UnmergeNumer.getReg(1); 2914 2915 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2916 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2917 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2918 Register Mul3_Lo = UnmergeMul3.getReg(0); 2919 Register Mul3_Hi = UnmergeMul3.getReg(1); 2920 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2921 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2922 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2923 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2924 2925 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2926 Register DenomLo = UnmergeDenom.getReg(0); 2927 Register DenomHi = UnmergeDenom.getReg(1); 2928 2929 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2930 auto C1 = B.buildSExt(S32, CmpHi); 2931 2932 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2933 auto C2 = B.buildSExt(S32, CmpLo); 2934 2935 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2936 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2937 2938 // TODO: Here and below portions of the code can be enclosed into if/endif. 2939 // Currently control flow is unconditional and we have 4 selects after 2940 // potential endif to substitute PHIs. 2941 2942 // if C3 != 0 ... 
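// The quotient estimate MulHi3 (and remainder Sub1) may still be short, so
// two conditional correction rounds follow: Sub2/Add3 subtract the
// denominator and bump the quotient once, Sub3/Add4 do it a second time, and
// the selects at the end pick the right version based on C3 and C6.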
2943 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2944 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2945 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2946 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2947 2948 auto One64 = B.buildConstant(S64, 1); 2949 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2950 2951 auto C4 = 2952 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2953 auto C5 = 2954 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2955 auto C6 = B.buildSelect( 2956 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2957 2958 // if (C6 != 0) 2959 auto Add4 = B.buildAdd(S64, Add3, One64); 2960 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2961 2962 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2963 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2964 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2965 2966 // endif C6 2967 // endif C3 2968 2969 if (IsDiv) { 2970 auto Sel1 = B.buildSelect( 2971 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2972 B.buildSelect(DstReg, 2973 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2974 } else { 2975 auto Sel2 = B.buildSelect( 2976 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2977 B.buildSelect(DstReg, 2978 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2979 } 2980 } 2981 2982 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2983 MachineRegisterInfo &MRI, 2984 MachineIRBuilder &B) const { 2985 const LLT S64 = LLT::scalar(64); 2986 const LLT S32 = LLT::scalar(32); 2987 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2988 Register DstReg = MI.getOperand(0).getReg(); 2989 Register Num = MI.getOperand(1).getReg(); 2990 Register Den = MI.getOperand(2).getReg(); 2991 LLT Ty = MRI.getType(DstReg); 2992 2993 if (Ty == S32) 2994 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2995 else if (Ty == S64) 2996 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2997 else 2998 return false; 2999 3000 MI.eraseFromParent(); 3001 return true; 3002 3003 } 3004 3005 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 3006 MachineRegisterInfo &MRI, 3007 MachineIRBuilder &B) const { 3008 const LLT S64 = LLT::scalar(64); 3009 const LLT S32 = LLT::scalar(32); 3010 3011 Register DstReg = MI.getOperand(0).getReg(); 3012 const LLT Ty = MRI.getType(DstReg); 3013 if (Ty != S32 && Ty != S64) 3014 return false; 3015 3016 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3017 3018 Register LHS = MI.getOperand(1).getReg(); 3019 Register RHS = MI.getOperand(2).getReg(); 3020 3021 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3022 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3023 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3024 3025 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3026 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3027 3028 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3029 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3030 3031 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3032 if (Ty == S32) 3033 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3034 else 3035 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3036 3037 Register Sign; 3038 if (IsDiv) 3039 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3040 else 3041 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3042 3043 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3044 B.buildSub(DstReg, UDivRem, Sign); 3045 3046 MI.eraseFromParent(); 3047 return true; 3048 } 3049 3050 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3051 MachineRegisterInfo &MRI, 3052 MachineIRBuilder &B) const { 3053 Register Res = MI.getOperand(0).getReg(); 3054 Register LHS = MI.getOperand(1).getReg(); 3055 Register RHS = MI.getOperand(2).getReg(); 3056 3057 uint16_t Flags = MI.getFlags(); 3058 3059 LLT ResTy = MRI.getType(Res); 3060 LLT S32 = LLT::scalar(32); 3061 LLT S64 = LLT::scalar(64); 3062 3063 const MachineFunction &MF = B.getMF(); 3064 bool Unsafe = 3065 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 3066 3067 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 3068 return false; 3069 3070 if (!Unsafe && ResTy == S32 && 3071 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 3072 return false; 3073 3074 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3075 // 1 / x -> RCP(x) 3076 if (CLHS->isExactlyValue(1.0)) { 3077 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3078 .addUse(RHS) 3079 .setMIFlags(Flags); 3080 3081 MI.eraseFromParent(); 3082 return true; 3083 } 3084 3085 // -1 / x -> RCP( FNEG(x) ) 3086 if (CLHS->isExactlyValue(-1.0)) { 3087 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3088 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3089 .addUse(FNeg.getReg(0)) 3090 .setMIFlags(Flags); 3091 3092 MI.eraseFromParent(); 3093 return true; 3094 } 3095 } 3096 3097 // x / y -> x * (1.0 / y) 3098 if (Unsafe) { 3099 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3100 .addUse(RHS) 3101 .setMIFlags(Flags); 3102 B.buildFMul(Res, LHS, RCP, Flags); 3103 3104 MI.eraseFromParent(); 3105 return true; 3106 } 3107 3108 return false; 3109 } 3110 3111 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3112 MachineRegisterInfo &MRI, 3113 MachineIRBuilder &B) const { 3114 Register Res = MI.getOperand(0).getReg(); 3115 Register LHS = MI.getOperand(1).getReg(); 3116 Register RHS = MI.getOperand(2).getReg(); 3117 3118 uint16_t Flags = MI.getFlags(); 3119 3120 LLT S16 = LLT::scalar(16); 3121 LLT S32 = LLT::scalar(32); 3122 3123 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3124 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3125 3126 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3127 .addUse(RHSExt.getReg(0)) 3128 .setMIFlags(Flags); 3129 3130 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3131 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3132 3133 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3134 .addUse(RDst.getReg(0)) 3135 .addUse(RHS) 3136 .addUse(LHS) 3137 .setMIFlags(Flags); 3138 3139 MI.eraseFromParent(); 3140 return true; 3141 } 3142 3143 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3144 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3145 static void toggleSPDenormMode(bool Enable, 3146 MachineIRBuilder &B, 3147 const GCNSubtarget &ST, 3148 AMDGPU::SIModeRegisterDefaults Mode) { 3149 // Set SP denorm mode to this value. 3150 unsigned SPDenormMode = 3151 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3152 3153 if (ST.hasDenormModeInst()) { 3154 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
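// The S_DENORM_MODE immediate packs the FP32 denorm mode in bits [1:0] and
// the FP64/FP16 mode in bits [3:2], so keep the current DP setting and only
// replace the SP field.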
3155 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3156 3157 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3158 B.buildInstr(AMDGPU::S_DENORM_MODE) 3159 .addImm(NewDenormModeValue); 3160 3161 } else { 3162 // Select FP32 bit field in mode register. 3163 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3164 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3165 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3166 3167 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3168 .addImm(SPDenormMode) 3169 .addImm(SPDenormModeBitField); 3170 } 3171 } 3172 3173 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3174 MachineRegisterInfo &MRI, 3175 MachineIRBuilder &B) const { 3176 Register Res = MI.getOperand(0).getReg(); 3177 Register LHS = MI.getOperand(1).getReg(); 3178 Register RHS = MI.getOperand(2).getReg(); 3179 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3180 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3181 3182 uint16_t Flags = MI.getFlags(); 3183 3184 LLT S32 = LLT::scalar(32); 3185 LLT S1 = LLT::scalar(1); 3186 3187 auto One = B.buildFConstant(S32, 1.0f); 3188 3189 auto DenominatorScaled = 3190 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3191 .addUse(LHS) 3192 .addUse(RHS) 3193 .addImm(0) 3194 .setMIFlags(Flags); 3195 auto NumeratorScaled = 3196 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3197 .addUse(LHS) 3198 .addUse(RHS) 3199 .addImm(1) 3200 .setMIFlags(Flags); 3201 3202 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3203 .addUse(DenominatorScaled.getReg(0)) 3204 .setMIFlags(Flags); 3205 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3206 3207 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3208 // aren't modeled as reading it. 
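// This follows the usual div_scale / div_fmas / div_fixup expansion: rcp
// gives an estimate of 1/denominator, Fma0/Fma1 refine it with one
// Newton-Raphson step, Mul..Fma4 refine the quotient, div_fmas re-applies
// the scaling chosen by div_scale, and div_fixup patches up the special
// cases. FP32 denormals are temporarily enabled around the FMA chain so
// intermediate results are not flushed.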
3209 if (!Mode.allFP32Denormals()) 3210 toggleSPDenormMode(true, B, ST, Mode); 3211 3212 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3213 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3214 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3215 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3216 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3217 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3218 3219 if (!Mode.allFP32Denormals()) 3220 toggleSPDenormMode(false, B, ST, Mode); 3221 3222 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3223 .addUse(Fma4.getReg(0)) 3224 .addUse(Fma1.getReg(0)) 3225 .addUse(Fma3.getReg(0)) 3226 .addUse(NumeratorScaled.getReg(1)) 3227 .setMIFlags(Flags); 3228 3229 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3230 .addUse(Fmas.getReg(0)) 3231 .addUse(RHS) 3232 .addUse(LHS) 3233 .setMIFlags(Flags); 3234 3235 MI.eraseFromParent(); 3236 return true; 3237 } 3238 3239 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3240 MachineRegisterInfo &MRI, 3241 MachineIRBuilder &B) const { 3242 Register Res = MI.getOperand(0).getReg(); 3243 Register LHS = MI.getOperand(1).getReg(); 3244 Register RHS = MI.getOperand(2).getReg(); 3245 3246 uint16_t Flags = MI.getFlags(); 3247 3248 LLT S64 = LLT::scalar(64); 3249 LLT S1 = LLT::scalar(1); 3250 3251 auto One = B.buildFConstant(S64, 1.0); 3252 3253 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3254 .addUse(LHS) 3255 .addUse(RHS) 3256 .addImm(0) 3257 .setMIFlags(Flags); 3258 3259 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3260 3261 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3262 .addUse(DivScale0.getReg(0)) 3263 .setMIFlags(Flags); 3264 3265 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3266 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3267 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3268 3269 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3270 .addUse(LHS) 3271 .addUse(RHS) 3272 .addImm(1) 3273 .setMIFlags(Flags); 3274 3275 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3276 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3277 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3278 3279 Register Scale; 3280 if (!ST.hasUsableDivScaleConditionOutput()) { 3281 // Workaround a hardware bug on SI where the condition output from div_scale 3282 // is not usable. 
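// Reconstruct the missing flag by hand: compare the high halves of the
// operands with the corresponding div_scale results to detect whether each
// input was actually scaled, and xor the two tests to recover the condition
// div_fmas expects.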
3283 3284 LLT S32 = LLT::scalar(32); 3285 3286 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3287 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3288 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3289 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3290 3291 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3292 Scale1Unmerge.getReg(1)); 3293 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3294 Scale0Unmerge.getReg(1)); 3295 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3296 } else { 3297 Scale = DivScale1.getReg(1); 3298 } 3299 3300 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3301 .addUse(Fma4.getReg(0)) 3302 .addUse(Fma3.getReg(0)) 3303 .addUse(Mul.getReg(0)) 3304 .addUse(Scale) 3305 .setMIFlags(Flags); 3306 3307 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3308 .addUse(Fmas.getReg(0)) 3309 .addUse(RHS) 3310 .addUse(LHS) 3311 .setMIFlags(Flags); 3312 3313 MI.eraseFromParent(); 3314 return true; 3315 } 3316 3317 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3318 MachineRegisterInfo &MRI, 3319 MachineIRBuilder &B) const { 3320 Register Res = MI.getOperand(0).getReg(); 3321 Register LHS = MI.getOperand(2).getReg(); 3322 Register RHS = MI.getOperand(3).getReg(); 3323 uint16_t Flags = MI.getFlags(); 3324 3325 LLT S32 = LLT::scalar(32); 3326 LLT S1 = LLT::scalar(1); 3327 3328 auto Abs = B.buildFAbs(S32, RHS, Flags); 3329 const APFloat C0Val(1.0f); 3330 3331 auto C0 = B.buildConstant(S32, 0x6f800000); 3332 auto C1 = B.buildConstant(S32, 0x2f800000); 3333 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3334 3335 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3336 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3337 3338 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3339 3340 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3341 .addUse(Mul0.getReg(0)) 3342 .setMIFlags(Flags); 3343 3344 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3345 3346 B.buildFMul(Res, Sel, Mul1, Flags); 3347 3348 MI.eraseFromParent(); 3349 return true; 3350 } 3351 3352 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3353 // FIXME: Why do we handle this one but not other removed instructions? 3354 // 3355 // Reciprocal square root. The clamp prevents infinite results, clamping 3356 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3357 // +-max_float. 3358 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3359 MachineRegisterInfo &MRI, 3360 MachineIRBuilder &B) const { 3361 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3362 return true; 3363 3364 Register Dst = MI.getOperand(0).getReg(); 3365 Register Src = MI.getOperand(2).getReg(); 3366 auto Flags = MI.getFlags(); 3367 3368 LLT Ty = MRI.getType(Dst); 3369 3370 const fltSemantics *FltSemantics; 3371 if (Ty == LLT::scalar(32)) 3372 FltSemantics = &APFloat::IEEEsingle(); 3373 else if (Ty == LLT::scalar(64)) 3374 FltSemantics = &APFloat::IEEEdouble(); 3375 else 3376 return false; 3377 3378 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3379 .addUse(Src) 3380 .setMIFlags(Flags); 3381 3382 // We don't need to concern ourselves with the snan handling difference, since 3383 // the rsq quieted (or not) so use the one which will directly select. 
3384 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3385 const bool UseIEEE = MFI->getMode().IEEE; 3386 3387 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3388 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3389 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3390 3391 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3392 3393 if (UseIEEE) 3394 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3395 else 3396 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3397 MI.eraseFromParent(); 3398 return true; 3399 } 3400 3401 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3402 switch (IID) { 3403 case Intrinsic::amdgcn_ds_fadd: 3404 return AMDGPU::G_ATOMICRMW_FADD; 3405 case Intrinsic::amdgcn_ds_fmin: 3406 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3407 case Intrinsic::amdgcn_ds_fmax: 3408 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3409 default: 3410 llvm_unreachable("not a DS FP intrinsic"); 3411 } 3412 } 3413 3414 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3415 MachineInstr &MI, 3416 Intrinsic::ID IID) const { 3417 GISelChangeObserver &Observer = Helper.Observer; 3418 Observer.changingInstr(MI); 3419 3420 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3421 3422 // The remaining operands were used to set fields in the MemOperand on 3423 // construction. 3424 for (int I = 6; I > 3; --I) 3425 MI.RemoveOperand(I); 3426 3427 MI.RemoveOperand(1); // Remove the intrinsic ID. 3428 Observer.changedInstr(MI); 3429 return true; 3430 } 3431 3432 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3433 MachineRegisterInfo &MRI, 3434 MachineIRBuilder &B) const { 3435 uint64_t Offset = 3436 ST.getTargetLowering()->getImplicitParameterOffset( 3437 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3438 LLT DstTy = MRI.getType(DstReg); 3439 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3440 3441 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3442 if (!loadInputValue(KernargPtrReg, B, 3443 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3444 return false; 3445 3446 // FIXME: This should be nuw 3447 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3448 return true; 3449 } 3450 3451 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3452 MachineRegisterInfo &MRI, 3453 MachineIRBuilder &B) const { 3454 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3455 if (!MFI->isEntryFunction()) { 3456 return legalizePreloadedArgIntrin(MI, MRI, B, 3457 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3458 } 3459 3460 Register DstReg = MI.getOperand(0).getReg(); 3461 if (!getImplicitArgPtr(DstReg, MRI, B)) 3462 return false; 3463 3464 MI.eraseFromParent(); 3465 return true; 3466 } 3467 3468 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3469 MachineRegisterInfo &MRI, 3470 MachineIRBuilder &B, 3471 unsigned AddrSpace) const { 3472 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3473 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3474 Register Hi32 = Unmerge.getReg(1); 3475 3476 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3477 MI.eraseFromParent(); 3478 return true; 3479 } 3480 3481 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3482 // offset (the offset that is included in bounds checking and swizzling, to be 3483 // split between the instruction's voffset and immoffset fields) 
and soffset
3484 // (the offset that is excluded from bounds checking and swizzling, to go in
3485 // the instruction's soffset field). This function takes the first kind of
3486 // offset and figures out how to split it between voffset and immoffset.
3487 std::tuple<Register, unsigned, unsigned>
3488 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3489                                         Register OrigOffset) const {
3490   const unsigned MaxImm = 4095;
3491   Register BaseReg;
3492   unsigned TotalConstOffset;
3493   const LLT S32 = LLT::scalar(32);
3494
3495   std::tie(BaseReg, TotalConstOffset) =
3496       AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3497
3498   unsigned ImmOffset = TotalConstOffset;
3499
3500   // If the constant offset is too big for the immoffset field, keep only its
3501   // low 12 bits in immoffset (e.g. 8200 -> immoffset 8 + voffset add of 8192),
3502   // so the part copied/added into the voffset field is a multiple of 4096 and
3503   // stands more chance of being CSEd with the copy/add for another load/store.
3504   // However, do not do that rounding down to a multiple of 4096 if that is a
3505   // negative number, as it appears to be illegal to have a negative offset
3506   // in the vgpr, even if adding the immediate offset makes it positive.
3507   unsigned Overflow = ImmOffset & ~MaxImm;
3508   ImmOffset -= Overflow;
3509   if ((int32_t)Overflow < 0) {
3510     Overflow += ImmOffset;
3511     ImmOffset = 0;
3512   }
3513
3514   if (Overflow != 0) {
3515     if (!BaseReg) {
3516       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3517     } else {
3518       auto OverflowVal = B.buildConstant(S32, Overflow);
3519       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3520     }
3521   }
3522
3523   if (!BaseReg)
3524     BaseReg = B.buildConstant(S32, 0).getReg(0);
3525
3526   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3527 }
3528
3529 /// Handle register layout difference for f16 images for some subtargets.
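/// On subtargets with unpacked D16 VMEM, each 16-bit element lives in the low
/// half of its own 32-bit register, so packed <N x s16> data is unmerged and
/// any-extended into an <N x s32> build_vector. Subtargets with the image
/// store D16 bug instead need the packed data padded with undef elements out
/// to the register count the instruction expects.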
3530 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3531 MachineRegisterInfo &MRI, 3532 Register Reg, 3533 bool ImageStore) const { 3534 const LLT S16 = LLT::scalar(16); 3535 const LLT S32 = LLT::scalar(32); 3536 LLT StoreVT = MRI.getType(Reg); 3537 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3538 3539 if (ST.hasUnpackedD16VMem()) { 3540 auto Unmerge = B.buildUnmerge(S16, Reg); 3541 3542 SmallVector<Register, 4> WideRegs; 3543 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3544 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3545 3546 int NumElts = StoreVT.getNumElements(); 3547 3548 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3549 } 3550 3551 if (ImageStore && ST.hasImageStoreD16Bug()) { 3552 if (StoreVT.getNumElements() == 2) { 3553 SmallVector<Register, 4> PackedRegs; 3554 Reg = B.buildBitcast(S32, Reg).getReg(0); 3555 PackedRegs.push_back(Reg); 3556 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3557 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3558 } 3559 3560 if (StoreVT.getNumElements() == 3) { 3561 SmallVector<Register, 4> PackedRegs; 3562 auto Unmerge = B.buildUnmerge(S16, Reg); 3563 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3564 PackedRegs.push_back(Unmerge.getReg(I)); 3565 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3566 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3567 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3568 } 3569 3570 if (StoreVT.getNumElements() == 4) { 3571 SmallVector<Register, 4> PackedRegs; 3572 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3573 auto Unmerge = B.buildUnmerge(S32, Reg); 3574 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3575 PackedRegs.push_back(Unmerge.getReg(I)); 3576 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3577 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3578 } 3579 3580 llvm_unreachable("invalid data type"); 3581 } 3582 3583 return Reg; 3584 } 3585 3586 Register AMDGPULegalizerInfo::fixStoreSourceType( 3587 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3588 MachineRegisterInfo *MRI = B.getMRI(); 3589 LLT Ty = MRI->getType(VData); 3590 3591 const LLT S16 = LLT::scalar(16); 3592 3593 // Fixup illegal register types for i8 stores. 3594 if (Ty == LLT::scalar(8) || Ty == S16) { 3595 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3596 return AnyExt; 3597 } 3598 3599 if (Ty.isVector()) { 3600 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3601 if (IsFormat) 3602 return handleD16VData(B, *MRI, VData); 3603 } 3604 } 3605 3606 return VData; 3607 } 3608 3609 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3610 MachineRegisterInfo &MRI, 3611 MachineIRBuilder &B, 3612 bool IsTyped, 3613 bool IsFormat) const { 3614 Register VData = MI.getOperand(1).getReg(); 3615 LLT Ty = MRI.getType(VData); 3616 LLT EltTy = Ty.getScalarType(); 3617 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3618 const LLT S32 = LLT::scalar(32); 3619 3620 VData = fixStoreSourceType(B, VData, IsFormat); 3621 Register RSrc = MI.getOperand(2).getReg(); 3622 3623 MachineMemOperand *MMO = *MI.memoperands_begin(); 3624 const int MemSize = MMO->getSize(); 3625 3626 unsigned ImmOffset; 3627 unsigned TotalOffset; 3628 3629 // The typed intrinsics add an immediate after the registers. 3630 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3631 3632 // The struct intrinsic variants add one additional operand over raw. 3633 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3634 Register VIndex; 3635 int OpOffset = 0; 3636 if (HasVIndex) { 3637 VIndex = MI.getOperand(3).getReg(); 3638 OpOffset = 1; 3639 } 3640 3641 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3642 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3643 3644 unsigned Format = 0; 3645 if (IsTyped) { 3646 Format = MI.getOperand(5 + OpOffset).getImm(); 3647 ++OpOffset; 3648 } 3649 3650 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3651 3652 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3653 if (TotalOffset != 0) 3654 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3655 3656 unsigned Opc; 3657 if (IsTyped) { 3658 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3659 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3660 } else if (IsFormat) { 3661 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3662 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3663 } else { 3664 switch (MemSize) { 3665 case 1: 3666 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3667 break; 3668 case 2: 3669 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3670 break; 3671 default: 3672 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3673 break; 3674 } 3675 } 3676 3677 if (!VIndex) 3678 VIndex = B.buildConstant(S32, 0).getReg(0); 3679 3680 auto MIB = B.buildInstr(Opc) 3681 .addUse(VData) // vdata 3682 .addUse(RSrc) // rsrc 3683 .addUse(VIndex) // vindex 3684 .addUse(VOffset) // voffset 3685 .addUse(SOffset) // soffset 3686 .addImm(ImmOffset); // offset(imm) 3687 3688 if (IsTyped) 3689 MIB.addImm(Format); 3690 3691 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3692 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3693 .addMemOperand(MMO); 3694 3695 MI.eraseFromParent(); 3696 return true; 3697 } 3698 3699 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3700 MachineRegisterInfo &MRI, 3701 MachineIRBuilder &B, 3702 bool IsFormat, 3703 bool IsTyped) const { 3704 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3705 MachineMemOperand *MMO = *MI.memoperands_begin(); 3706 const int MemSize = MMO->getSize(); 3707 const LLT S32 = LLT::scalar(32); 3708 3709 Register Dst = MI.getOperand(0).getReg(); 3710 Register RSrc = MI.getOperand(2).getReg(); 3711 3712 // The typed intrinsics add an immediate after the registers. 3713 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3714 3715 // The struct intrinsic variants add one additional operand over raw. 
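// (the vindex operand), so the raw and struct forms can be distinguished by
// operand count alone.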
3716   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3717   Register VIndex;
3718   int OpOffset = 0;
3719   if (HasVIndex) {
3720     VIndex = MI.getOperand(3).getReg();
3721     OpOffset = 1;
3722   }
3723
3724   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3725   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3726
3727   unsigned Format = 0;
3728   if (IsTyped) {
3729     Format = MI.getOperand(5 + OpOffset).getImm();
3730     ++OpOffset;
3731   }
3732
3733   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3734   unsigned ImmOffset;
3735   unsigned TotalOffset;
3736
3737   LLT Ty = MRI.getType(Dst);
3738   LLT EltTy = Ty.getScalarType();
3739   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3740   const bool Unpacked = ST.hasUnpackedD16VMem();
3741
3742   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3743   if (TotalOffset != 0)
3744     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3745
3746   unsigned Opc;
3747
3748   if (IsTyped) {
3749     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3750                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3751   } else if (IsFormat) {
3752     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3753                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3754   } else {
3755     switch (MemSize) {
3756     case 1:
3757       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3758       break;
3759     case 2:
3760       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3761       break;
3762     default:
3763       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3764       break;
3765     }
3766   }
3767
3768   Register LoadDstReg;
3769
3770   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3771   LLT UnpackedTy = Ty.changeElementSize(32);
3772
3773   if (IsExtLoad)
3774     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3775   else if (Unpacked && IsD16 && Ty.isVector())
3776     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3777   else
3778     LoadDstReg = Dst;
3779
3780   if (!VIndex)
3781     VIndex = B.buildConstant(S32, 0).getReg(0);
3782
3783   auto MIB = B.buildInstr(Opc)
3784     .addDef(LoadDstReg)         // vdata
3785     .addUse(RSrc)               // rsrc
3786     .addUse(VIndex)             // vindex
3787     .addUse(VOffset)            // voffset
3788     .addUse(SOffset)            // soffset
3789     .addImm(ImmOffset);         // offset(imm)
3790
3791   if (IsTyped)
3792     MIB.addImm(Format);
3793
3794   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3795      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3796      .addMemOperand(MMO);
3797
3798   if (LoadDstReg != Dst) {
3799     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3800
3801     // Truncate the widened result of an extending load back to the result type.
3802     if (IsExtLoad)
3803       B.buildTrunc(Dst, LoadDstReg);
3804     else {
3805       // Repack to original 16-bit vector result
3806       // FIXME: G_TRUNC should work, but legalization currently fails
3807       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3808       SmallVector<Register, 4> Repack;
3809       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3810         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3811       B.buildMerge(Dst, Repack);
3812     }
3813   }
3814
3815   MI.eraseFromParent();
3816   return true;
3817 }
3818
3819 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3820                                                MachineIRBuilder &B,
3821                                                bool IsInc) const {
3822   unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3823 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3824 B.buildInstr(Opc) 3825 .addDef(MI.getOperand(0).getReg()) 3826 .addUse(MI.getOperand(2).getReg()) 3827 .addUse(MI.getOperand(3).getReg()) 3828 .cloneMemRefs(MI); 3829 MI.eraseFromParent(); 3830 return true; 3831 } 3832 3833 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3834 switch (IntrID) { 3835 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3836 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3837 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3838 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3839 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3840 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3841 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3842 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3843 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3844 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3845 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3846 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3847 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3848 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3849 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3850 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3851 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3852 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3853 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3854 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3855 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3856 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3857 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3858 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3859 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3860 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3861 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3862 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3863 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3864 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3865 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3866 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3867 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3868 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3869 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3871 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3872 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3873 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3874 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3875 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3876 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3877 default: 3878 llvm_unreachable("unhandled atomic opcode"); 3879 } 3880 } 3881 3882 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3883 MachineIRBuilder &B, 3884 Intrinsic::ID IID) const { 3885 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3886 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3887 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3888 3889 Register Dst; 3890 3891 int OpOffset = 0; 3892 if (HasReturn) { 3893 // A few FP atomics do not support return values. 3894 Dst = MI.getOperand(0).getReg(); 3895 } else { 3896 OpOffset = -1; 3897 } 3898 3899 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3900 Register CmpVal; 3901 3902 if (IsCmpSwap) { 3903 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3904 ++OpOffset; 3905 } 3906 3907 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3908 const unsigned NumVIndexOps = (IsCmpSwap ? 
8 : 7) + HasReturn; 3909 3910 // The struct intrinsic variants add one additional operand over raw. 3911 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3912 Register VIndex; 3913 if (HasVIndex) { 3914 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3915 ++OpOffset; 3916 } 3917 3918 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3919 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3920 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3921 3922 MachineMemOperand *MMO = *MI.memoperands_begin(); 3923 3924 unsigned ImmOffset; 3925 unsigned TotalOffset; 3926 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3927 if (TotalOffset != 0) 3928 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3929 3930 if (!VIndex) 3931 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3932 3933 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3934 3935 if (HasReturn) 3936 MIB.addDef(Dst); 3937 3938 MIB.addUse(VData); // vdata 3939 3940 if (IsCmpSwap) 3941 MIB.addReg(CmpVal); 3942 3943 MIB.addUse(RSrc) // rsrc 3944 .addUse(VIndex) // vindex 3945 .addUse(VOffset) // voffset 3946 .addUse(SOffset) // soffset 3947 .addImm(ImmOffset) // offset(imm) 3948 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3949 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3950 .addMemOperand(MMO); 3951 3952 MI.eraseFromParent(); 3953 return true; 3954 } 3955 3956 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3957 /// vector with s16 typed elements. 3958 static void packImageA16AddressToDwords( 3959 MachineIRBuilder &B, MachineInstr &MI, 3960 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 3961 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 3962 const LLT S16 = LLT::scalar(16); 3963 const LLT V2S16 = LLT::vector(2, 16); 3964 3965 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 3966 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 3967 if (!SrcOp.isReg()) 3968 continue; // _L to _LZ may have eliminated this. 3969 3970 Register AddrReg = SrcOp.getReg(); 3971 3972 if (I < Intr->GradientStart) { 3973 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3974 PackedAddrs.push_back(AddrReg); 3975 } else { 3976 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3977 // derivatives dx/dh and dx/dv are packed with undef. 3978 if (((I + 1) >= EndIdx) || 3979 ((Intr->NumGradients / 2) % 2 == 1 && 3980 (I == static_cast<unsigned>(Intr->GradientStart + 3981 (Intr->NumGradients / 2) - 1) || 3982 I == static_cast<unsigned>(Intr->GradientStart + 3983 Intr->NumGradients - 1))) || 3984 // Check for _L to _LZ optimization 3985 !MI.getOperand(ArgOffset + I + 1).isReg()) { 3986 PackedAddrs.push_back( 3987 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3988 .getReg(0)); 3989 } else { 3990 PackedAddrs.push_back( 3991 B.buildBuildVector( 3992 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 3993 .getReg(0)); 3994 ++I; 3995 } 3996 } 3997 } 3998 } 3999 4000 /// Convert from separate vaddr components to a single vector address register, 4001 /// and replace the remaining operands with $noreg. 
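/// The packed address vector is written back into the first vaddr operand;
/// counts of five to seven registers are padded with undef up to eight, since
/// the intermediate sized register classes are missing, and the leftover
/// vaddr operands are then cleared to $noreg.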
4002 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4003                                      int DimIdx, int NumVAddrs) {
4004   const LLT S32 = LLT::scalar(32);
4005
4006   SmallVector<Register, 8> AddrRegs;
4007   for (int I = 0; I != NumVAddrs; ++I) {
4008     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4009     if (SrcOp.isReg()) {
4010       AddrRegs.push_back(SrcOp.getReg());
4011       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4012     }
4013   }
4014
4015   int NumAddrRegs = AddrRegs.size();
4016   if (NumAddrRegs != 1) {
4017     // Round up to 8 elements for v5-v7
4018     // FIXME: Missing intermediate sized register classes and instructions.
4019     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4020       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4021       auto Undef = B.buildUndef(S32);
4022       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4023       NumAddrRegs = RoundedNumRegs;
4024     }
4025
4026     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4027     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4028   }
4029
4030   for (int I = 1; I != NumVAddrs; ++I) {
4031     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4032     if (SrcOp.isReg())
4033       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4034   }
4035 }
4036
4037 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4038 ///
4039 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4040 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4041 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4042 /// registers.
4043 ///
4044 /// We don't want to directly select image instructions just yet, but also want
4045 /// to expose all register repacking to the legalizer/combiners. We also don't
4046 /// want a selected instruction entering RegBankSelect. In order to avoid
4047 /// defining a multitude of intermediate image instructions, directly hack on
4048 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4049 /// the now-unnecessary arguments with $noreg.
4050 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4051     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4052     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4053
4054   const unsigned NumDefs = MI.getNumExplicitDefs();
4055   const unsigned ArgOffset = NumDefs + 1;
4056   bool IsTFE = NumDefs == 2;
4057   // We are only processing the operands of d16 image operations on subtargets
4058   // that use the unpacked register layout, or need to repack the TFE result.
4059
4060   // TODO: Do we need to guard against already legalized intrinsics?
4061   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4062       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4063
4064   MachineRegisterInfo *MRI = B.getMRI();
4065   const LLT S32 = LLT::scalar(32);
4066   const LLT S16 = LLT::scalar(16);
4067   const LLT V2S16 = LLT::vector(2, 16);
4068
4069   unsigned DMask = 0;
4070
4071   // Check for 16-bit addresses and pack them if needed.
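  // Gradients and coordinates are inspected separately: g16 permits 16-bit
  // derivatives with 32-bit coordinates and only needs the G16 feature, while
  // a16 addresses additionally require the gradients to be 16 bit as well.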
4072   LLT GradTy =
4073       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4074   LLT AddrTy =
4075       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4076   const bool IsG16 = GradTy == S16;
4077   const bool IsA16 = AddrTy == S16;
4078
4079   int DMaskLanes = 0;
4080   if (!BaseOpcode->Atomic) {
4081     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4082     if (BaseOpcode->Gather4) {
4083       DMaskLanes = 4;
4084     } else if (DMask != 0) {
4085       DMaskLanes = countPopulation(DMask);
4086     } else if (!IsTFE && !BaseOpcode->Store) {
4087       // If dmask is 0, this is a no-op load. This can be eliminated.
4088       B.buildUndef(MI.getOperand(0));
4089       MI.eraseFromParent();
4090       return true;
4091     }
4092   }
4093
4094   Observer.changingInstr(MI);
4095   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4096
4097   unsigned NewOpcode = NumDefs == 0 ?
4098     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4099
4100   // Track that we legalized this.
4101   MI.setDesc(B.getTII().get(NewOpcode));
4102
4103   // We expect to get an error flag since TFC is on and dmask is 0. Force
4104   // dmask to be at least 1, otherwise the instruction will fail.
4105   if (IsTFE && DMask == 0) {
4106     DMask = 0x1;
4107     DMaskLanes = 1;
4108     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4109   }
4110
4111   if (BaseOpcode->Atomic) {
4112     Register VData0 = MI.getOperand(2).getReg();
4113     LLT Ty = MRI->getType(VData0);
4114
4115     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4116     if (Ty.isVector())
4117       return false;
4118
4119     if (BaseOpcode->AtomicX2) {
4120       Register VData1 = MI.getOperand(3).getReg();
4121       // The two values are packed in one register.
4122       LLT PackedTy = LLT::vector(2, Ty);
4123       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4124       MI.getOperand(2).setReg(Concat.getReg(0));
4125       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4126     }
4127   }
4128
4129   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4130
4131   // Optimize _L to _LZ when _L is zero
4132   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4133           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4134     const ConstantFP *ConstantLod;
4135
4136     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4137                  m_GFCst(ConstantLod))) {
4138       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4139         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4140         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4141             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4142                                                       Intr->Dim);
4143
4144         // The starting indexes should remain in the same place.
4145         --CorrectedNumVAddrs;
4146
4147         MI.getOperand(MI.getNumExplicitDefs())
4148             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4149         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4150         Intr = NewImageDimIntr;
4151       }
4152     }
4153   }
4154
4155   // Optimize _mip away when 'lod' is zero.
4156   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4157     int64_t ConstantLod;
4158     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4159                  m_ICst(ConstantLod))) {
4160       if (ConstantLod == 0) {
4161         // TODO: Change intrinsic opcode and remove operand instead of replacing
4162         // it with 0, as the _L to _LZ handling is done above.
4163         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4164         --CorrectedNumVAddrs;
4165       }
4166     }
4167   }
4168
4169   // Rewrite the addressing register layout before doing anything else.
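  // 16-bit components get packed in pairs into <2 x s16> registers. When the
  // NSA encoding is not used, the packed dwords are further concatenated into
  // a single wide vector operand and the leftover vaddr operands are replaced
  // with $noreg.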
4170 if (IsA16 || IsG16) { 4171 if (IsA16) { 4172 // Target must support the feature and gradients need to be 16 bit too 4173 if (!ST.hasA16() || !IsG16) 4174 return false; 4175 } else if (!ST.hasG16()) 4176 return false; 4177 4178 if (Intr->NumVAddrs > 1) { 4179 SmallVector<Register, 4> PackedRegs; 4180 // Don't compress addresses for G16 4181 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4182 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4183 PackEndIdx); 4184 4185 if (!IsA16) { 4186 // Add uncompressed address 4187 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4188 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4189 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4190 PackedRegs.push_back(AddrReg); 4191 } 4192 } 4193 4194 // See also below in the non-a16 branch 4195 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4196 4197 if (!UseNSA && PackedRegs.size() > 1) { 4198 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4199 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4200 PackedRegs[0] = Concat.getReg(0); 4201 PackedRegs.resize(1); 4202 } 4203 4204 const unsigned NumPacked = PackedRegs.size(); 4205 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4206 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4207 if (!SrcOp.isReg()) { 4208 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4209 continue; 4210 } 4211 4212 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4213 4214 if (I - Intr->VAddrStart < NumPacked) 4215 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4216 else 4217 SrcOp.setReg(AMDGPU::NoRegister); 4218 } 4219 } 4220 } else { 4221 // If the register allocator cannot place the address registers contiguously 4222 // without introducing moves, then using the non-sequential address encoding 4223 // is always preferable, since it saves VALU instructions and is usually a 4224 // wash in terms of code size or even better. 4225 // 4226 // However, we currently have no way of hinting to the register allocator 4227 // that MIMG addresses should be placed contiguously when it is possible to 4228 // do so, so force non-NSA for the common 2-address case as a heuristic. 4229 // 4230 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4231 // allocation when possible. 4232 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4233 4234 if (!UseNSA && Intr->NumVAddrs > 1) 4235 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4236 Intr->NumVAddrs); 4237 } 4238 4239 int Flags = 0; 4240 if (IsA16) 4241 Flags |= 1; 4242 if (IsG16) 4243 Flags |= 2; 4244 MI.addOperand(MachineOperand::CreateImm(Flags)); 4245 4246 if (BaseOpcode->Store) { // No TFE for stores? 4247 // TODO: Handle dmask trim 4248 Register VData = MI.getOperand(1).getReg(); 4249 LLT Ty = MRI->getType(VData); 4250 if (!Ty.isVector() || Ty.getElementType() != S16) 4251 return true; 4252 4253 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4254 if (RepackedReg != VData) { 4255 MI.getOperand(1).setReg(RepackedReg); 4256 } 4257 4258 return true; 4259 } 4260 4261 Register DstReg = MI.getOperand(0).getReg(); 4262 LLT Ty = MRI->getType(DstReg); 4263 const LLT EltTy = Ty.getScalarType(); 4264 const bool IsD16 = Ty.getScalarType() == S16; 4265 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
4266
4267   // Confirm that the return type is large enough for the dmask specified.
4268   if (NumElts < DMaskLanes)
4269     return false;
4270
4271   if (NumElts > 4 || DMaskLanes > 4)
4272     return false;
4273
4274   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4275   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4276
4277   // The raw dword aligned data component of the load. The only legal cases
4278   // where this matters should be when using the packed D16 format, for
4279   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4280   LLT RoundedTy;
4281
4282   // S32 vector to cover all data, plus TFE result element.
4283   LLT TFETy;
4284
4285   // Register type to use for each loaded component. Will be S32 or V2S16.
4286   LLT RegTy;
4287
4288   if (IsD16 && ST.hasUnpackedD16VMem()) {
4289     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4290     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4291     RegTy = S32;
4292   } else {
4293     unsigned EltSize = EltTy.getSizeInBits();
4294     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4295     unsigned RoundedSize = 32 * RoundedElts;
4296     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4297     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4298     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4299   }
4300
4301   // The return type does not need adjustment.
4302   // TODO: Should we change s16 case to s32 or <2 x s16>?
4303   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4304     return true;
4305
4306   Register Dst1Reg;
4307
4308   // Insert after the instruction.
4309   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4310
4311   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4312   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4313   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4314   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4315
4316   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4317
4318   MI.getOperand(0).setReg(NewResultReg);
4319
4320   // In the IR, TFE is supposed to be used with a 2 element struct return
4321   // type. The instruction really returns these two values in one contiguous
4322   // register, with one additional dword beyond the loaded data. Rewrite the
4323   // return type to use a single register result.
4324
4325   if (IsTFE) {
4326     Dst1Reg = MI.getOperand(1).getReg();
4327     if (MRI->getType(Dst1Reg) != S32)
4328       return false;
4329
4330     // TODO: Make sure the TFE operand bit is set.
4331     MI.RemoveOperand(1);
4332
4333     // Handle the easy case that requires no repack instructions.
4334     if (Ty == S32) {
4335       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4336       return true;
4337     }
4338   }
4339
4340   // Now figure out how to copy the new result register back into the old
4341   // result.
4342   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4343
4344   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4345
4346   if (ResultNumRegs == 1) {
4347     assert(!IsTFE);
4348     ResultRegs[0] = NewResultReg;
4349   } else {
4350     // We have to repack into a new vector of some kind.
4351     for (int I = 0; I != NumDataRegs; ++I)
4352       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4353     B.buildUnmerge(ResultRegs, NewResultReg);
4354
4355     // Drop the final TFE element to get the data part. The TFE result is
4356     // directly written to the right place already.
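    // (The unmerge above already wrote the trailing status dword directly
    // into Dst1Reg.)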
4357 if (IsTFE) 4358 ResultRegs.resize(NumDataRegs); 4359 } 4360 4361 // For an s16 scalar result, we form an s32 result with a truncate regardless 4362 // of packed vs. unpacked. 4363 if (IsD16 && !Ty.isVector()) { 4364 B.buildTrunc(DstReg, ResultRegs[0]); 4365 return true; 4366 } 4367 4368 // Avoid a build/concat_vector of 1 entry. 4369 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4370 B.buildBitcast(DstReg, ResultRegs[0]); 4371 return true; 4372 } 4373 4374 assert(Ty.isVector()); 4375 4376 if (IsD16) { 4377 // For packed D16 results with TFE enabled, all the data components are 4378 // S32. Cast back to the expected type. 4379 // 4380 // TODO: We don't really need to use load s32 elements. We would only need one 4381 // cast for the TFE result if a multiple of v2s16 was used. 4382 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4383 for (Register &Reg : ResultRegs) 4384 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4385 } else if (ST.hasUnpackedD16VMem()) { 4386 for (Register &Reg : ResultRegs) 4387 Reg = B.buildTrunc(S16, Reg).getReg(0); 4388 } 4389 } 4390 4391 auto padWithUndef = [&](LLT Ty, int NumElts) { 4392 if (NumElts == 0) 4393 return; 4394 Register Undef = B.buildUndef(Ty).getReg(0); 4395 for (int I = 0; I != NumElts; ++I) 4396 ResultRegs.push_back(Undef); 4397 }; 4398 4399 // Pad out any elements eliminated due to the dmask. 4400 LLT ResTy = MRI->getType(ResultRegs[0]); 4401 if (!ResTy.isVector()) { 4402 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4403 B.buildBuildVector(DstReg, ResultRegs); 4404 return true; 4405 } 4406 4407 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4408 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4409 4410 // Deal with the one annoying legal case. 4411 const LLT V3S16 = LLT::vector(3, 16); 4412 if (Ty == V3S16) { 4413 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4414 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4415 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4416 return true; 4417 } 4418 4419 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4420 B.buildConcatVectors(DstReg, ResultRegs); 4421 return true; 4422 } 4423 4424 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4425 LegalizerHelper &Helper, MachineInstr &MI) const { 4426 MachineIRBuilder &B = Helper.MIRBuilder; 4427 GISelChangeObserver &Observer = Helper.Observer; 4428 4429 Register Dst = MI.getOperand(0).getReg(); 4430 LLT Ty = B.getMRI()->getType(Dst); 4431 unsigned Size = Ty.getSizeInBits(); 4432 MachineFunction &MF = B.getMF(); 4433 4434 Observer.changingInstr(MI); 4435 4436 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4437 Ty = getBitcastRegisterType(Ty); 4438 Helper.bitcastDst(MI, Ty, 0); 4439 Dst = MI.getOperand(0).getReg(); 4440 B.setInsertPt(B.getMBB(), MI); 4441 } 4442 4443 // FIXME: We don't really need this intermediate instruction. The intrinsic 4444 // should be fixed to have a memory operand. Since it's readnone, we're not 4445 // allowed to add one. 4446 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4447 MI.RemoveOperand(1); // Remove intrinsic ID 4448 4449 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4450 // TODO: Should this use datalayout alignment? 
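  // Attach a load memory operand: the result size rounded up to whole bytes,
  // with a 4-byte alignment.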
4451 const unsigned MemSize = (Size + 7) / 8; 4452 const Align MemAlign(4); 4453 MachineMemOperand *MMO = MF.getMachineMemOperand( 4454 MachinePointerInfo(), 4455 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4456 MachineMemOperand::MOInvariant, 4457 MemSize, MemAlign); 4458 MI.addMemOperand(MF, MMO); 4459 4460 // There are no 96-bit result scalar loads, but widening to 128-bit should 4461 // always be legal. We may need to restore this to a 96-bit result if it turns 4462 // out this needs to be converted to a vector load during RegBankSelect. 4463 if (!isPowerOf2_32(Size)) { 4464 if (Ty.isVector()) 4465 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4466 else 4467 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4468 } 4469 4470 Observer.changedInstr(MI); 4471 return true; 4472 } 4473 4474 // TODO: Move to selection 4475 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4476 MachineRegisterInfo &MRI, 4477 MachineIRBuilder &B) const { 4478 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4479 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4480 !ST.isTrapHandlerEnabled()) { 4481 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4482 } else { 4483 // Pass queue pointer to trap handler as input, and insert trap instruction 4484 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4485 MachineRegisterInfo &MRI = *B.getMRI(); 4486 4487 Register LiveIn = 4488 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4489 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4490 return false; 4491 4492 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4493 B.buildCopy(SGPR01, LiveIn); 4494 B.buildInstr(AMDGPU::S_TRAP) 4495 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4496 .addReg(SGPR01, RegState::Implicit); 4497 } 4498 4499 MI.eraseFromParent(); 4500 return true; 4501 } 4502 4503 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4504 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4505 // Is non-HSA path or trap-handler disabled? then, report a warning 4506 // accordingly 4507 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4508 !ST.isTrapHandlerEnabled()) { 4509 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4510 "debugtrap handler not supported", 4511 MI.getDebugLoc(), DS_Warning); 4512 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4513 Ctx.diagnose(NoTrap); 4514 } else { 4515 // Insert debug-trap instruction 4516 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4517 } 4518 4519 MI.eraseFromParent(); 4520 return true; 4521 } 4522 4523 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4524 MachineIRBuilder &B) const { 4525 MachineRegisterInfo &MRI = *B.getMRI(); 4526 const LLT S16 = LLT::scalar(16); 4527 const LLT S32 = LLT::scalar(32); 4528 4529 Register DstReg = MI.getOperand(0).getReg(); 4530 Register NodePtr = MI.getOperand(2).getReg(); 4531 Register RayExtent = MI.getOperand(3).getReg(); 4532 Register RayOrigin = MI.getOperand(4).getReg(); 4533 Register RayDir = MI.getOperand(5).getReg(); 4534 Register RayInvDir = MI.getOperand(6).getReg(); 4535 Register TDescr = MI.getOperand(7).getReg(); 4536 4537 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4538 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4539 unsigned Opcode = IsA16 ? Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4540 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4541 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4542 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4543 4544 SmallVector<Register, 12> Ops; 4545 if (Is64) { 4546 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4547 Ops.push_back(Unmerge.getReg(0)); 4548 Ops.push_back(Unmerge.getReg(1)); 4549 } else { 4550 Ops.push_back(NodePtr); 4551 } 4552 Ops.push_back(RayExtent); 4553 4554 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4555 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4556 Ops.push_back(Unmerge.getReg(0)); 4557 Ops.push_back(Unmerge.getReg(1)); 4558 Ops.push_back(Unmerge.getReg(2)); 4559 }; 4560 4561 packLanes(RayOrigin); 4562 if (IsA16) { 4563 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4564 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4565 Register R1 = MRI.createGenericVirtualRegister(S32); 4566 Register R2 = MRI.createGenericVirtualRegister(S32); 4567 Register R3 = MRI.createGenericVirtualRegister(S32); 4568 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4569 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4570 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4571 Ops.push_back(R1); 4572 Ops.push_back(R2); 4573 Ops.push_back(R3); 4574 } else { 4575 packLanes(RayDir); 4576 packLanes(RayInvDir); 4577 } 4578 4579 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4580 .addDef(DstReg) 4581 .addImm(Opcode); 4582 4583 for (Register R : Ops) { 4584 MIB.addUse(R); 4585 } 4586 4587 MIB.addUse(TDescr) 4588 .addImm(IsA16 ? 1 : 0) 4589 .cloneMemRefs(MI); 4590 4591 MI.eraseFromParent(); 4592 return true; 4593 } 4594 4595 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4596 MachineInstr &MI) const { 4597 MachineIRBuilder &B = Helper.MIRBuilder; 4598 MachineRegisterInfo &MRI = *B.getMRI(); 4599 4600 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4601 auto IntrID = MI.getIntrinsicID(); 4602 switch (IntrID) { 4603 case Intrinsic::amdgcn_if: 4604 case Intrinsic::amdgcn_else: { 4605 MachineInstr *Br = nullptr; 4606 MachineBasicBlock *UncondBrTarget = nullptr; 4607 bool Negated = false; 4608 if (MachineInstr *BrCond = 4609 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4610 const SIRegisterInfo *TRI 4611 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4612 4613 Register Def = MI.getOperand(1).getReg(); 4614 Register Use = MI.getOperand(3).getReg(); 4615 4616 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4617 4618 if (Negated) 4619 std::swap(CondBrTarget, UncondBrTarget); 4620 4621 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4622 if (IntrID == Intrinsic::amdgcn_if) { 4623 B.buildInstr(AMDGPU::SI_IF) 4624 .addDef(Def) 4625 .addUse(Use) 4626 .addMBB(UncondBrTarget); 4627 } else { 4628 B.buildInstr(AMDGPU::SI_ELSE) 4629 .addDef(Def) 4630 .addUse(Use) 4631 .addMBB(UncondBrTarget); 4632 } 4633 4634 if (Br) { 4635 Br->getOperand(0).setMBB(CondBrTarget); 4636 } else { 4637 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4638 // since we're swapping branch targets it needs to be reinserted. 
4639 // FIXME: IRTranslator should probably not do this 4640 B.buildBr(*CondBrTarget); 4641 } 4642 4643 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4644 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4645 MI.eraseFromParent(); 4646 BrCond->eraseFromParent(); 4647 return true; 4648 } 4649 4650 return false; 4651 } 4652 case Intrinsic::amdgcn_loop: { 4653 MachineInstr *Br = nullptr; 4654 MachineBasicBlock *UncondBrTarget = nullptr; 4655 bool Negated = false; 4656 if (MachineInstr *BrCond = 4657 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4658 const SIRegisterInfo *TRI 4659 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4660 4661 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4662 Register Reg = MI.getOperand(2).getReg(); 4663 4664 if (Negated) 4665 std::swap(CondBrTarget, UncondBrTarget); 4666 4667 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4668 B.buildInstr(AMDGPU::SI_LOOP) 4669 .addUse(Reg) 4670 .addMBB(UncondBrTarget); 4671 4672 if (Br) 4673 Br->getOperand(0).setMBB(CondBrTarget); 4674 else 4675 B.buildBr(*CondBrTarget); 4676 4677 MI.eraseFromParent(); 4678 BrCond->eraseFromParent(); 4679 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4680 return true; 4681 } 4682 4683 return false; 4684 } 4685 case Intrinsic::amdgcn_kernarg_segment_ptr: 4686 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4687 // This only makes sense to call in a kernel, so just lower to null. 4688 B.buildConstant(MI.getOperand(0).getReg(), 0); 4689 MI.eraseFromParent(); 4690 return true; 4691 } 4692 4693 return legalizePreloadedArgIntrin( 4694 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4695 case Intrinsic::amdgcn_implicitarg_ptr: 4696 return legalizeImplicitArgPtr(MI, MRI, B); 4697 case Intrinsic::amdgcn_workitem_id_x: 4698 return legalizePreloadedArgIntrin(MI, MRI, B, 4699 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4700 case Intrinsic::amdgcn_workitem_id_y: 4701 return legalizePreloadedArgIntrin(MI, MRI, B, 4702 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4703 case Intrinsic::amdgcn_workitem_id_z: 4704 return legalizePreloadedArgIntrin(MI, MRI, B, 4705 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4706 case Intrinsic::amdgcn_workgroup_id_x: 4707 return legalizePreloadedArgIntrin(MI, MRI, B, 4708 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4709 case Intrinsic::amdgcn_workgroup_id_y: 4710 return legalizePreloadedArgIntrin(MI, MRI, B, 4711 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4712 case Intrinsic::amdgcn_workgroup_id_z: 4713 return legalizePreloadedArgIntrin(MI, MRI, B, 4714 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4715 case Intrinsic::amdgcn_dispatch_ptr: 4716 return legalizePreloadedArgIntrin(MI, MRI, B, 4717 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4718 case Intrinsic::amdgcn_queue_ptr: 4719 return legalizePreloadedArgIntrin(MI, MRI, B, 4720 AMDGPUFunctionArgInfo::QUEUE_PTR); 4721 case Intrinsic::amdgcn_implicit_buffer_ptr: 4722 return legalizePreloadedArgIntrin( 4723 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4724 case Intrinsic::amdgcn_dispatch_id: 4725 return legalizePreloadedArgIntrin(MI, MRI, B, 4726 AMDGPUFunctionArgInfo::DISPATCH_ID); 4727 case Intrinsic::amdgcn_fdiv_fast: 4728 return legalizeFDIVFastIntrin(MI, MRI, B); 4729 case Intrinsic::amdgcn_is_shared: 4730 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4731 case Intrinsic::amdgcn_is_private: 4732 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4733 case Intrinsic::amdgcn_wavefrontsize: { 4734 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4735 MI.eraseFromParent(); 4736 return true; 4737 } 4738 case Intrinsic::amdgcn_s_buffer_load: 4739 return legalizeSBufferLoad(Helper, MI); 4740 case Intrinsic::amdgcn_raw_buffer_store: 4741 case Intrinsic::amdgcn_struct_buffer_store: 4742 return legalizeBufferStore(MI, MRI, B, false, false); 4743 case Intrinsic::amdgcn_raw_buffer_store_format: 4744 case Intrinsic::amdgcn_struct_buffer_store_format: 4745 return legalizeBufferStore(MI, MRI, B, false, true); 4746 case Intrinsic::amdgcn_raw_tbuffer_store: 4747 case Intrinsic::amdgcn_struct_tbuffer_store: 4748 return legalizeBufferStore(MI, MRI, B, true, true); 4749 case Intrinsic::amdgcn_raw_buffer_load: 4750 case Intrinsic::amdgcn_struct_buffer_load: 4751 return legalizeBufferLoad(MI, MRI, B, false, false); 4752 case Intrinsic::amdgcn_raw_buffer_load_format: 4753 case Intrinsic::amdgcn_struct_buffer_load_format: 4754 return legalizeBufferLoad(MI, MRI, B, true, false); 4755 case Intrinsic::amdgcn_raw_tbuffer_load: 4756 case Intrinsic::amdgcn_struct_tbuffer_load: 4757 return legalizeBufferLoad(MI, MRI, B, true, true); 4758 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4759 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4760 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4761 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4762 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4763 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4764 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4765 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4766 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4767 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4768 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4769 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4770 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4771 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4772 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4773 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4774 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4775 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4776 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4777 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4778 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4779 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4780 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4781 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4782 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4783 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4784 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4785 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4786 return legalizeBufferAtomic(MI, B, IntrID); 4787 case Intrinsic::amdgcn_atomic_inc: 4788 return legalizeAtomicIncDec(MI, B, true); 4789 case Intrinsic::amdgcn_atomic_dec: 4790 return legalizeAtomicIncDec(MI, B, false); 4791 case Intrinsic::trap: 4792 return legalizeTrapIntrinsic(MI, MRI, B); 4793 case Intrinsic::debugtrap: 4794 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4795 case Intrinsic::amdgcn_rsq_clamp: 4796 return legalizeRsqClampIntrinsic(MI, MRI, B); 4797 case Intrinsic::amdgcn_ds_fadd: 4798 case Intrinsic::amdgcn_ds_fmin: 4799 case Intrinsic::amdgcn_ds_fmax: 4800 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4801 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4802 return legalizeBVHIntrinsic(MI, B); 4803 default: { 4804 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4805 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4806 return legalizeImageIntrinsic(MI, B, Helper.Observer, 
ImageDimIntr); 4807 return true; 4808 } 4809 } 4810 4811 return true; 4812 } 4813