//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
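
// For example, getPow2VectorType turns <3 x s16> into <4 x s16>, and
// getPow2ScalarType turns s48 into s64.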

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}
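
// For example, bitcastToRegisterType(0) maps <4 x s8> to s32 and <8 x s16> to
// <4 x s32>, while bitcastToVectorElement32(0) maps a 96-bit type to <3 x s32>.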

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only care
// about the size. Return true in cases where we will work around this for now by
// bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}
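
// For example, under these rules an s32 result fed by an 8-bit or 16-bit
// global memory access is an accepted extending load (given sufficient
// alignment), while a 96-bit access is only accepted on subtargets with
// dwordx3 load/stores.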

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };
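
  // For example, shouldWidenLoadResult accepts a 96-bit result on subtargets
  // without dwordx3 load/stores when the access is at least 128-bit aligned
  // and the address space permits a 128-bit access.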
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
As-is it 1113 // should be OK, since the new parts will be further legalized. 1114 unsigned FloorSize = PowerOf2Floor(DstSize); 1115 return std::make_pair(0, LLT::scalar(FloorSize)); 1116 } 1117 1118 if (DstSize > 32 && (DstSize % 32 != 0)) { 1119 // FIXME: Need a way to specify non-extload of larger size if 1120 // suitably aligned. 1121 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1122 } 1123 1124 unsigned MaxSize = maxSizeForAddrSpace(ST, 1125 PtrTy.getAddressSpace(), 1126 Op == G_LOAD); 1127 if (MemSize > MaxSize) 1128 return std::make_pair(0, LLT::scalar(MaxSize)); 1129 1130 unsigned Align = Query.MMODescrs[0].AlignInBits; 1131 return std::make_pair(0, LLT::scalar(Align)); 1132 }) 1133 .fewerElementsIf( 1134 [=](const LegalityQuery &Query) -> bool { 1135 return Query.Types[0].isVector() && 1136 needToSplitMemOp(Query, Op == G_LOAD); 1137 }, 1138 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1139 const LLT DstTy = Query.Types[0]; 1140 const LLT PtrTy = Query.Types[1]; 1141 1142 LLT EltTy = DstTy.getElementType(); 1143 unsigned MaxSize = maxSizeForAddrSpace(ST, 1144 PtrTy.getAddressSpace(), 1145 Op == G_LOAD); 1146 1147 // FIXME: Handle widened to power of 2 results better. This ends 1148 // up scalarizing. 1149 // FIXME: 3 element stores scalarized on SI 1150 1151 // Split if it's too large for the address space. 1152 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1153 unsigned NumElts = DstTy.getNumElements(); 1154 unsigned EltSize = EltTy.getSizeInBits(); 1155 1156 if (MaxSize % EltSize == 0) { 1157 return std::make_pair( 1158 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1159 } 1160 1161 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1162 1163 // FIXME: Refine when odd breakdowns handled 1164 // The scalars will need to be re-legalized. 1165 if (NumPieces == 1 || NumPieces >= NumElts || 1166 NumElts % NumPieces != 0) 1167 return std::make_pair(0, EltTy); 1168 1169 return std::make_pair(0, 1170 LLT::vector(NumElts / NumPieces, EltTy)); 1171 } 1172 1173 // FIXME: We could probably handle weird extending loads better. 1174 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1175 if (DstTy.getSizeInBits() > MemSize) 1176 return std::make_pair(0, EltTy); 1177 1178 unsigned EltSize = EltTy.getSizeInBits(); 1179 unsigned DstSize = DstTy.getSizeInBits(); 1180 if (!isPowerOf2_32(DstSize)) { 1181 // We're probably decomposing an odd sized store. Try to split 1182 // to the widest type. TODO: Account for alignment. As-is it 1183 // should be OK, since the new parts will be further legalized. 1184 unsigned FloorSize = PowerOf2Floor(DstSize); 1185 return std::make_pair( 1186 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1187 } 1188 1189 // Need to split because of alignment. 1190 unsigned Align = Query.MMODescrs[0].AlignInBits; 1191 if (EltSize > Align && 1192 (EltSize / Align < DstTy.getNumElements())) { 1193 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1194 } 1195 1196 // May need relegalization for the scalars. 1197 return std::make_pair(0, EltTy); 1198 }) 1199 .minScalar(0, S32); 1200 1201 if (IsStore) 1202 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1203 1204 // TODO: Need a bitcast lower option? 
    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
                 bitcastToVectorElement32(1))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(1, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
It's not 1471 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1472 // valid. 1473 .clampScalar(LitTyIdx, S32, S512) 1474 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1475 // Break up vectors with weird elements into scalars 1476 .fewerElementsIf( 1477 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1478 scalarize(0)) 1479 .fewerElementsIf( 1480 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1481 scalarize(1)) 1482 .clampScalar(BigTyIdx, S32, MaxScalar); 1483 1484 if (Op == G_MERGE_VALUES) { 1485 Builder.widenScalarIf( 1486 // TODO: Use 16-bit shifts if legal for 8-bit values? 1487 [=](const LegalityQuery &Query) { 1488 const LLT Ty = Query.Types[LitTyIdx]; 1489 return Ty.getSizeInBits() < 32; 1490 }, 1491 changeTo(LitTyIdx, S32)); 1492 } 1493 1494 Builder.widenScalarIf( 1495 [=](const LegalityQuery &Query) { 1496 const LLT Ty = Query.Types[BigTyIdx]; 1497 return !isPowerOf2_32(Ty.getSizeInBits()) && 1498 Ty.getSizeInBits() % 16 != 0; 1499 }, 1500 [=](const LegalityQuery &Query) { 1501 // Pick the next power of 2, or a multiple of 64 over 128. 1502 // Whichever is smaller. 1503 const LLT &Ty = Query.Types[BigTyIdx]; 1504 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1505 if (NewSizeInBits >= 256) { 1506 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1507 if (RoundedTo < NewSizeInBits) 1508 NewSizeInBits = RoundedTo; 1509 } 1510 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1511 }) 1512 .legalIf([=](const LegalityQuery &Query) { 1513 const LLT &BigTy = Query.Types[BigTyIdx]; 1514 const LLT &LitTy = Query.Types[LitTyIdx]; 1515 1516 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1517 return false; 1518 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1519 return false; 1520 1521 return BigTy.getSizeInBits() % 16 == 0 && 1522 LitTy.getSizeInBits() % 16 == 0 && 1523 BigTy.getSizeInBits() <= MaxRegisterSize; 1524 }) 1525 // Any vectors left are the wrong size. Scalarize them. 1526 .scalarize(0) 1527 .scalarize(1); 1528 } 1529 1530 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1531 // RegBankSelect. 1532 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1533 .legalFor({{S32}, {S64}}); 1534 1535 if (ST.hasVOP3PInsts()) { 1536 SextInReg.lowerFor({{V2S16}}) 1537 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1538 // get more vector shift opportunities, since we'll get those when 1539 // expanded. 1540 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1541 } else if (ST.has16BitInsts()) { 1542 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1543 } else { 1544 // Prefer to promote to s32 before lowering if we don't have 16-bit 1545 // shifts. This avoid a lot of intermediate truncate and extend operations. 
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
1655 return legalizeBuildVector(MI, MRI, B); 1656 default: 1657 return false; 1658 } 1659 1660 llvm_unreachable("expected switch to return"); 1661 } 1662 1663 Register AMDGPULegalizerInfo::getSegmentAperture( 1664 unsigned AS, 1665 MachineRegisterInfo &MRI, 1666 MachineIRBuilder &B) const { 1667 MachineFunction &MF = B.getMF(); 1668 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1669 const LLT S32 = LLT::scalar(32); 1670 1671 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1672 1673 if (ST.hasApertureRegs()) { 1674 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1675 // getreg. 1676 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1677 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1678 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1679 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1680 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1681 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1682 unsigned Encoding = 1683 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1684 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1685 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1686 1687 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1688 1689 B.buildInstr(AMDGPU::S_GETREG_B32) 1690 .addDef(GetReg) 1691 .addImm(Encoding); 1692 MRI.setType(GetReg, S32); 1693 1694 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1695 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1696 } 1697 1698 Register QueuePtr = MRI.createGenericVirtualRegister( 1699 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1700 1701 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1702 return Register(); 1703 1704 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1705 // private_segment_aperture_base_hi. 1706 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1707 1708 // TODO: can we be smarter about machine pointer info? 1709 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1710 MachineMemOperand *MMO = MF.getMachineMemOperand( 1711 PtrInfo, 1712 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1713 MachineMemOperand::MOInvariant, 1714 4, commonAlignment(Align(64), StructOffset)); 1715 1716 Register LoadAddr; 1717 1718 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1719 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1720 } 1721 1722 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1723 MachineInstr &MI, MachineRegisterInfo &MRI, 1724 MachineIRBuilder &B) const { 1725 MachineFunction &MF = B.getMF(); 1726 1727 const LLT S32 = LLT::scalar(32); 1728 Register Dst = MI.getOperand(0).getReg(); 1729 Register Src = MI.getOperand(1).getReg(); 1730 1731 LLT DstTy = MRI.getType(Dst); 1732 LLT SrcTy = MRI.getType(Src); 1733 unsigned DestAS = DstTy.getAddressSpace(); 1734 unsigned SrcAS = SrcTy.getAddressSpace(); 1735 1736 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1737 // vector element. 1738 assert(!DstTy.isVector()); 1739 1740 const AMDGPUTargetMachine &TM 1741 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1742 1743 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1744 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1745 return true; 1746 } 1747 1748 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1749 // Truncate. 
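// Only the low 32 bits of the source pointer are meaningful in the 32-bit
// constant address space, so just extract them starting at bit 0.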
1750 B.buildExtract(Dst, Src, 0); 1751 MI.eraseFromParent(); 1752 return true; 1753 } 1754 1755 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1756 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1757 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1758 1759 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1760 // another. Merge operands are required to be the same type, but creating an 1761 // extra ptrtoint would be kind of pointless. 1762 auto HighAddr = B.buildConstant( 1763 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1764 B.buildMerge(Dst, {Src, HighAddr}); 1765 MI.eraseFromParent(); 1766 return true; 1767 } 1768 1769 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1770 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1771 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1772 unsigned NullVal = TM.getNullPointerValue(DestAS); 1773 1774 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1775 auto FlatNull = B.buildConstant(SrcTy, 0); 1776 1777 // Extract low 32-bits of the pointer. 1778 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1779 1780 auto CmpRes = 1781 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1782 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1783 1784 MI.eraseFromParent(); 1785 return true; 1786 } 1787 1788 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1789 return false; 1790 1791 if (!ST.hasFlatAddressSpace()) 1792 return false; 1793 1794 auto SegmentNull = 1795 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1796 auto FlatNull = 1797 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1798 1799 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1800 if (!ApertureReg.isValid()) 1801 return false; 1802 1803 auto CmpRes = 1804 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1805 1806 // Coerce the type of the low half of the result so we can use merge_values. 1807 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1808 1809 // TODO: Should we allow mismatched types but matching sizes in merges to 1810 // avoid the ptrtoint? 1811 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1812 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1813 1814 MI.eraseFromParent(); 1815 return true; 1816 } 1817 1818 bool AMDGPULegalizerInfo::legalizeFrint( 1819 MachineInstr &MI, MachineRegisterInfo &MRI, 1820 MachineIRBuilder &B) const { 1821 Register Src = MI.getOperand(1).getReg(); 1822 LLT Ty = MRI.getType(Src); 1823 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1824 1825 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1826 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1827 1828 auto C1 = B.buildFConstant(Ty, C1Val); 1829 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1830 1831 // TODO: Should this propagate fast-math-flags? 
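// rint(x) is implemented with the usual 2^52 trick: adding and then
// subtracting a value of magnitude 2^52 (carrying the sign of the source)
// pushes the fractional bits out of the 52-bit mantissa, rounding to an
// integer in the current rounding mode. C2 (2^52 - 0.5) is the cutoff above
// which the input can have no fractional part and is returned unchanged.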
1832 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1833 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1834
1835 auto C2 = B.buildFConstant(Ty, C2Val);
1836 auto Fabs = B.buildFAbs(Ty, Src);
1837
1838 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1839 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1840 MI.eraseFromParent();
1841 return true;
1842 }
1843
1844 bool AMDGPULegalizerInfo::legalizeFceil(
1845 MachineInstr &MI, MachineRegisterInfo &MRI,
1846 MachineIRBuilder &B) const {
1847
1848 const LLT S1 = LLT::scalar(1);
1849 const LLT S64 = LLT::scalar(64);
1850
1851 Register Src = MI.getOperand(1).getReg();
1852 assert(MRI.getType(Src) == S64);
1853
1854 // result = trunc(src)
1855 // if (src > 0.0 && src != result)
1856 // result += 1.0
1857
1858 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1859
1860 const auto Zero = B.buildFConstant(S64, 0.0);
1861 const auto One = B.buildFConstant(S64, 1.0);
1862 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1863 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1864 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1865 auto Add = B.buildSelect(S64, And, One, Zero);
1866
1867 // TODO: Should this propagate fast-math-flags?
1868 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
MI.eraseFromParent();
1869 return true;
1870 }
1871
1872 static MachineInstrBuilder extractF64Exponent(Register Hi,
1873 MachineIRBuilder &B) {
1874 const unsigned FractBits = 52;
1875 const unsigned ExpBits = 11;
1876 LLT S32 = LLT::scalar(32);
1877
1878 auto Const0 = B.buildConstant(S32, FractBits - 32);
1879 auto Const1 = B.buildConstant(S32, ExpBits);
1880
1881 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1882 .addUse(Hi)
1883 .addUse(Const0.getReg(0))
1884 .addUse(Const1.getReg(0));
1885
1886 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1887 }
1888
1889 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1890 MachineInstr &MI, MachineRegisterInfo &MRI,
1891 MachineIRBuilder &B) const {
1892 const LLT S1 = LLT::scalar(1);
1893 const LLT S32 = LLT::scalar(32);
1894 const LLT S64 = LLT::scalar(64);
1895
1896 Register Src = MI.getOperand(1).getReg();
1897 assert(MRI.getType(Src) == S64);
1898
1899 // TODO: Should this use extract since the low half is unused?
1900 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1901 Register Hi = Unmerge.getReg(1);
1902
1903 // Extract the upper half, since this is where we will find the sign and
1904 // exponent.
1905 auto Exp = extractF64Exponent(Hi, B);
1906
1907 const unsigned FractBits = 52;
1908
1909 // Extract the sign bit.
1910 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1911 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1912
1913 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1914
1915 const auto Zero32 = B.buildConstant(S32, 0);
1916
1917 // Extend back to 64-bits.
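// The widened sign bit doubles as the +/-0.0 result for the small-exponent
// case. The truncation itself is done by masking off fractional mantissa
// bits based on the unbiased exponent:
//   exp < 0   -> |src| < 1.0, the result is +/-0.0 (the sign bit alone)
//   exp > 51  -> src has no fractional bits, return it unchanged
//   otherwise -> clear the low (52 - exp) fraction bits of src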
1918 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1919 1920 auto Shr = B.buildAShr(S64, FractMask, Exp); 1921 auto Not = B.buildNot(S64, Shr); 1922 auto Tmp0 = B.buildAnd(S64, Src, Not); 1923 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1924 1925 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1926 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1927 1928 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1929 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1930 MI.eraseFromParent(); 1931 return true; 1932 } 1933 1934 bool AMDGPULegalizerInfo::legalizeITOFP( 1935 MachineInstr &MI, MachineRegisterInfo &MRI, 1936 MachineIRBuilder &B, bool Signed) const { 1937 1938 Register Dst = MI.getOperand(0).getReg(); 1939 Register Src = MI.getOperand(1).getReg(); 1940 1941 const LLT S64 = LLT::scalar(64); 1942 const LLT S32 = LLT::scalar(32); 1943 1944 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1945 1946 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1947 1948 auto CvtHi = Signed ? 1949 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1950 B.buildUITOFP(S64, Unmerge.getReg(1)); 1951 1952 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1953 1954 auto ThirtyTwo = B.buildConstant(S32, 32); 1955 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1956 .addUse(CvtHi.getReg(0)) 1957 .addUse(ThirtyTwo.getReg(0)); 1958 1959 // TODO: Should this propagate fast-math-flags? 1960 B.buildFAdd(Dst, LdExp, CvtLo); 1961 MI.eraseFromParent(); 1962 return true; 1963 } 1964 1965 // TODO: Copied from DAG implementation. Verify logic and document how this 1966 // actually works. 1967 bool AMDGPULegalizerInfo::legalizeFPTOI( 1968 MachineInstr &MI, MachineRegisterInfo &MRI, 1969 MachineIRBuilder &B, bool Signed) const { 1970 1971 Register Dst = MI.getOperand(0).getReg(); 1972 Register Src = MI.getOperand(1).getReg(); 1973 1974 const LLT S64 = LLT::scalar(64); 1975 const LLT S32 = LLT::scalar(32); 1976 1977 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1978 1979 unsigned Flags = MI.getFlags(); 1980 1981 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1982 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1983 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1984 1985 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1986 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1987 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1988 1989 auto Hi = Signed ? 
1990 B.buildFPTOSI(S32, FloorMul) : 1991 B.buildFPTOUI(S32, FloorMul); 1992 auto Lo = B.buildFPTOUI(S32, Fma); 1993 1994 B.buildMerge(Dst, { Lo, Hi }); 1995 MI.eraseFromParent(); 1996 1997 return true; 1998 } 1999 2000 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2001 MachineInstr &MI) const { 2002 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2003 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2004 2005 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2006 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2007 2008 // With ieee_mode disabled, the instructions have the correct behavior 2009 // already for G_FMINNUM/G_FMAXNUM 2010 if (!MFI->getMode().IEEE) 2011 return !IsIEEEOp; 2012 2013 if (IsIEEEOp) 2014 return true; 2015 2016 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2017 } 2018 2019 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2020 MachineInstr &MI, MachineRegisterInfo &MRI, 2021 MachineIRBuilder &B) const { 2022 // TODO: Should move some of this into LegalizerHelper. 2023 2024 // TODO: Promote dynamic indexing of s16 to s32 2025 2026 // FIXME: Artifact combiner probably should have replaced the truncated 2027 // constant before this, so we shouldn't need 2028 // getConstantVRegValWithLookThrough. 2029 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2030 MI.getOperand(2).getReg(), MRI); 2031 if (!IdxVal) // Dynamic case will be selected to register indexing. 2032 return true; 2033 2034 Register Dst = MI.getOperand(0).getReg(); 2035 Register Vec = MI.getOperand(1).getReg(); 2036 2037 LLT VecTy = MRI.getType(Vec); 2038 LLT EltTy = VecTy.getElementType(); 2039 assert(EltTy == MRI.getType(Dst)); 2040 2041 if (IdxVal->Value < VecTy.getNumElements()) 2042 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2043 else 2044 B.buildUndef(Dst); 2045 2046 MI.eraseFromParent(); 2047 return true; 2048 } 2049 2050 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2051 MachineInstr &MI, MachineRegisterInfo &MRI, 2052 MachineIRBuilder &B) const { 2053 // TODO: Should move some of this into LegalizerHelper. 2054 2055 // TODO: Promote dynamic indexing of s16 to s32 2056 2057 // FIXME: Artifact combiner probably should have replaced the truncated 2058 // constant before this, so we shouldn't need 2059 // getConstantVRegValWithLookThrough. 2060 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2061 MI.getOperand(3).getReg(), MRI); 2062 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2063 return true; 2064 2065 Register Dst = MI.getOperand(0).getReg(); 2066 Register Vec = MI.getOperand(1).getReg(); 2067 Register Ins = MI.getOperand(2).getReg(); 2068 2069 LLT VecTy = MRI.getType(Vec); 2070 LLT EltTy = VecTy.getElementType(); 2071 assert(EltTy == MRI.getType(Ins)); 2072 2073 if (IdxVal->Value < VecTy.getNumElements()) 2074 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2075 else 2076 B.buildUndef(Dst); 2077 2078 MI.eraseFromParent(); 2079 return true; 2080 } 2081 2082 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2083 MachineInstr &MI, MachineRegisterInfo &MRI, 2084 MachineIRBuilder &B) const { 2085 const LLT V2S16 = LLT::vector(2, 16); 2086 2087 Register Dst = MI.getOperand(0).getReg(); 2088 Register Src0 = MI.getOperand(1).getReg(); 2089 LLT DstTy = MRI.getType(Dst); 2090 LLT SrcTy = MRI.getType(Src0); 2091 2092 if (SrcTy == V2S16 && DstTy == V2S16 && 2093 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2094 return true; 2095 2096 MachineIRBuilder HelperBuilder(MI); 2097 GISelObserverWrapper DummyObserver; 2098 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2099 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2100 } 2101 2102 bool AMDGPULegalizerInfo::legalizeSinCos( 2103 MachineInstr &MI, MachineRegisterInfo &MRI, 2104 MachineIRBuilder &B) const { 2105 2106 Register DstReg = MI.getOperand(0).getReg(); 2107 Register SrcReg = MI.getOperand(1).getReg(); 2108 LLT Ty = MRI.getType(DstReg); 2109 unsigned Flags = MI.getFlags(); 2110 2111 Register TrigVal; 2112 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2113 if (ST.hasTrigReducedRange()) { 2114 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2115 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2116 .addUse(MulVal.getReg(0)) 2117 .setMIFlags(Flags).getReg(0); 2118 } else 2119 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2120 2121 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2122 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2123 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2124 .addUse(TrigVal) 2125 .setMIFlags(Flags); 2126 MI.eraseFromParent(); 2127 return true; 2128 } 2129 2130 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2131 MachineIRBuilder &B, 2132 const GlobalValue *GV, 2133 int64_t Offset, 2134 unsigned GAFlags) const { 2135 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2136 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2137 // to the following code sequence: 2138 // 2139 // For constant address space: 2140 // s_getpc_b64 s[0:1] 2141 // s_add_u32 s0, s0, $symbol 2142 // s_addc_u32 s1, s1, 0 2143 // 2144 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2145 // a fixup or relocation is emitted to replace $symbol with a literal 2146 // constant, which is a pc-relative offset from the encoding of the $symbol 2147 // operand to the global variable. 
2148 // 2149 // For global address space: 2150 // s_getpc_b64 s[0:1] 2151 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2152 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2153 // 2154 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2155 // fixups or relocations are emitted to replace $symbol@*@lo and 2156 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2157 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2158 // operand to the global variable. 2159 // 2160 // What we want here is an offset from the value returned by s_getpc 2161 // (which is the address of the s_add_u32 instruction) to the global 2162 // variable, but since the encoding of $symbol starts 4 bytes after the start 2163 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2164 // small. This requires us to add 4 to the global variable offset in order to 2165 // compute the correct address. 2166 2167 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2168 2169 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2170 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2171 2172 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2173 .addDef(PCReg); 2174 2175 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2176 if (GAFlags == SIInstrInfo::MO_NONE) 2177 MIB.addImm(0); 2178 else 2179 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2180 2181 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2182 2183 if (PtrTy.getSizeInBits() == 32) 2184 B.buildExtract(DstReg, PCReg, 0); 2185 return true; 2186 } 2187 2188 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2189 MachineInstr &MI, MachineRegisterInfo &MRI, 2190 MachineIRBuilder &B) const { 2191 Register DstReg = MI.getOperand(0).getReg(); 2192 LLT Ty = MRI.getType(DstReg); 2193 unsigned AS = Ty.getAddressSpace(); 2194 2195 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2196 MachineFunction &MF = B.getMF(); 2197 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2198 2199 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2200 if (!MFI->isEntryFunction()) { 2201 const Function &Fn = MF.getFunction(); 2202 DiagnosticInfoUnsupported BadLDSDecl( 2203 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2204 DS_Warning); 2205 Fn.getContext().diagnose(BadLDSDecl); 2206 2207 // We currently don't have a way to correctly allocate LDS objects that 2208 // aren't directly associated with a kernel. We do force inlining of 2209 // functions that use local objects. However, if these dead functions are 2210 // not eliminated, we don't want a compile time error. Just emit a warning 2211 // and a trap, since there should be no callable path here. 2212 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2213 B.buildUndef(DstReg); 2214 MI.eraseFromParent(); 2215 return true; 2216 } 2217 2218 // TODO: We could emit code to handle the initialization somewhere. 
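// An LDS global has no backing memory that could hold an initializer; all we
// can do is statically assign it an offset in the kernel's LDS block, which
// is what allocateLDSGlobal below hands back as the "address".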
2219 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2220 const SITargetLowering *TLI = ST.getTargetLowering();
2221 if (!TLI->shouldUseLDSConstAddress(GV)) {
2222 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2223 return true; // Leave in place.
2224 }
2225
2226 B.buildConstant(
2227 DstReg,
2228 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2229 MI.eraseFromParent();
2230 return true;
2231 }
2232
2233 const Function &Fn = MF.getFunction();
2234 DiagnosticInfoUnsupported BadInit(
2235 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2236 Fn.getContext().diagnose(BadInit);
2237 return true;
2238 }
2239
2240 const SITargetLowering *TLI = ST.getTargetLowering();
2241
2242 if (TLI->shouldEmitFixup(GV)) {
2243 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2244 MI.eraseFromParent();
2245 return true;
2246 }
2247
2248 if (TLI->shouldEmitPCReloc(GV)) {
2249 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2250 MI.eraseFromParent();
2251 return true;
2252 }
2253
2254 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2255 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2256
2257 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2258 MachinePointerInfo::getGOT(MF),
2259 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2260 MachineMemOperand::MOInvariant,
2261 8 /*Size*/, Align(8));
2262
2263 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2264
2265 if (Ty.getSizeInBits() == 32) {
2266 // Truncate if this is a 32-bit constant address.
2267 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2268 B.buildExtract(DstReg, Load, 0);
2269 } else
2270 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2271
2272 MI.eraseFromParent();
2273 return true;
2274 }
2275
2276 bool AMDGPULegalizerInfo::legalizeLoad(
2277 MachineInstr &MI, MachineRegisterInfo &MRI,
2278 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2279 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2280 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2281 Observer.changingInstr(MI);
2282 MI.getOperand(1).setReg(Cast.getReg(0));
2283 Observer.changedInstr(MI);
2284 return true;
2285 }
2286
2287 bool AMDGPULegalizerInfo::legalizeFMad(
2288 MachineInstr &MI, MachineRegisterInfo &MRI,
2289 MachineIRBuilder &B) const {
2290 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2291 assert(Ty.isScalar());
2292
2293 MachineFunction &MF = B.getMF();
2294 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2295
2296 // TODO: Always legal with future ftz flag.
2297 // FIXME: Do we need just output?
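// Keep G_FMAD only when denormals are flushed for the type; the mad/mac
// instructions it selects to flush denormals, so with denormals enabled it
// is lowered to a separate fmul and fadd instead.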
2298 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2299 return true; 2300 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2301 return true; 2302 2303 MachineIRBuilder HelperBuilder(MI); 2304 GISelObserverWrapper DummyObserver; 2305 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2306 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2307 } 2308 2309 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2310 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2311 Register DstReg = MI.getOperand(0).getReg(); 2312 Register PtrReg = MI.getOperand(1).getReg(); 2313 Register CmpVal = MI.getOperand(2).getReg(); 2314 Register NewVal = MI.getOperand(3).getReg(); 2315 2316 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2317 "this should not have been custom lowered"); 2318 2319 LLT ValTy = MRI.getType(CmpVal); 2320 LLT VecTy = LLT::vector(2, ValTy); 2321 2322 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2323 2324 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2325 .addDef(DstReg) 2326 .addUse(PtrReg) 2327 .addUse(PackedVal) 2328 .setMemRefs(MI.memoperands()); 2329 2330 MI.eraseFromParent(); 2331 return true; 2332 } 2333 2334 bool AMDGPULegalizerInfo::legalizeFlog( 2335 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2336 Register Dst = MI.getOperand(0).getReg(); 2337 Register Src = MI.getOperand(1).getReg(); 2338 LLT Ty = B.getMRI()->getType(Dst); 2339 unsigned Flags = MI.getFlags(); 2340 2341 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2342 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2343 2344 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2345 MI.eraseFromParent(); 2346 return true; 2347 } 2348 2349 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2350 MachineIRBuilder &B) const { 2351 Register Dst = MI.getOperand(0).getReg(); 2352 Register Src = MI.getOperand(1).getReg(); 2353 unsigned Flags = MI.getFlags(); 2354 LLT Ty = B.getMRI()->getType(Dst); 2355 2356 auto K = B.buildFConstant(Ty, numbers::log2e); 2357 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2358 B.buildFExp2(Dst, Mul, Flags); 2359 MI.eraseFromParent(); 2360 return true; 2361 } 2362 2363 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2364 MachineIRBuilder &B) const { 2365 Register Dst = MI.getOperand(0).getReg(); 2366 Register Src0 = MI.getOperand(1).getReg(); 2367 Register Src1 = MI.getOperand(2).getReg(); 2368 unsigned Flags = MI.getFlags(); 2369 LLT Ty = B.getMRI()->getType(Dst); 2370 const LLT S16 = LLT::scalar(16); 2371 const LLT S32 = LLT::scalar(32); 2372 2373 if (Ty == S32) { 2374 auto Log = B.buildFLog2(S32, Src0, Flags); 2375 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2376 .addUse(Log.getReg(0)) 2377 .addUse(Src1) 2378 .setMIFlags(Flags); 2379 B.buildFExp2(Dst, Mul, Flags); 2380 } else if (Ty == S16) { 2381 // There's no f16 fmul_legacy, so we need to convert for it. 
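// As with f32, pow(x, y) is expanded to exp2(y * log2(x)). The multiply uses
// the legacy semantics (0 * anything == +0.0), but has to be done in f32 and
// truncated back to f16 before the exp2.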
2382 auto Log = B.buildFLog2(S16, Src0, Flags); 2383 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2384 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2385 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2386 .addUse(Ext0.getReg(0)) 2387 .addUse(Ext1.getReg(0)) 2388 .setMIFlags(Flags); 2389 2390 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2391 } else 2392 return false; 2393 2394 MI.eraseFromParent(); 2395 return true; 2396 } 2397 2398 // Find a source register, ignoring any possible source modifiers. 2399 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2400 Register ModSrc = OrigSrc; 2401 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2402 ModSrc = SrcFNeg->getOperand(1).getReg(); 2403 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2404 ModSrc = SrcFAbs->getOperand(1).getReg(); 2405 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2406 ModSrc = SrcFAbs->getOperand(1).getReg(); 2407 return ModSrc; 2408 } 2409 2410 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2411 MachineRegisterInfo &MRI, 2412 MachineIRBuilder &B) const { 2413 2414 const LLT S1 = LLT::scalar(1); 2415 const LLT S64 = LLT::scalar(64); 2416 Register Dst = MI.getOperand(0).getReg(); 2417 Register OrigSrc = MI.getOperand(1).getReg(); 2418 unsigned Flags = MI.getFlags(); 2419 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2420 "this should not have been custom lowered"); 2421 2422 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2423 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2424 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2425 // V_FRACT bug is: 2426 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2427 // 2428 // Convert floor(x) to (x - fract(x)) 2429 2430 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2431 .addUse(OrigSrc) 2432 .setMIFlags(Flags); 2433 2434 // Give source modifier matching some assistance before obscuring a foldable 2435 // pattern. 2436 2437 // TODO: We can avoid the neg on the fract? The input sign to fract 2438 // shouldn't matter? 2439 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2440 2441 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2442 2443 Register Min = MRI.createGenericVirtualRegister(S64); 2444 2445 // We don't need to concern ourselves with the snan handling difference, so 2446 // use the one which will directly select. 2447 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2448 if (MFI->getMode().IEEE) 2449 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2450 else 2451 B.buildFMinNum(Min, Fract, Const, Flags); 2452 2453 Register CorrectedFract = Min; 2454 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2455 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2456 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2457 } 2458 2459 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2460 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2461 2462 MI.eraseFromParent(); 2463 return true; 2464 } 2465 2466 // Turn an illegal packed v2s16 build vector into bit operations. 2467 // TODO: This should probably be a bitcast action in LegalizerHelper. 
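// The produced sequence is simply:
//   %merge:_(s32) = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge(s32)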
2468 bool AMDGPULegalizerInfo::legalizeBuildVector( 2469 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2470 Register Dst = MI.getOperand(0).getReg(); 2471 const LLT S32 = LLT::scalar(32); 2472 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2473 2474 Register Src0 = MI.getOperand(1).getReg(); 2475 Register Src1 = MI.getOperand(2).getReg(); 2476 assert(MRI.getType(Src0) == LLT::scalar(16)); 2477 2478 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2479 B.buildBitcast(Dst, Merge); 2480 2481 MI.eraseFromParent(); 2482 return true; 2483 } 2484 2485 // Return the use branch instruction, otherwise null if the usage is invalid. 2486 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2487 MachineRegisterInfo &MRI, 2488 MachineInstr *&Br, 2489 MachineBasicBlock *&UncondBrTarget) { 2490 Register CondDef = MI.getOperand(0).getReg(); 2491 if (!MRI.hasOneNonDBGUse(CondDef)) 2492 return nullptr; 2493 2494 MachineBasicBlock *Parent = MI.getParent(); 2495 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2496 if (UseMI.getParent() != Parent || 2497 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2498 return nullptr; 2499 2500 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2501 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2502 if (Next == Parent->end()) { 2503 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2504 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2505 return nullptr; 2506 UncondBrTarget = &*NextMBB; 2507 } else { 2508 if (Next->getOpcode() != AMDGPU::G_BR) 2509 return nullptr; 2510 Br = &*Next; 2511 UncondBrTarget = Br->getOperand(0).getMBB(); 2512 } 2513 2514 return &UseMI; 2515 } 2516 2517 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2518 const ArgDescriptor *Arg, 2519 const TargetRegisterClass *ArgRC, 2520 LLT ArgTy) const { 2521 MCRegister SrcReg = Arg->getRegister(); 2522 assert(SrcReg.isPhysical() && "Physical register expected"); 2523 assert(DstReg.isVirtual() && "Virtual register expected"); 2524 2525 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2526 ArgTy); 2527 if (Arg->isMasked()) { 2528 // TODO: Should we try to emit this once in the entry block? 
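// Several inputs can share a single preloaded register (e.g. the packed
// work-item IDs), so unpack the requested field from the live-in value:
//   Dst = (LiveIn >> countTrailingZeros(Mask)) & (Mask >> Shift)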
2529 const LLT S32 = LLT::scalar(32); 2530 const unsigned Mask = Arg->getMask(); 2531 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2532 2533 Register AndMaskSrc = LiveIn; 2534 2535 if (Shift != 0) { 2536 auto ShiftAmt = B.buildConstant(S32, Shift); 2537 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2538 } 2539 2540 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2541 } else { 2542 B.buildCopy(DstReg, LiveIn); 2543 } 2544 2545 return true; 2546 } 2547 2548 bool AMDGPULegalizerInfo::loadInputValue( 2549 Register DstReg, MachineIRBuilder &B, 2550 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2551 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2552 const ArgDescriptor *Arg; 2553 const TargetRegisterClass *ArgRC; 2554 LLT ArgTy; 2555 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2556 2557 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2558 return false; // TODO: Handle these 2559 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2560 } 2561 2562 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2563 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2564 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2565 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2566 return false; 2567 2568 MI.eraseFromParent(); 2569 return true; 2570 } 2571 2572 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2573 MachineRegisterInfo &MRI, 2574 MachineIRBuilder &B) const { 2575 Register Dst = MI.getOperand(0).getReg(); 2576 LLT DstTy = MRI.getType(Dst); 2577 LLT S16 = LLT::scalar(16); 2578 LLT S32 = LLT::scalar(32); 2579 LLT S64 = LLT::scalar(64); 2580 2581 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2582 return true; 2583 2584 if (DstTy == S16) 2585 return legalizeFDIV16(MI, MRI, B); 2586 if (DstTy == S32) 2587 return legalizeFDIV32(MI, MRI, B); 2588 if (DstTy == S64) 2589 return legalizeFDIV64(MI, MRI, B); 2590 2591 return false; 2592 } 2593 2594 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2595 Register DstReg, 2596 Register X, 2597 Register Y, 2598 bool IsDiv) const { 2599 const LLT S1 = LLT::scalar(1); 2600 const LLT S32 = LLT::scalar(32); 2601 2602 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2603 // algorithm used here. 2604 2605 // Initial estimate of inv(y). 2606 auto FloatY = B.buildUITOFP(S32, Y); 2607 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2608 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2609 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2610 auto Z = B.buildFPTOUI(S32, ScaledY); 2611 2612 // One round of UNR. 2613 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2614 auto NegYZ = B.buildMul(S32, NegY, Z); 2615 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2616 2617 // Quotient/remainder estimate. 2618 auto Q = B.buildUMulH(S32, X, Z); 2619 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2620 2621 // First quotient/remainder refinement. 2622 auto One = B.buildConstant(S32, 1); 2623 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2624 if (IsDiv) 2625 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2626 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2627 2628 // Second quotient/remainder refinement. 
2629 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2630 if (IsDiv)
2631 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2632 else
2633 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2634 }
2635
2636 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2637 MachineRegisterInfo &MRI,
2638 MachineIRBuilder &B) const {
2639 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2640 Register DstReg = MI.getOperand(0).getReg();
2641 Register Num = MI.getOperand(1).getReg();
2642 Register Den = MI.getOperand(2).getReg();
2643 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2644 MI.eraseFromParent();
2645 return true;
2646 }
2647
2648 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2649 //
2650 // Return lo, hi of result
2651 //
2652 // %cvt.lo = G_UITOFP Val.lo
2653 // %cvt.hi = G_UITOFP Val.hi
2654 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2655 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2656 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2657 // %mul2 = G_FMUL %mul1, 2**(-32)
2658 // %trunc = G_INTRINSIC_TRUNC %mul2
2659 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2660 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2661 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2662 Register Val) {
2663 const LLT S32 = LLT::scalar(32);
2664 auto Unmerge = B.buildUnmerge(S32, Val);
2665
2666 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2667 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2668
2669 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2670 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2671
2672 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2673 auto Mul1 =
2674 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2675
2676 // 2**(-32)
2677 auto Mul2 =
2678 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2679 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2680
2681 // -(2**32)
2682 auto Mad2 = B.buildFMAD(S32, Trunc,
2683 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2684
2685 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2686 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2687
2688 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2689 }
2690
2691 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2692 Register DstReg,
2693 Register Numer,
2694 Register Denom,
2695 bool IsDiv) const {
2696 const LLT S32 = LLT::scalar(32);
2697 const LLT S64 = LLT::scalar(64);
2698 const LLT S1 = LLT::scalar(1);
2699 Register RcpLo, RcpHi;
2700
2701 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2702
2703 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2704
2705 auto Zero64 = B.buildConstant(S64, 0);
2706 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2707
2708 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2709 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2710
2711 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2712 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2713 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2714
2715 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2716 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2717 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2718 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2719
2720 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2721 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2722 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2723 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2724 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2725
2726 auto Zero32 =
B.buildConstant(S32, 0); 2727 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2728 auto Add2_HiC = 2729 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2730 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2731 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2732 2733 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2734 Register NumerLo = UnmergeNumer.getReg(0); 2735 Register NumerHi = UnmergeNumer.getReg(1); 2736 2737 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2738 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2739 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2740 Register Mul3_Lo = UnmergeMul3.getReg(0); 2741 Register Mul3_Hi = UnmergeMul3.getReg(1); 2742 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2743 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2744 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2745 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2746 2747 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2748 Register DenomLo = UnmergeDenom.getReg(0); 2749 Register DenomHi = UnmergeDenom.getReg(1); 2750 2751 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2752 auto C1 = B.buildSExt(S32, CmpHi); 2753 2754 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2755 auto C2 = B.buildSExt(S32, CmpLo); 2756 2757 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2758 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2759 2760 // TODO: Here and below portions of the code can be enclosed into if/endif. 2761 // Currently control flow is unconditional and we have 4 selects after 2762 // potential endif to substitute PHIs. 2763 2764 // if C3 != 0 ... 2765 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2766 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2767 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2768 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2769 2770 auto One64 = B.buildConstant(S64, 1); 2771 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2772 2773 auto C4 = 2774 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2775 auto C5 = 2776 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2777 auto C6 = B.buildSelect( 2778 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2779 2780 // if (C6 != 0) 2781 auto Add4 = B.buildAdd(S64, Add3, One64); 2782 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2783 2784 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2785 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2786 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2787 2788 // endif C6 2789 // endif C3 2790 2791 if (IsDiv) { 2792 auto Sel1 = B.buildSelect( 2793 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2794 B.buildSelect(DstReg, 2795 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2796 } else { 2797 auto Sel2 = B.buildSelect( 2798 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2799 B.buildSelect(DstReg, 2800 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2801 } 2802 } 2803 2804 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2805 MachineRegisterInfo &MRI, 2806 MachineIRBuilder &B) const { 2807 const LLT S64 = LLT::scalar(64); 2808 const LLT S32 = LLT::scalar(32); 2809 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2810 Register DstReg = MI.getOperand(0).getReg(); 2811 Register Num 
= MI.getOperand(1).getReg(); 2812 Register Den = MI.getOperand(2).getReg(); 2813 LLT Ty = MRI.getType(DstReg); 2814 2815 if (Ty == S32) 2816 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2817 else if (Ty == S64) 2818 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2819 else 2820 return false; 2821 2822 MI.eraseFromParent(); 2823 return true; 2824 2825 } 2826 2827 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2828 MachineRegisterInfo &MRI, 2829 MachineIRBuilder &B) const { 2830 const LLT S64 = LLT::scalar(64); 2831 const LLT S32 = LLT::scalar(32); 2832 2833 Register DstReg = MI.getOperand(0).getReg(); 2834 const LLT Ty = MRI.getType(DstReg); 2835 if (Ty != S32 && Ty != S64) 2836 return false; 2837 2838 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2839 2840 Register LHS = MI.getOperand(1).getReg(); 2841 Register RHS = MI.getOperand(2).getReg(); 2842 2843 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2844 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2845 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2846 2847 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2848 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2849 2850 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2851 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2852 2853 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2854 if (Ty == S32) 2855 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2856 else 2857 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2858 2859 Register Sign; 2860 if (IsDiv) 2861 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2862 else 2863 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2864 2865 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2866 B.buildSub(DstReg, UDivRem, Sign); 2867 2868 MI.eraseFromParent(); 2869 return true; 2870 } 2871 2872 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2873 MachineRegisterInfo &MRI, 2874 MachineIRBuilder &B) const { 2875 Register Res = MI.getOperand(0).getReg(); 2876 Register LHS = MI.getOperand(1).getReg(); 2877 Register RHS = MI.getOperand(2).getReg(); 2878 2879 uint16_t Flags = MI.getFlags(); 2880 2881 LLT ResTy = MRI.getType(Res); 2882 LLT S32 = LLT::scalar(32); 2883 LLT S64 = LLT::scalar(64); 2884 2885 const MachineFunction &MF = B.getMF(); 2886 bool Unsafe = 2887 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2888 2889 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2890 return false; 2891 2892 if (!Unsafe && ResTy == S32 && 2893 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2894 return false; 2895 2896 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2897 // 1 / x -> RCP(x) 2898 if (CLHS->isExactlyValue(1.0)) { 2899 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2900 .addUse(RHS) 2901 .setMIFlags(Flags); 2902 2903 MI.eraseFromParent(); 2904 return true; 2905 } 2906 2907 // -1 / x -> RCP( FNEG(x) ) 2908 if (CLHS->isExactlyValue(-1.0)) { 2909 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2910 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2911 .addUse(FNeg.getReg(0)) 2912 .setMIFlags(Flags); 2913 2914 MI.eraseFromParent(); 2915 return true; 2916 } 2917 } 2918 2919 // x / y -> x * (1.0 / y) 2920 if (Unsafe) { 2921 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2922 .addUse(RHS) 2923 .setMIFlags(Flags); 2924 B.buildFMul(Res, LHS, RCP, Flags); 2925 2926 MI.eraseFromParent(); 2927 return true; 2928 } 2929 2930 return false; 2931 } 2932 2933 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2934 MachineRegisterInfo &MRI, 2935 MachineIRBuilder &B) const { 2936 Register Res = MI.getOperand(0).getReg(); 2937 Register LHS = MI.getOperand(1).getReg(); 2938 Register RHS = MI.getOperand(2).getReg(); 2939 2940 uint16_t Flags = MI.getFlags(); 2941 2942 LLT S16 = LLT::scalar(16); 2943 LLT S32 = LLT::scalar(32); 2944 2945 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2946 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2947 2948 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2949 .addUse(RHSExt.getReg(0)) 2950 .setMIFlags(Flags); 2951 2952 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2953 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2954 2955 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2956 .addUse(RDst.getReg(0)) 2957 .addUse(RHS) 2958 .addUse(LHS) 2959 .setMIFlags(Flags); 2960 2961 MI.eraseFromParent(); 2962 return true; 2963 } 2964 2965 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2966 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2967 static void toggleSPDenormMode(bool Enable, 2968 MachineIRBuilder &B, 2969 const GCNSubtarget &ST, 2970 AMDGPU::SIModeRegisterDefaults Mode) { 2971 // Set SP denorm mode to this value. 2972 unsigned SPDenormMode = 2973 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2974 2975 if (ST.hasDenormModeInst()) { 2976 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2977 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2978 2979 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2980 B.buildInstr(AMDGPU::S_DENORM_MODE) 2981 .addImm(NewDenormModeValue); 2982 2983 } else { 2984 // Select FP32 bit field in mode register. 2985 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2986 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2987 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2988 2989 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2990 .addImm(SPDenormMode) 2991 .addImm(SPDenormModeBitField); 2992 } 2993 } 2994 2995 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2996 MachineRegisterInfo &MRI, 2997 MachineIRBuilder &B) const { 2998 Register Res = MI.getOperand(0).getReg(); 2999 Register LHS = MI.getOperand(1).getReg(); 3000 Register RHS = MI.getOperand(2).getReg(); 3001 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3002 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3003 3004 uint16_t Flags = MI.getFlags(); 3005 3006 LLT S32 = LLT::scalar(32); 3007 LLT S1 = LLT::scalar(1); 3008 3009 auto One = B.buildFConstant(S32, 1.0f); 3010 3011 auto DenominatorScaled = 3012 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3013 .addUse(LHS) 3014 .addUse(RHS) 3015 .addImm(0) 3016 .setMIFlags(Flags); 3017 auto NumeratorScaled = 3018 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3019 .addUse(LHS) 3020 .addUse(RHS) 3021 .addImm(1) 3022 .setMIFlags(Flags); 3023 3024 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3025 .addUse(DenominatorScaled.getReg(0)) 3026 .setMIFlags(Flags); 3027 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3028 3029 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3030 // aren't modeled as reading it. 
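// If FP32 denormals are flushed by default, temporarily enable them around
// the FMA-based refinement below so the intermediate results are not
// flushed, then restore the original mode afterwards.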
3031 if (!Mode.allFP32Denormals()) 3032 toggleSPDenormMode(true, B, ST, Mode); 3033 3034 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3035 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3036 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3037 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3038 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3039 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3040 3041 if (!Mode.allFP32Denormals()) 3042 toggleSPDenormMode(false, B, ST, Mode); 3043 3044 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3045 .addUse(Fma4.getReg(0)) 3046 .addUse(Fma1.getReg(0)) 3047 .addUse(Fma3.getReg(0)) 3048 .addUse(NumeratorScaled.getReg(1)) 3049 .setMIFlags(Flags); 3050 3051 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3052 .addUse(Fmas.getReg(0)) 3053 .addUse(RHS) 3054 .addUse(LHS) 3055 .setMIFlags(Flags); 3056 3057 MI.eraseFromParent(); 3058 return true; 3059 } 3060 3061 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3062 MachineRegisterInfo &MRI, 3063 MachineIRBuilder &B) const { 3064 Register Res = MI.getOperand(0).getReg(); 3065 Register LHS = MI.getOperand(1).getReg(); 3066 Register RHS = MI.getOperand(2).getReg(); 3067 3068 uint16_t Flags = MI.getFlags(); 3069 3070 LLT S64 = LLT::scalar(64); 3071 LLT S1 = LLT::scalar(1); 3072 3073 auto One = B.buildFConstant(S64, 1.0); 3074 3075 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3076 .addUse(LHS) 3077 .addUse(RHS) 3078 .addImm(0) 3079 .setMIFlags(Flags); 3080 3081 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3082 3083 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3084 .addUse(DivScale0.getReg(0)) 3085 .setMIFlags(Flags); 3086 3087 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3088 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3089 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3090 3091 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3092 .addUse(LHS) 3093 .addUse(RHS) 3094 .addImm(1) 3095 .setMIFlags(Flags); 3096 3097 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3098 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3099 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3100 3101 Register Scale; 3102 if (!ST.hasUsableDivScaleConditionOutput()) { 3103 // Workaround a hardware bug on SI where the condition output from div_scale 3104 // is not usable. 
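// Recompute the flag by comparing the high halves of the sources against the
// high halves of the div_scale results (i.e. whether div_scale changed them)
// and combining the two checks with an xor.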
3105 3106 LLT S32 = LLT::scalar(32); 3107 3108 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3109 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3110 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3111 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3112 3113 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3114 Scale1Unmerge.getReg(1)); 3115 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3116 Scale0Unmerge.getReg(1)); 3117 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3118 } else { 3119 Scale = DivScale1.getReg(1); 3120 } 3121 3122 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3123 .addUse(Fma4.getReg(0)) 3124 .addUse(Fma3.getReg(0)) 3125 .addUse(Mul.getReg(0)) 3126 .addUse(Scale) 3127 .setMIFlags(Flags); 3128 3129 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3130 .addUse(Fmas.getReg(0)) 3131 .addUse(RHS) 3132 .addUse(LHS) 3133 .setMIFlags(Flags); 3134 3135 MI.eraseFromParent(); 3136 return true; 3137 } 3138 3139 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3140 MachineRegisterInfo &MRI, 3141 MachineIRBuilder &B) const { 3142 Register Res = MI.getOperand(0).getReg(); 3143 Register LHS = MI.getOperand(2).getReg(); 3144 Register RHS = MI.getOperand(3).getReg(); 3145 uint16_t Flags = MI.getFlags(); 3146 3147 LLT S32 = LLT::scalar(32); 3148 LLT S1 = LLT::scalar(1); 3149 3150 auto Abs = B.buildFAbs(S32, RHS, Flags); 3151 const APFloat C0Val(1.0f); 3152 3153 auto C0 = B.buildConstant(S32, 0x6f800000); 3154 auto C1 = B.buildConstant(S32, 0x2f800000); 3155 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3156 3157 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3158 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3159 3160 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3161 3162 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3163 .addUse(Mul0.getReg(0)) 3164 .setMIFlags(Flags); 3165 3166 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3167 3168 B.buildFMul(Res, Sel, Mul1, Flags); 3169 3170 MI.eraseFromParent(); 3171 return true; 3172 } 3173 3174 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3175 // FIXME: Why do we handle this one but not other removed instructions? 3176 // 3177 // Reciprocal square root. The clamp prevents infinite results, clamping 3178 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3179 // +-max_float. 3180 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3181 MachineRegisterInfo &MRI, 3182 MachineIRBuilder &B) const { 3183 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3184 return true; 3185 3186 Register Dst = MI.getOperand(0).getReg(); 3187 Register Src = MI.getOperand(2).getReg(); 3188 auto Flags = MI.getFlags(); 3189 3190 LLT Ty = MRI.getType(Dst); 3191 3192 const fltSemantics *FltSemantics; 3193 if (Ty == LLT::scalar(32)) 3194 FltSemantics = &APFloat::IEEEsingle(); 3195 else if (Ty == LLT::scalar(64)) 3196 FltSemantics = &APFloat::IEEEdouble(); 3197 else 3198 return false; 3199 3200 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3201 .addUse(Src) 3202 .setMIFlags(Flags); 3203 3204 // We don't need to concern ourselves with the snan handling difference, since 3205 // the rsq quieted (or not) so use the one which will directly select. 
3206 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3207 const bool UseIEEE = MFI->getMode().IEEE; 3208 3209 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3210 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3211 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3212 3213 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3214 3215 if (UseIEEE) 3216 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3217 else 3218 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3219 MI.eraseFromParent(); 3220 return true; 3221 } 3222 3223 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3224 switch (IID) { 3225 case Intrinsic::amdgcn_ds_fadd: 3226 return AMDGPU::G_ATOMICRMW_FADD; 3227 case Intrinsic::amdgcn_ds_fmin: 3228 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3229 case Intrinsic::amdgcn_ds_fmax: 3230 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3231 default: 3232 llvm_unreachable("not a DS FP intrinsic"); 3233 } 3234 } 3235 3236 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3237 MachineInstr &MI, 3238 Intrinsic::ID IID) const { 3239 GISelChangeObserver &Observer = Helper.Observer; 3240 Observer.changingInstr(MI); 3241 3242 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3243 3244 // The remaining operands were used to set fields in the MemOperand on 3245 // construction. 3246 for (int I = 6; I > 3; --I) 3247 MI.RemoveOperand(I); 3248 3249 MI.RemoveOperand(1); // Remove the intrinsic ID. 3250 Observer.changedInstr(MI); 3251 return true; 3252 } 3253 3254 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3255 MachineRegisterInfo &MRI, 3256 MachineIRBuilder &B) const { 3257 uint64_t Offset = 3258 ST.getTargetLowering()->getImplicitParameterOffset( 3259 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3260 LLT DstTy = MRI.getType(DstReg); 3261 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3262 3263 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3264 if (!loadInputValue(KernargPtrReg, B, 3265 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3266 return false; 3267 3268 // FIXME: This should be nuw 3269 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3270 return true; 3271 } 3272 3273 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3274 MachineRegisterInfo &MRI, 3275 MachineIRBuilder &B) const { 3276 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3277 if (!MFI->isEntryFunction()) { 3278 return legalizePreloadedArgIntrin(MI, MRI, B, 3279 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3280 } 3281 3282 Register DstReg = MI.getOperand(0).getReg(); 3283 if (!getImplicitArgPtr(DstReg, MRI, B)) 3284 return false; 3285 3286 MI.eraseFromParent(); 3287 return true; 3288 } 3289 3290 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3291 MachineRegisterInfo &MRI, 3292 MachineIRBuilder &B, 3293 unsigned AddrSpace) const { 3294 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3295 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3296 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3297 MI.eraseFromParent(); 3298 return true; 3299 } 3300 3301 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3302 // offset (the offset that is included in bounds checking and swizzling, to be 3303 // split between the instruction's voffset and immoffset fields) and soffset 3304 // (the offset that is 
excluded from bounds checking and swizzling, to go in 3305 // the instruction's soffset field). This function takes the first kind of 3306 // offset and figures out how to split it between voffset and immoffset. 3307 std::tuple<Register, unsigned, unsigned> 3308 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3309 Register OrigOffset) const { 3310 const unsigned MaxImm = 4095; 3311 Register BaseReg; 3312 unsigned TotalConstOffset; 3313 MachineInstr *OffsetDef; 3314 const LLT S32 = LLT::scalar(32); 3315 3316 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3317 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3318 3319 unsigned ImmOffset = TotalConstOffset; 3320 3321 // If the immediate value is too big for the immoffset field, put the value 3322 // and -4096 into the immoffset field so that the value that is copied/added 3323 // for the voffset field is a multiple of 4096, and it stands more chance 3324 // of being CSEd with the copy/add for another similar load/store. 3325 // However, do not do that rounding down to a multiple of 4096 if that is a 3326 // negative number, as it appears to be illegal to have a negative offset 3327 // in the vgpr, even if adding the immediate offset makes it positive. 3328 unsigned Overflow = ImmOffset & ~MaxImm; 3329 ImmOffset -= Overflow; 3330 if ((int32_t)Overflow < 0) { 3331 Overflow += ImmOffset; 3332 ImmOffset = 0; 3333 } 3334 3335 if (Overflow != 0) { 3336 if (!BaseReg) { 3337 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3338 } else { 3339 auto OverflowVal = B.buildConstant(S32, Overflow); 3340 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3341 } 3342 } 3343 3344 if (!BaseReg) 3345 BaseReg = B.buildConstant(S32, 0).getReg(0); 3346 3347 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3348 } 3349 3350 /// Handle register layout difference for f16 images for some subtargets. 3351 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3352 MachineRegisterInfo &MRI, 3353 Register Reg) const { 3354 if (!ST.hasUnpackedD16VMem()) 3355 return Reg; 3356 3357 const LLT S16 = LLT::scalar(16); 3358 const LLT S32 = LLT::scalar(32); 3359 LLT StoreVT = MRI.getType(Reg); 3360 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3361 3362 auto Unmerge = B.buildUnmerge(S16, Reg); 3363 3364 SmallVector<Register, 4> WideRegs; 3365 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3366 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3367 3368 int NumElts = StoreVT.getNumElements(); 3369 3370 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3371 } 3372 3373 Register AMDGPULegalizerInfo::fixStoreSourceType( 3374 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3375 MachineRegisterInfo *MRI = B.getMRI(); 3376 LLT Ty = MRI->getType(VData); 3377 3378 const LLT S16 = LLT::scalar(16); 3379 3380 // Fixup illegal register types for i8 stores. 
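// s8 and s16 sources are any-extended to s32; the store width comes from the
// memory operand, which selects the BYTE/SHORT buffer store opcodes below, so
// the extended high bits are don't-care.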
3381 if (Ty == LLT::scalar(8) || Ty == S16) { 3382 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3383 return AnyExt; 3384 } 3385 3386 if (Ty.isVector()) { 3387 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3388 if (IsFormat) 3389 return handleD16VData(B, *MRI, VData); 3390 } 3391 } 3392 3393 return VData; 3394 } 3395 3396 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3397 MachineRegisterInfo &MRI, 3398 MachineIRBuilder &B, 3399 bool IsTyped, 3400 bool IsFormat) const { 3401 Register VData = MI.getOperand(1).getReg(); 3402 LLT Ty = MRI.getType(VData); 3403 LLT EltTy = Ty.getScalarType(); 3404 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3405 const LLT S32 = LLT::scalar(32); 3406 3407 VData = fixStoreSourceType(B, VData, IsFormat); 3408 Register RSrc = MI.getOperand(2).getReg(); 3409 3410 MachineMemOperand *MMO = *MI.memoperands_begin(); 3411 const int MemSize = MMO->getSize(); 3412 3413 unsigned ImmOffset; 3414 unsigned TotalOffset; 3415 3416 // The typed intrinsics add an immediate after the registers. 3417 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3418 3419 // The struct intrinsic variants add one additional operand over raw. 3420 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3421 Register VIndex; 3422 int OpOffset = 0; 3423 if (HasVIndex) { 3424 VIndex = MI.getOperand(3).getReg(); 3425 OpOffset = 1; 3426 } 3427 3428 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3429 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3430 3431 unsigned Format = 0; 3432 if (IsTyped) { 3433 Format = MI.getOperand(5 + OpOffset).getImm(); 3434 ++OpOffset; 3435 } 3436 3437 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3438 3439 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3440 if (TotalOffset != 0) 3441 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3442 3443 unsigned Opc; 3444 if (IsTyped) { 3445 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3446 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3447 } else if (IsFormat) { 3448 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3449 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3450 } else { 3451 switch (MemSize) { 3452 case 1: 3453 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3454 break; 3455 case 2: 3456 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3457 break; 3458 default: 3459 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3460 break; 3461 } 3462 } 3463 3464 if (!VIndex) 3465 VIndex = B.buildConstant(S32, 0).getReg(0); 3466 3467 auto MIB = B.buildInstr(Opc) 3468 .addUse(VData) // vdata 3469 .addUse(RSrc) // rsrc 3470 .addUse(VIndex) // vindex 3471 .addUse(VOffset) // voffset 3472 .addUse(SOffset) // soffset 3473 .addImm(ImmOffset); // offset(imm) 3474 3475 if (IsTyped) 3476 MIB.addImm(Format); 3477 3478 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3479 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3480 .addMemOperand(MMO); 3481 3482 MI.eraseFromParent(); 3483 return true; 3484 } 3485 3486 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3487 MachineRegisterInfo &MRI, 3488 MachineIRBuilder &B, 3489 bool IsFormat, 3490 bool IsTyped) const { 3491 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
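// Operand layout after the result and intrinsic ID: rsrc, [vindex for the
// struct variants], voffset, soffset, [format immediate for the typed
// variants], aux/cachepolicy immediate. splitBufferOffsets peels the constant
// part of voffset into the immediate offset field; e.g. a constant offset of
// 5000 ends up as voffset + 4096 with an immediate offset of 904.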
3492 MachineMemOperand *MMO = *MI.memoperands_begin(); 3493 const int MemSize = MMO->getSize(); 3494 const LLT S32 = LLT::scalar(32); 3495 3496 Register Dst = MI.getOperand(0).getReg(); 3497 Register RSrc = MI.getOperand(2).getReg(); 3498 3499 // The typed intrinsics add an immediate after the registers. 3500 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3501 3502 // The struct intrinsic variants add one additional operand over raw. 3503 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3504 Register VIndex; 3505 int OpOffset = 0; 3506 if (HasVIndex) { 3507 VIndex = MI.getOperand(3).getReg(); 3508 OpOffset = 1; 3509 } 3510 3511 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3512 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3513 3514 unsigned Format = 0; 3515 if (IsTyped) { 3516 Format = MI.getOperand(5 + OpOffset).getImm(); 3517 ++OpOffset; 3518 } 3519 3520 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3521 unsigned ImmOffset; 3522 unsigned TotalOffset; 3523 3524 LLT Ty = MRI.getType(Dst); 3525 LLT EltTy = Ty.getScalarType(); 3526 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3527 const bool Unpacked = ST.hasUnpackedD16VMem(); 3528 3529 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3530 if (TotalOffset != 0) 3531 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3532 3533 unsigned Opc; 3534 3535 if (IsTyped) { 3536 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3537 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3538 } else if (IsFormat) { 3539 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3540 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3541 } else { 3542 switch (MemSize) { 3543 case 1: 3544 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3545 break; 3546 case 2: 3547 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3548 break; 3549 default: 3550 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3551 break; 3552 } 3553 } 3554 3555 Register LoadDstReg; 3556 3557 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3558 LLT UnpackedTy = Ty.changeElementSize(32); 3559 3560 if (IsExtLoad) 3561 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3562 else if (Unpacked && IsD16 && Ty.isVector()) 3563 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3564 else 3565 LoadDstReg = Dst; 3566 3567 if (!VIndex) 3568 VIndex = B.buildConstant(S32, 0).getReg(0); 3569 3570 auto MIB = B.buildInstr(Opc) 3571 .addDef(LoadDstReg) // vdata 3572 .addUse(RSrc) // rsrc 3573 .addUse(VIndex) // vindex 3574 .addUse(VOffset) // voffset 3575 .addUse(SOffset) // soffset 3576 .addImm(ImmOffset); // offset(imm) 3577 3578 if (IsTyped) 3579 MIB.addImm(Format); 3580 3581 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3582 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3583 .addMemOperand(MMO); 3584 3585 if (LoadDstReg != Dst) { 3586 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3587 3588 // Widen result for extending loads was widened. 
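// That is, the load was issued into a wider virtual register than the
// requested result: extending loads are truncated back to Dst, and unpacked
// D16 vector results are unmerged and repacked element by element.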
3589 if (IsExtLoad) 3590 B.buildTrunc(Dst, LoadDstReg); 3591 else { 3592 // Repack to original 16-bit vector result 3593 // FIXME: G_TRUNC should work, but legalization currently fails 3594 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3595 SmallVector<Register, 4> Repack; 3596 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3597 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3598 B.buildMerge(Dst, Repack); 3599 } 3600 } 3601 3602 MI.eraseFromParent(); 3603 return true; 3604 } 3605 3606 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3607 MachineIRBuilder &B, 3608 bool IsInc) const { 3609 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3610 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3611 B.buildInstr(Opc) 3612 .addDef(MI.getOperand(0).getReg()) 3613 .addUse(MI.getOperand(2).getReg()) 3614 .addUse(MI.getOperand(3).getReg()) 3615 .cloneMemRefs(MI); 3616 MI.eraseFromParent(); 3617 return true; 3618 } 3619 3620 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3621 switch (IntrID) { 3622 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3623 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3624 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3625 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3626 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3627 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3628 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3629 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3630 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3631 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3632 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3633 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3634 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3635 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3636 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3637 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3638 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3639 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3640 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3641 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3642 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3643 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3644 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3646 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3647 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3648 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3649 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3650 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3651 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3652 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3653 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3654 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3655 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3656 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3657 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3658 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3659 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3660 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3661 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3662 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3663 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3664 default: 3665 llvm_unreachable("unhandled atomic opcode"); 3666 } 3667 } 3668 3669 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3670 MachineIRBuilder &B, 3671 Intrinsic::ID IID) const { 3672 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3673 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3674 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3675 3676 Register Dst; 3677 3678 int OpOffset = 0; 3679 if (HasReturn) { 3680 // A few FP atomics do not support return values. 3681 Dst = MI.getOperand(0).getReg(); 3682 } else { 3683 OpOffset = -1; 3684 } 3685 3686 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3687 Register CmpVal; 3688 3689 if (IsCmpSwap) { 3690 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3691 ++OpOffset; 3692 } 3693 3694 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3695 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3696 3697 // The struct intrinsic variants add one additional operand over raw. 3698 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3699 Register VIndex; 3700 if (HasVIndex) { 3701 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3702 ++OpOffset; 3703 } 3704 3705 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3706 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3707 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3708 3709 MachineMemOperand *MMO = *MI.memoperands_begin(); 3710 3711 unsigned ImmOffset; 3712 unsigned TotalOffset; 3713 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3714 if (TotalOffset != 0) 3715 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3716 3717 if (!VIndex) 3718 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3719 3720 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3721 3722 if (HasReturn) 3723 MIB.addDef(Dst); 3724 3725 MIB.addUse(VData); // vdata 3726 3727 if (IsCmpSwap) 3728 MIB.addReg(CmpVal); 3729 3730 MIB.addUse(RSrc) // rsrc 3731 .addUse(VIndex) // vindex 3732 .addUse(VOffset) // voffset 3733 .addUse(SOffset) // soffset 3734 .addImm(ImmOffset) // offset(imm) 3735 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3736 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3737 .addMemOperand(MMO); 3738 3739 MI.eraseFromParent(); 3740 return true; 3741 } 3742 3743 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3744 /// vector with s16 typed elements. 3745 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3746 SmallVectorImpl<Register> &PackedAddrs, 3747 int AddrIdx, int DimIdx, int EndIdx, 3748 int NumGradients) { 3749 const LLT S16 = LLT::scalar(16); 3750 const LLT V2S16 = LLT::vector(2, 16); 3751 3752 for (int I = AddrIdx; I < EndIdx; ++I) { 3753 MachineOperand &SrcOp = MI.getOperand(I); 3754 if (!SrcOp.isReg()) 3755 continue; // _L to _LZ may have eliminated this. 3756 3757 Register AddrReg = SrcOp.getReg(); 3758 3759 if (I < DimIdx) { 3760 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3761 PackedAddrs.push_back(AddrReg); 3762 } else { 3763 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3764 // derivatives dx/dh and dx/dv are packed with undef. 
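// e.g. 2D gradients pack pairwise as {dx/dh, dy/dh} and {dx/dv, dy/dv}, while
// each 1D gradient (and any trailing odd coordinate) is paired with an undef
// s16 lane to fill out the dword.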
3765 if (((I + 1) >= EndIdx) || 3766 ((NumGradients / 2) % 2 == 1 && 3767 (I == DimIdx + (NumGradients / 2) - 1 || 3768 I == DimIdx + NumGradients - 1)) || 3769 // Check for _L to _LZ optimization 3770 !MI.getOperand(I + 1).isReg()) { 3771 PackedAddrs.push_back( 3772 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3773 .getReg(0)); 3774 } else { 3775 PackedAddrs.push_back( 3776 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3777 .getReg(0)); 3778 ++I; 3779 } 3780 } 3781 } 3782 } 3783 3784 /// Convert from separate vaddr components to a single vector address register, 3785 /// and replace the remaining operands with $noreg. 3786 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3787 int DimIdx, int NumVAddrs) { 3788 const LLT S32 = LLT::scalar(32); 3789 3790 SmallVector<Register, 8> AddrRegs; 3791 for (int I = 0; I != NumVAddrs; ++I) { 3792 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3793 if (SrcOp.isReg()) { 3794 AddrRegs.push_back(SrcOp.getReg()); 3795 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3796 } 3797 } 3798 3799 int NumAddrRegs = AddrRegs.size(); 3800 if (NumAddrRegs != 1) { 3801 // Round up to 8 elements for v5-v7 3802 // FIXME: Missing intermediate sized register classes and instructions. 3803 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3804 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3805 auto Undef = B.buildUndef(S32); 3806 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3807 NumAddrRegs = RoundedNumRegs; 3808 } 3809 3810 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3811 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3812 } 3813 3814 for (int I = 1; I != NumVAddrs; ++I) { 3815 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3816 if (SrcOp.isReg()) 3817 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3818 } 3819 } 3820 3821 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3822 /// 3823 /// Depending on the subtarget, load/store with 16-bit element data need to be 3824 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3825 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3826 /// registers. 3827 /// 3828 /// We don't want to directly select image instructions just yet, but also want 3829 /// to exposes all register repacking to the legalizer/combiners. We also don't 3830 /// want a selected instrution entering RegBankSelect. In order to avoid 3831 /// defining a multitude of intermediate image instructions, directly hack on 3832 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3833 /// now unnecessary arguments with $noreg. 3834 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3835 MachineInstr &MI, MachineIRBuilder &B, 3836 GISelChangeObserver &Observer, 3837 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3838 3839 const int NumDefs = MI.getNumExplicitDefs(); 3840 bool IsTFE = NumDefs == 2; 3841 // We are only processing the operands of d16 image operations on subtargets 3842 // that use the unpacked register layout, or need to repack the TFE result. 3843 3844 // TODO: Do we need to guard against already legalized intrinsics? 
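// Consult the generated MIMG tables to find where the dmask, address and data
// operands of this particular image opcode live.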
3845 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3846 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3847 3848 MachineRegisterInfo *MRI = B.getMRI(); 3849 const LLT S32 = LLT::scalar(32); 3850 const LLT S16 = LLT::scalar(16); 3851 const LLT V2S16 = LLT::vector(2, 16); 3852 3853 // Index of first address argument 3854 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3855 3856 int NumVAddrs, NumGradients; 3857 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3858 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3859 getDMaskIdx(BaseOpcode, NumDefs); 3860 unsigned DMask = 0; 3861 3862 // Check for 16 bit addresses and pack if true. 3863 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3864 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3865 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3866 const bool IsG16 = GradTy == S16; 3867 const bool IsA16 = AddrTy == S16; 3868 3869 int DMaskLanes = 0; 3870 if (!BaseOpcode->Atomic) { 3871 DMask = MI.getOperand(DMaskIdx).getImm(); 3872 if (BaseOpcode->Gather4) { 3873 DMaskLanes = 4; 3874 } else if (DMask != 0) { 3875 DMaskLanes = countPopulation(DMask); 3876 } else if (!IsTFE && !BaseOpcode->Store) { 3877 // If dmask is 0, this is a no-op load. This can be eliminated. 3878 B.buildUndef(MI.getOperand(0)); 3879 MI.eraseFromParent(); 3880 return true; 3881 } 3882 } 3883 3884 Observer.changingInstr(MI); 3885 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3886 3887 unsigned NewOpcode = NumDefs == 0 ? 3888 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3889 3890 // Track that we legalized this 3891 MI.setDesc(B.getTII().get(NewOpcode)); 3892 3893 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3894 // dmask to be at least 1 otherwise the instruction will fail 3895 if (IsTFE && DMask == 0) { 3896 DMask = 0x1; 3897 DMaskLanes = 1; 3898 MI.getOperand(DMaskIdx).setImm(DMask); 3899 } 3900 3901 if (BaseOpcode->Atomic) { 3902 Register VData0 = MI.getOperand(2).getReg(); 3903 LLT Ty = MRI->getType(VData0); 3904 3905 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3906 if (Ty.isVector()) 3907 return false; 3908 3909 if (BaseOpcode->AtomicX2) { 3910 Register VData1 = MI.getOperand(3).getReg(); 3911 // The two values are packed in one register. 3912 LLT PackedTy = LLT::vector(2, Ty); 3913 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3914 MI.getOperand(2).setReg(Concat.getReg(0)); 3915 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3916 } 3917 } 3918 3919 int CorrectedNumVAddrs = NumVAddrs; 3920 3921 // Optimize _L to _LZ when _L is zero 3922 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3923 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3924 const ConstantFP *ConstantLod; 3925 const int LodIdx = AddrIdx + NumVAddrs - 1; 3926 3927 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3928 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3929 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3930 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3931 LZMappingInfo->LZ, ImageDimIntr->Dim); 3932 3933 // The starting indexes should remain in the same place. 
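// Only the trailing lod operand goes away, so AddrIdx and DimIdx stay valid;
// shrink the vaddr counts and switch the intrinsic ID over to the _lz form.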
3934 --NumVAddrs; 3935 --CorrectedNumVAddrs; 3936 3937 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3938 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3939 MI.RemoveOperand(LodIdx); 3940 } 3941 } 3942 } 3943 3944 // Optimize _mip away, when 'lod' is zero 3945 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3946 int64_t ConstantLod; 3947 const int LodIdx = AddrIdx + NumVAddrs - 1; 3948 3949 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3950 if (ConstantLod == 0) { 3951 // TODO: Change intrinsic opcode and remove operand instead or replacing 3952 // it with 0, as the _L to _LZ handling is done above. 3953 MI.getOperand(LodIdx).ChangeToImmediate(0); 3954 --CorrectedNumVAddrs; 3955 } 3956 } 3957 } 3958 3959 // Rewrite the addressing register layout before doing anything else. 3960 if (IsA16 || IsG16) { 3961 if (IsA16) { 3962 // Target must support the feature and gradients need to be 16 bit too 3963 if (!ST.hasA16() || !IsG16) 3964 return false; 3965 } else if (!ST.hasG16()) 3966 return false; 3967 3968 if (NumVAddrs > 1) { 3969 SmallVector<Register, 4> PackedRegs; 3970 // Don't compress addresses for G16 3971 const int PackEndIdx = 3972 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3973 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3974 PackEndIdx, NumGradients); 3975 3976 if (!IsA16) { 3977 // Add uncompressed address 3978 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3979 int AddrReg = MI.getOperand(I).getReg(); 3980 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3981 PackedRegs.push_back(AddrReg); 3982 } 3983 } 3984 3985 // See also below in the non-a16 branch 3986 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3987 3988 if (!UseNSA && PackedRegs.size() > 1) { 3989 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3990 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3991 PackedRegs[0] = Concat.getReg(0); 3992 PackedRegs.resize(1); 3993 } 3994 3995 const int NumPacked = PackedRegs.size(); 3996 for (int I = 0; I != NumVAddrs; ++I) { 3997 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3998 if (!SrcOp.isReg()) { 3999 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4000 continue; 4001 } 4002 4003 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4004 4005 if (I < NumPacked) 4006 SrcOp.setReg(PackedRegs[I]); 4007 else 4008 SrcOp.setReg(AMDGPU::NoRegister); 4009 } 4010 } 4011 } else { 4012 // If the register allocator cannot place the address registers contiguously 4013 // without introducing moves, then using the non-sequential address encoding 4014 // is always preferable, since it saves VALU instructions and is usually a 4015 // wash in terms of code size or even better. 4016 // 4017 // However, we currently have no way of hinting to the register allocator 4018 // that MIMG addresses should be placed contiguously when it is possible to 4019 // do so, so force non-NSA for the common 2-address case as a heuristic. 4020 // 4021 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4022 // allocation when possible. 4023 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4024 4025 if (!UseNSA && NumVAddrs > 1) 4026 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4027 } 4028 4029 int Flags = 0; 4030 if (IsA16) 4031 Flags |= 1; 4032 if (IsG16) 4033 Flags |= 2; 4034 MI.addOperand(MachineOperand::CreateImm(Flags)); 4035 4036 if (BaseOpcode->Store) { // No TFE for stores? 
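// Stores only need the D16 data repacking below; the dmask-driven result
// rewriting in the rest of this function applies to loads.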
4037 // TODO: Handle dmask trim 4038 Register VData = MI.getOperand(1).getReg(); 4039 LLT Ty = MRI->getType(VData); 4040 if (!Ty.isVector() || Ty.getElementType() != S16) 4041 return true; 4042 4043 Register RepackedReg = handleD16VData(B, *MRI, VData); 4044 if (RepackedReg != VData) { 4045 MI.getOperand(1).setReg(RepackedReg); 4046 } 4047 4048 return true; 4049 } 4050 4051 Register DstReg = MI.getOperand(0).getReg(); 4052 LLT Ty = MRI->getType(DstReg); 4053 const LLT EltTy = Ty.getScalarType(); 4054 const bool IsD16 = Ty.getScalarType() == S16; 4055 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4056 4057 // Confirm that the return type is large enough for the dmask specified 4058 if (NumElts < DMaskLanes) 4059 return false; 4060 4061 if (NumElts > 4 || DMaskLanes > 4) 4062 return false; 4063 4064 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4065 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4066 4067 // The raw dword aligned data component of the load. The only legal cases 4068 // where this matters should be when using the packed D16 format, for 4069 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4070 LLT RoundedTy; 4071 4072 // S32 vector to to cover all data, plus TFE result element. 4073 LLT TFETy; 4074 4075 // Register type to use for each loaded component. Will be S32 or V2S16. 4076 LLT RegTy; 4077 4078 if (IsD16 && ST.hasUnpackedD16VMem()) { 4079 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4080 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4081 RegTy = S32; 4082 } else { 4083 unsigned EltSize = EltTy.getSizeInBits(); 4084 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4085 unsigned RoundedSize = 32 * RoundedElts; 4086 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4087 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4088 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4089 } 4090 4091 // The return type does not need adjustment. 4092 // TODO: Should we change s16 case to s32 or <2 x s16>? 4093 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4094 return true; 4095 4096 Register Dst1Reg; 4097 4098 // Insert after the instruction. 4099 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4100 4101 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4102 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4103 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4104 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4105 4106 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4107 4108 MI.getOperand(0).setReg(NewResultReg); 4109 4110 // In the IR, TFE is supposed to be used with a 2 element struct return 4111 // type. The intruction really returns these two values in one contiguous 4112 // register, with one additional dword beyond the loaded data. Rewrite the 4113 // return type to use a single register result. 4114 4115 if (IsTFE) { 4116 Dst1Reg = MI.getOperand(1).getReg(); 4117 if (MRI->getType(Dst1Reg) != S32) 4118 return false; 4119 4120 // TODO: Make sure the TFE operand bit is set. 4121 MI.RemoveOperand(1); 4122 4123 // Handle the easy case that requires no repack instructions. 4124 if (Ty == S32) { 4125 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4126 return true; 4127 } 4128 } 4129 4130 // Now figure out how to copy the new result register back into the old 4131 // result. 4132 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4133 4134 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4135 4136 if (ResultNumRegs == 1) { 4137 assert(!IsTFE); 4138 ResultRegs[0] = NewResultReg; 4139 } else { 4140 // We have to repack into a new vector of some kind. 4141 for (int I = 0; I != NumDataRegs; ++I) 4142 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4143 B.buildUnmerge(ResultRegs, NewResultReg); 4144 4145 // Drop the final TFE element to get the data part. The TFE result is 4146 // directly written to the right place already. 4147 if (IsTFE) 4148 ResultRegs.resize(NumDataRegs); 4149 } 4150 4151 // For an s16 scalar result, we form an s32 result with a truncate regardless 4152 // of packed vs. unpacked. 4153 if (IsD16 && !Ty.isVector()) { 4154 B.buildTrunc(DstReg, ResultRegs[0]); 4155 return true; 4156 } 4157 4158 // Avoid a build/concat_vector of 1 entry. 4159 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4160 B.buildBitcast(DstReg, ResultRegs[0]); 4161 return true; 4162 } 4163 4164 assert(Ty.isVector()); 4165 4166 if (IsD16) { 4167 // For packed D16 results with TFE enabled, all the data components are 4168 // S32. Cast back to the expected type. 4169 // 4170 // TODO: We don't really need to use load s32 elements. We would only need one 4171 // cast for the TFE result if a multiple of v2s16 was used. 4172 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4173 for (Register &Reg : ResultRegs) 4174 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4175 } else if (ST.hasUnpackedD16VMem()) { 4176 for (Register &Reg : ResultRegs) 4177 Reg = B.buildTrunc(S16, Reg).getReg(0); 4178 } 4179 } 4180 4181 auto padWithUndef = [&](LLT Ty, int NumElts) { 4182 if (NumElts == 0) 4183 return; 4184 Register Undef = B.buildUndef(Ty).getReg(0); 4185 for (int I = 0; I != NumElts; ++I) 4186 ResultRegs.push_back(Undef); 4187 }; 4188 4189 // Pad out any elements eliminated due to the dmask. 4190 LLT ResTy = MRI->getType(ResultRegs[0]); 4191 if (!ResTy.isVector()) { 4192 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4193 B.buildBuildVector(DstReg, ResultRegs); 4194 return true; 4195 } 4196 4197 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4198 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4199 4200 // Deal with the one annoying legal case. 4201 const LLT V3S16 = LLT::vector(3, 16); 4202 if (Ty == V3S16) { 4203 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4204 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4205 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4206 return true; 4207 } 4208 4209 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4210 B.buildConcatVectors(DstReg, ResultRegs); 4211 return true; 4212 } 4213 4214 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4215 LegalizerHelper &Helper, MachineInstr &MI) const { 4216 MachineIRBuilder &B = Helper.MIRBuilder; 4217 GISelChangeObserver &Observer = Helper.Observer; 4218 4219 Register Dst = MI.getOperand(0).getReg(); 4220 LLT Ty = B.getMRI()->getType(Dst); 4221 unsigned Size = Ty.getSizeInBits(); 4222 MachineFunction &MF = B.getMF(); 4223 4224 Observer.changingInstr(MI); 4225 4226 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4227 Ty = getBitcastRegisterType(Ty); 4228 Helper.bitcastDst(MI, Ty, 0); 4229 Dst = MI.getOperand(0).getReg(); 4230 B.setInsertPt(B.getMBB(), MI); 4231 } 4232 4233 // FIXME: We don't really need this intermediate instruction. The intrinsic 4234 // should be fixed to have a memory operand. Since it's readnone, we're not 4235 // allowed to add one. 
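// Manufacture an invariant, dereferenceable load MMO sized to the result in
// bytes so later passes can still reason about the access.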
4236 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4237 MI.RemoveOperand(1); // Remove intrinsic ID 4238 4239 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4240 // TODO: Should this use datalayout alignment? 4241 const unsigned MemSize = (Size + 7) / 8; 4242 const Align MemAlign(4); 4243 MachineMemOperand *MMO = MF.getMachineMemOperand( 4244 MachinePointerInfo(), 4245 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4246 MachineMemOperand::MOInvariant, 4247 MemSize, MemAlign); 4248 MI.addMemOperand(MF, MMO); 4249 4250 // There are no 96-bit result scalar loads, but widening to 128-bit should 4251 // always be legal. We may need to restore this to a 96-bit result if it turns 4252 // out this needs to be converted to a vector load during RegBankSelect. 4253 if (!isPowerOf2_32(Size)) { 4254 if (Ty.isVector()) 4255 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4256 else 4257 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4258 } 4259 4260 Observer.changedInstr(MI); 4261 return true; 4262 } 4263 4264 // TODO: Move to selection 4265 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4266 MachineRegisterInfo &MRI, 4267 MachineIRBuilder &B) const { 4268 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4269 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4270 !ST.isTrapHandlerEnabled()) { 4271 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4272 } else { 4273 // Pass queue pointer to trap handler as input, and insert trap instruction 4274 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4275 MachineRegisterInfo &MRI = *B.getMRI(); 4276 4277 Register LiveIn = 4278 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4279 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4280 return false; 4281 4282 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4283 B.buildCopy(SGPR01, LiveIn); 4284 B.buildInstr(AMDGPU::S_TRAP) 4285 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4286 .addReg(SGPR01, RegState::Implicit); 4287 } 4288 4289 MI.eraseFromParent(); 4290 return true; 4291 } 4292 4293 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4294 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4295 // Is non-HSA path or trap-handler disabled? then, report a warning 4296 // accordingly 4297 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4298 !ST.isTrapHandlerEnabled()) { 4299 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4300 "debugtrap handler not supported", 4301 MI.getDebugLoc(), DS_Warning); 4302 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4303 Ctx.diagnose(NoTrap); 4304 } else { 4305 // Insert debug-trap instruction 4306 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4307 } 4308 4309 MI.eraseFromParent(); 4310 return true; 4311 } 4312 4313 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4314 MachineInstr &MI) const { 4315 MachineIRBuilder &B = Helper.MIRBuilder; 4316 MachineRegisterInfo &MRI = *B.getMRI(); 4317 4318 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
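// For amdgcn.if/else/loop, the G_BRCOND on the intrinsic's boolean result is
// rewritten into SI_IF/SI_ELSE/SI_LOOP: the pseudo branches to the original
// unconditional successor, and the surviving G_BR is retargeted at the
// original conditional destination, effectively swapping the two targets.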
4319 auto IntrID = MI.getIntrinsicID(); 4320 switch (IntrID) { 4321 case Intrinsic::amdgcn_if: 4322 case Intrinsic::amdgcn_else: { 4323 MachineInstr *Br = nullptr; 4324 MachineBasicBlock *UncondBrTarget = nullptr; 4325 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4326 const SIRegisterInfo *TRI 4327 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4328 4329 Register Def = MI.getOperand(1).getReg(); 4330 Register Use = MI.getOperand(3).getReg(); 4331 4332 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4333 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4334 if (IntrID == Intrinsic::amdgcn_if) { 4335 B.buildInstr(AMDGPU::SI_IF) 4336 .addDef(Def) 4337 .addUse(Use) 4338 .addMBB(UncondBrTarget); 4339 } else { 4340 B.buildInstr(AMDGPU::SI_ELSE) 4341 .addDef(Def) 4342 .addUse(Use) 4343 .addMBB(UncondBrTarget) 4344 .addImm(0); 4345 } 4346 4347 if (Br) { 4348 Br->getOperand(0).setMBB(CondBrTarget); 4349 } else { 4350 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4351 // since we're swapping branch targets it needs to be reinserted. 4352 // FIXME: IRTranslator should probably not do this 4353 B.buildBr(*CondBrTarget); 4354 } 4355 4356 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4357 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4358 MI.eraseFromParent(); 4359 BrCond->eraseFromParent(); 4360 return true; 4361 } 4362 4363 return false; 4364 } 4365 case Intrinsic::amdgcn_loop: { 4366 MachineInstr *Br = nullptr; 4367 MachineBasicBlock *UncondBrTarget = nullptr; 4368 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4369 const SIRegisterInfo *TRI 4370 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4371 4372 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4373 Register Reg = MI.getOperand(2).getReg(); 4374 4375 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4376 B.buildInstr(AMDGPU::SI_LOOP) 4377 .addUse(Reg) 4378 .addMBB(UncondBrTarget); 4379 4380 if (Br) 4381 Br->getOperand(0).setMBB(CondBrTarget); 4382 else 4383 B.buildBr(*CondBrTarget); 4384 4385 MI.eraseFromParent(); 4386 BrCond->eraseFromParent(); 4387 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4388 return true; 4389 } 4390 4391 return false; 4392 } 4393 case Intrinsic::amdgcn_kernarg_segment_ptr: 4394 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4395 // This only makes sense to call in a kernel, so just lower to null. 
4396 B.buildConstant(MI.getOperand(0).getReg(), 0); 4397 MI.eraseFromParent(); 4398 return true; 4399 } 4400 4401 return legalizePreloadedArgIntrin( 4402 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4403 case Intrinsic::amdgcn_implicitarg_ptr: 4404 return legalizeImplicitArgPtr(MI, MRI, B); 4405 case Intrinsic::amdgcn_workitem_id_x: 4406 return legalizePreloadedArgIntrin(MI, MRI, B, 4407 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4408 case Intrinsic::amdgcn_workitem_id_y: 4409 return legalizePreloadedArgIntrin(MI, MRI, B, 4410 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4411 case Intrinsic::amdgcn_workitem_id_z: 4412 return legalizePreloadedArgIntrin(MI, MRI, B, 4413 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4414 case Intrinsic::amdgcn_workgroup_id_x: 4415 return legalizePreloadedArgIntrin(MI, MRI, B, 4416 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4417 case Intrinsic::amdgcn_workgroup_id_y: 4418 return legalizePreloadedArgIntrin(MI, MRI, B, 4419 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4420 case Intrinsic::amdgcn_workgroup_id_z: 4421 return legalizePreloadedArgIntrin(MI, MRI, B, 4422 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4423 case Intrinsic::amdgcn_dispatch_ptr: 4424 return legalizePreloadedArgIntrin(MI, MRI, B, 4425 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4426 case Intrinsic::amdgcn_queue_ptr: 4427 return legalizePreloadedArgIntrin(MI, MRI, B, 4428 AMDGPUFunctionArgInfo::QUEUE_PTR); 4429 case Intrinsic::amdgcn_implicit_buffer_ptr: 4430 return legalizePreloadedArgIntrin( 4431 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4432 case Intrinsic::amdgcn_dispatch_id: 4433 return legalizePreloadedArgIntrin(MI, MRI, B, 4434 AMDGPUFunctionArgInfo::DISPATCH_ID); 4435 case Intrinsic::amdgcn_fdiv_fast: 4436 return legalizeFDIVFastIntrin(MI, MRI, B); 4437 case Intrinsic::amdgcn_is_shared: 4438 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4439 case Intrinsic::amdgcn_is_private: 4440 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4441 case Intrinsic::amdgcn_wavefrontsize: { 4442 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4443 MI.eraseFromParent(); 4444 return true; 4445 } 4446 case Intrinsic::amdgcn_s_buffer_load: 4447 return legalizeSBufferLoad(Helper, MI); 4448 case Intrinsic::amdgcn_raw_buffer_store: 4449 case Intrinsic::amdgcn_struct_buffer_store: 4450 return legalizeBufferStore(MI, MRI, B, false, false); 4451 case Intrinsic::amdgcn_raw_buffer_store_format: 4452 case Intrinsic::amdgcn_struct_buffer_store_format: 4453 return legalizeBufferStore(MI, MRI, B, false, true); 4454 case Intrinsic::amdgcn_raw_tbuffer_store: 4455 case Intrinsic::amdgcn_struct_tbuffer_store: 4456 return legalizeBufferStore(MI, MRI, B, true, true); 4457 case Intrinsic::amdgcn_raw_buffer_load: 4458 case Intrinsic::amdgcn_struct_buffer_load: 4459 return legalizeBufferLoad(MI, MRI, B, false, false); 4460 case Intrinsic::amdgcn_raw_buffer_load_format: 4461 case Intrinsic::amdgcn_struct_buffer_load_format: 4462 return legalizeBufferLoad(MI, MRI, B, true, false); 4463 case Intrinsic::amdgcn_raw_tbuffer_load: 4464 case Intrinsic::amdgcn_struct_tbuffer_load: 4465 return legalizeBufferLoad(MI, MRI, B, true, true); 4466 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4467 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4468 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4469 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4470 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4471 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4472 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4473 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4474 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4475 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4476 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4477 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4478 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4479 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4480 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4481 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4482 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4483 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4484 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4485 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4486 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4487 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4488 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4489 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4490 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4491 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4492 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4493 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4494 return legalizeBufferAtomic(MI, B, IntrID); 4495 case Intrinsic::amdgcn_atomic_inc: 4496 return legalizeAtomicIncDec(MI, B, true); 4497 case Intrinsic::amdgcn_atomic_dec: 4498 return legalizeAtomicIncDec(MI, B, false); 4499 case Intrinsic::trap: 4500 return legalizeTrapIntrinsic(MI, MRI, B); 4501 case Intrinsic::debugtrap: 4502 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4503 case Intrinsic::amdgcn_rsq_clamp: 4504 return legalizeRsqClampIntrinsic(MI, MRI, B); 4505 case Intrinsic::amdgcn_ds_fadd: 4506 case Intrinsic::amdgcn_ds_fmin: 4507 case Intrinsic::amdgcn_ds_fmax: 4508 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4509 default: { 4510 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4511 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4512 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4513 return true; 4514 } 4515 } 4516 4517 return true; 4518 } 4519