//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
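/// For example, <3 x s16> (48 bits) satisfies all of these conditions, while
/// <4 x s16> (even element count) and <3 x s32> (32-bit elements) do not.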
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
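/// For example, a 32-bit <4 x s8> load is bitcast to a plain s32 load (see
/// getBitcastRegisterType), since s8 is not a valid register element type.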
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 352 const unsigned MemSizeInBits) { 353 const unsigned Size = Ty.getSizeInBits(); 354 if (Size != MemSizeInBits) 355 return Size <= 32 && Ty.isVector(); 356 357 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 358 return true; 359 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 360 !isRegisterVectorElementType(Ty.getElementType()); 361 } 362 363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 364 const GCNTargetMachine &TM) 365 : ST(ST_) { 366 using namespace TargetOpcode; 367 368 auto GetAddrSpacePtr = [&TM](unsigned AS) { 369 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 370 }; 371 372 const LLT S1 = LLT::scalar(1); 373 const LLT S16 = LLT::scalar(16); 374 const LLT S32 = LLT::scalar(32); 375 const LLT S64 = LLT::scalar(64); 376 const LLT S128 = LLT::scalar(128); 377 const LLT S256 = LLT::scalar(256); 378 const LLT S512 = LLT::scalar(512); 379 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 380 381 const LLT V2S16 = LLT::vector(2, 16); 382 const LLT V4S16 = LLT::vector(4, 16); 383 384 const LLT V2S32 = LLT::vector(2, 32); 385 const LLT V3S32 = LLT::vector(3, 32); 386 const LLT V4S32 = LLT::vector(4, 32); 387 const LLT V5S32 = LLT::vector(5, 32); 388 const LLT V6S32 = LLT::vector(6, 32); 389 const LLT V7S32 = LLT::vector(7, 32); 390 const LLT V8S32 = LLT::vector(8, 32); 391 const LLT V9S32 = LLT::vector(9, 32); 392 const LLT V10S32 = LLT::vector(10, 32); 393 const LLT V11S32 = LLT::vector(11, 32); 394 const LLT V12S32 = LLT::vector(12, 32); 395 const LLT V13S32 = LLT::vector(13, 32); 396 const LLT V14S32 = LLT::vector(14, 32); 397 const LLT V15S32 = LLT::vector(15, 32); 398 const LLT V16S32 = LLT::vector(16, 32); 399 const LLT V32S32 = LLT::vector(32, 32); 400 401 const LLT V2S64 = LLT::vector(2, 64); 402 const LLT V3S64 = LLT::vector(3, 64); 403 const LLT V4S64 = LLT::vector(4, 64); 404 const LLT V5S64 = LLT::vector(5, 64); 405 const LLT V6S64 = LLT::vector(6, 64); 406 const LLT V7S64 = LLT::vector(7, 64); 407 const LLT V8S64 = LLT::vector(8, 64); 408 const LLT V16S64 = LLT::vector(16, 64); 409 410 std::initializer_list<LLT> AllS32Vectors = 411 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 412 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 413 std::initializer_list<LLT> AllS64Vectors = 414 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 415 416 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 417 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 418 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 419 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 420 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 421 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 422 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 423 424 const LLT CodePtr = FlatPtr; 425 426 const std::initializer_list<LLT> AddrSpaces64 = { 427 GlobalPtr, ConstantPtr, FlatPtr 428 }; 429 430 const std::initializer_list<LLT> AddrSpaces32 = { 431 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 432 }; 433 434 const std::initializer_list<LLT> FPTypesBase = { 435 S32, S64 436 }; 437 438 const std::initializer_list<LLT> FPTypes16 = { 439 S32, S64, S16 440 }; 441 442 const std::initializer_list<LLT> FPTypesPK16 = { 443 S32, S64, S16, V2S16 444 }; 445 446 const LLT MinScalarFPTy = ST.has16BitInsts() ? 
S16 : S32; 447 448 setAction({G_BRCOND, S1}, Legal); // VCC branches 449 setAction({G_BRCOND, S32}, Legal); // SCC branches 450 451 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 452 // elements for v3s16 453 getActionDefinitionsBuilder(G_PHI) 454 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 455 .legalFor(AllS32Vectors) 456 .legalFor(AllS64Vectors) 457 .legalFor(AddrSpaces64) 458 .legalFor(AddrSpaces32) 459 .legalIf(isPointer(0)) 460 .clampScalar(0, S16, S256) 461 .widenScalarToNextPow2(0, 32) 462 .clampMaxNumElements(0, S32, 16) 463 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 464 .scalarize(0); 465 466 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 467 // Full set of gfx9 features. 468 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 469 .legalFor({S32, S16, V2S16}) 470 .clampScalar(0, S16, S32) 471 .clampMaxNumElements(0, S16, 2) 472 .scalarize(0) 473 .widenScalarToNextPow2(0, 32); 474 475 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 476 .legalFor({S32, S16, V2S16}) // Clamp modifier 477 .minScalarOrElt(0, S16) 478 .clampMaxNumElements(0, S16, 2) 479 .scalarize(0) 480 .widenScalarToNextPow2(0, 32) 481 .lower(); 482 } else if (ST.has16BitInsts()) { 483 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 484 .legalFor({S32, S16}) 485 .clampScalar(0, S16, S32) 486 .scalarize(0) 487 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 488 489 // Technically the saturating operations require clamp bit support, but this 490 // was introduced at the same time as 16-bit operations. 491 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 492 .legalFor({S32, S16}) // Clamp modifier 493 .minScalar(0, S16) 494 .scalarize(0) 495 .widenScalarToNextPow2(0, 16) 496 .lower(); 497 498 // We're just lowering this, but it helps get a better result to try to 499 // coerce to the desired type first. 500 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 501 .minScalar(0, S16) 502 .scalarize(0) 503 .lower(); 504 } else { 505 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 506 .legalFor({S32}) 507 .clampScalar(0, S32, S32) 508 .scalarize(0); 509 510 if (ST.hasIntClamp()) { 511 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 512 .legalFor({S32}) // Clamp modifier. 513 .scalarize(0) 514 .minScalarOrElt(0, S32) 515 .lower(); 516 } else { 517 // Clamp bit support was added in VI, along with 16-bit operations. 518 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 519 .minScalar(0, S32) 520 .scalarize(0) 521 .lower(); 522 } 523 524 // FIXME: DAG expansion gets better results. The widening uses the smaller 525 // range values and goes for the min/max lowering directly. 526 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 527 .minScalar(0, S32) 528 .scalarize(0) 529 .lower(); 530 } 531 532 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 533 .customFor({S32, S64}) 534 .clampScalar(0, S32, S64) 535 .widenScalarToNextPow2(0, 32) 536 .scalarize(0); 537 538 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 539 .legalFor({S32}) 540 .clampScalar(0, S32, S32) 541 .scalarize(0); 542 543 // Report legal for any types we can handle anywhere. For the cases only legal 544 // on the SALU, RegBankSelect will be able to re-legalize. 
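  // For example, s64 and/or/xor is a single SALU instruction, but a divergent
  // s64 operation has to be split into two 32-bit VALU operations later.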
545 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 546 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 547 .clampScalar(0, S32, S64) 548 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 549 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 550 .widenScalarToNextPow2(0) 551 .scalarize(0); 552 553 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 554 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 555 .legalFor({{S32, S1}, {S32, S32}}) 556 .minScalar(0, S32) 557 // TODO: .scalarize(0) 558 .lower(); 559 560 getActionDefinitionsBuilder(G_BITCAST) 561 // Don't worry about the size constraint. 562 .legalIf(all(isRegisterType(0), isRegisterType(1))) 563 .lower(); 564 565 566 getActionDefinitionsBuilder(G_CONSTANT) 567 .legalFor({S1, S32, S64, S16, GlobalPtr, 568 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 569 .legalIf(isPointer(0)) 570 .clampScalar(0, S32, S64) 571 .widenScalarToNextPow2(0); 572 573 getActionDefinitionsBuilder(G_FCONSTANT) 574 .legalFor({S32, S64, S16}) 575 .clampScalar(0, S16, S64); 576 577 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 578 .legalIf(isRegisterType(0)) 579 // s1 and s16 are special cases because they have legal operations on 580 // them, but don't really occupy registers in the normal way. 581 .legalFor({S1, S16}) 582 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 583 .clampScalarOrElt(0, S32, MaxScalar) 584 .widenScalarToNextPow2(0, 32) 585 .clampMaxNumElements(0, S32, 16); 586 587 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 588 589 // If the amount is divergent, we have to do a wave reduction to get the 590 // maximum value, so this is expanded during RegBankSelect. 591 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 592 .legalFor({{PrivatePtr, S32}}); 593 594 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 595 .customIf(typeIsNot(0, PrivatePtr)); 596 597 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 598 599 auto &FPOpActions = getActionDefinitionsBuilder( 600 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 601 .legalFor({S32, S64}); 602 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 603 .customFor({S32, S64}); 604 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 605 .customFor({S32, S64}); 606 607 if (ST.has16BitInsts()) { 608 if (ST.hasVOP3PInsts()) 609 FPOpActions.legalFor({S16, V2S16}); 610 else 611 FPOpActions.legalFor({S16}); 612 613 TrigActions.customFor({S16}); 614 FDIVActions.customFor({S16}); 615 } 616 617 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 618 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 619 620 if (ST.hasVOP3PInsts()) { 621 MinNumMaxNum.customFor(FPTypesPK16) 622 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 623 .clampMaxNumElements(0, S16, 2) 624 .clampScalar(0, S16, S64) 625 .scalarize(0); 626 } else if (ST.has16BitInsts()) { 627 MinNumMaxNum.customFor(FPTypes16) 628 .clampScalar(0, S16, S64) 629 .scalarize(0); 630 } else { 631 MinNumMaxNum.customFor(FPTypesBase) 632 .clampScalar(0, S32, S64) 633 .scalarize(0); 634 } 635 636 if (ST.hasVOP3PInsts()) 637 FPOpActions.clampMaxNumElements(0, S16, 2); 638 639 FPOpActions 640 .scalarize(0) 641 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 642 643 TrigActions 644 .scalarize(0) 645 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 646 647 FDIVActions 648 .scalarize(0) 649 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 650 651 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 652 .legalFor(FPTypesPK16) 653 .clampMaxNumElements(0, S16, 2) 654 .scalarize(0) 655 .clampScalar(0, S16, S64); 656 657 if (ST.has16BitInsts()) { 658 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 659 .legalFor({S32, S64, S16}) 660 .scalarize(0) 661 .clampScalar(0, S16, S64); 662 } else { 663 getActionDefinitionsBuilder(G_FSQRT) 664 .legalFor({S32, S64}) 665 .scalarize(0) 666 .clampScalar(0, S32, S64); 667 668 if (ST.hasFractBug()) { 669 getActionDefinitionsBuilder(G_FFLOOR) 670 .customFor({S64}) 671 .legalFor({S32, S64}) 672 .scalarize(0) 673 .clampScalar(0, S32, S64); 674 } else { 675 getActionDefinitionsBuilder(G_FFLOOR) 676 .legalFor({S32, S64}) 677 .scalarize(0) 678 .clampScalar(0, S32, S64); 679 } 680 } 681 682 getActionDefinitionsBuilder(G_FPTRUNC) 683 .legalFor({{S32, S64}, {S16, S32}}) 684 .scalarize(0) 685 .lower(); 686 687 getActionDefinitionsBuilder(G_FPEXT) 688 .legalFor({{S64, S32}, {S32, S16}}) 689 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 690 .scalarize(0); 691 692 getActionDefinitionsBuilder(G_FSUB) 693 // Use actual fsub instruction 694 .legalFor({S32}) 695 // Must use fadd + fneg 696 .lowerFor({S64, S16, V2S16}) 697 .scalarize(0) 698 .clampScalar(0, S32, S64); 699 700 // Whether this is legal depends on the floating point mode for the function. 701 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 702 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 703 FMad.customFor({S32, S16}); 704 else if (ST.hasMadMacF32Insts()) 705 FMad.customFor({S32}); 706 else if (ST.hasMadF16()) 707 FMad.customFor({S16}); 708 FMad.scalarize(0) 709 .lower(); 710 711 auto &FRem = getActionDefinitionsBuilder(G_FREM); 712 if (ST.has16BitInsts()) { 713 FRem.customFor({S16, S32, S64}); 714 } else { 715 FRem.minScalar(0, S32) 716 .customFor({S32, S64}); 717 } 718 FRem.scalarize(0); 719 720 // TODO: Do we need to clamp maximum bitwidth? 721 getActionDefinitionsBuilder(G_TRUNC) 722 .legalIf(isScalar(0)) 723 .legalFor({{V2S16, V2S32}}) 724 .clampMaxNumElements(0, S16, 2) 725 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 726 // situations (like an invalid implicit use), we don't want to infinite loop 727 // in the legalizer. 728 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 729 .alwaysLegal(); 730 731 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 732 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 733 {S32, S1}, {S64, S1}, {S16, S1}}) 734 .scalarize(0) 735 .clampScalar(0, S32, S64) 736 .widenScalarToNextPow2(1, 32); 737 738 // TODO: Split s1->s64 during regbankselect for VALU. 
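  // Conversions involving a 64-bit floating point operand have no single
  // hardware instruction; the {S64, S64} cases are expanded by the custom
  // legalizeITOFP/legalizeFPTOI handlers below.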
739 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 740 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 741 .lowerFor({{S32, S64}}) 742 .lowerIf(typeIs(1, S1)) 743 .customFor({{S64, S64}}); 744 if (ST.has16BitInsts()) 745 IToFP.legalFor({{S16, S16}}); 746 IToFP.clampScalar(1, S32, S64) 747 .minScalar(0, S32) 748 .scalarize(0) 749 .widenScalarToNextPow2(1); 750 751 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 752 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 753 .customFor({{S64, S64}}) 754 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 755 if (ST.has16BitInsts()) 756 FPToI.legalFor({{S16, S16}}); 757 else 758 FPToI.minScalar(1, S32); 759 760 FPToI.minScalar(0, S32) 761 .scalarize(0) 762 .lower(); 763 764 // Lower roundeven into G_FRINT 765 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 766 .scalarize(0) 767 .lower(); 768 769 if (ST.has16BitInsts()) { 770 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 771 .legalFor({S16, S32, S64}) 772 .clampScalar(0, S16, S64) 773 .scalarize(0); 774 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 775 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 776 .legalFor({S32, S64}) 777 .clampScalar(0, S32, S64) 778 .scalarize(0); 779 } else { 780 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 781 .legalFor({S32}) 782 .customFor({S64}) 783 .clampScalar(0, S32, S64) 784 .scalarize(0); 785 } 786 787 getActionDefinitionsBuilder(G_PTR_ADD) 788 .legalIf(all(isPointer(0), sameSize(0, 1))) 789 .scalarize(0) 790 .scalarSameSizeAs(1, 0); 791 792 getActionDefinitionsBuilder(G_PTRMASK) 793 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 794 .scalarSameSizeAs(1, 0) 795 .scalarize(0); 796 797 auto &CmpBuilder = 798 getActionDefinitionsBuilder(G_ICMP) 799 // The compare output type differs based on the register bank of the output, 800 // so make both s1 and s32 legal. 801 // 802 // Scalar compares producing output in scc will be promoted to s32, as that 803 // is the allocatable register type that will be needed for the copy from 804 // scc. This will be promoted during RegBankSelect, and we assume something 805 // before that won't try to use s32 result types. 806 // 807 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 808 // bank. 809 .legalForCartesianProduct( 810 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 811 .legalForCartesianProduct( 812 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 813 if (ST.has16BitInsts()) { 814 CmpBuilder.legalFor({{S1, S16}}); 815 } 816 817 CmpBuilder 818 .widenScalarToNextPow2(1) 819 .clampScalar(1, S32, S64) 820 .scalarize(0) 821 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 822 823 getActionDefinitionsBuilder(G_FCMP) 824 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 825 .widenScalarToNextPow2(1) 826 .clampScalar(1, S32, S64) 827 .scalarize(0); 828 829 // FIXME: fpow has a selection pattern that should move to custom lowering. 
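  // Only the base-2 exp/log operations are legal directly; the natural and
  // base-10 variants and G_FPOW are expanded in terms of them during custom
  // legalization (see legalizeFlog/legalizeFExp/legalizeFPow).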
830 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 831 if (ST.has16BitInsts()) 832 Exp2Ops.legalFor({S32, S16}); 833 else 834 Exp2Ops.legalFor({S32}); 835 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 836 Exp2Ops.scalarize(0); 837 838 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 839 if (ST.has16BitInsts()) 840 ExpOps.customFor({{S32}, {S16}}); 841 else 842 ExpOps.customFor({S32}); 843 ExpOps.clampScalar(0, MinScalarFPTy, S32) 844 .scalarize(0); 845 846 getActionDefinitionsBuilder(G_FPOWI) 847 .clampScalar(0, MinScalarFPTy, S32) 848 .lower(); 849 850 // The 64-bit versions produce 32-bit results, but only on the SALU. 851 getActionDefinitionsBuilder(G_CTPOP) 852 .legalFor({{S32, S32}, {S32, S64}}) 853 .clampScalar(0, S32, S32) 854 .clampScalar(1, S32, S64) 855 .scalarize(0) 856 .widenScalarToNextPow2(0, 32) 857 .widenScalarToNextPow2(1, 32); 858 859 // The hardware instructions return a different result on 0 than the generic 860 // instructions expect. The hardware produces -1, but these produce the 861 // bitwidth. 862 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 863 .scalarize(0) 864 .clampScalar(0, S32, S32) 865 .clampScalar(1, S32, S64) 866 .widenScalarToNextPow2(0, 32) 867 .widenScalarToNextPow2(1, 32) 868 .lower(); 869 870 // The 64-bit versions produce 32-bit results, but only on the SALU. 871 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 872 .legalFor({{S32, S32}, {S32, S64}}) 873 .clampScalar(0, S32, S32) 874 .clampScalar(1, S32, S64) 875 .scalarize(0) 876 .widenScalarToNextPow2(0, 32) 877 .widenScalarToNextPow2(1, 32); 878 879 getActionDefinitionsBuilder(G_BITREVERSE) 880 .legalFor({S32}) 881 .clampScalar(0, S32, S32) 882 .scalarize(0); 883 884 if (ST.has16BitInsts()) { 885 getActionDefinitionsBuilder(G_BSWAP) 886 .legalFor({S16, S32, V2S16}) 887 .clampMaxNumElements(0, S16, 2) 888 // FIXME: Fixing non-power-of-2 before clamp is workaround for 889 // narrowScalar limitation. 890 .widenScalarToNextPow2(0) 891 .clampScalar(0, S16, S32) 892 .scalarize(0); 893 894 if (ST.hasVOP3PInsts()) { 895 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 896 .legalFor({S32, S16, V2S16}) 897 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 898 .clampMaxNumElements(0, S16, 2) 899 .minScalar(0, S16) 900 .widenScalarToNextPow2(0) 901 .scalarize(0) 902 .lower(); 903 } else { 904 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 905 .legalFor({S32, S16}) 906 .widenScalarToNextPow2(0) 907 .minScalar(0, S16) 908 .scalarize(0) 909 .lower(); 910 } 911 } else { 912 // TODO: Should have same legality without v_perm_b32 913 getActionDefinitionsBuilder(G_BSWAP) 914 .legalFor({S32}) 915 .lowerIf(scalarNarrowerThan(0, 32)) 916 // FIXME: Fixing non-power-of-2 before clamp is workaround for 917 // narrowScalar limitation. 
918 .widenScalarToNextPow2(0) 919 .maxScalar(0, S32) 920 .scalarize(0) 921 .lower(); 922 923 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 924 .legalFor({S32}) 925 .minScalar(0, S32) 926 .widenScalarToNextPow2(0) 927 .scalarize(0) 928 .lower(); 929 } 930 931 getActionDefinitionsBuilder(G_INTTOPTR) 932 // List the common cases 933 .legalForCartesianProduct(AddrSpaces64, {S64}) 934 .legalForCartesianProduct(AddrSpaces32, {S32}) 935 .scalarize(0) 936 // Accept any address space as long as the size matches 937 .legalIf(sameSize(0, 1)) 938 .widenScalarIf(smallerThan(1, 0), 939 [](const LegalityQuery &Query) { 940 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 941 }) 942 .narrowScalarIf(largerThan(1, 0), 943 [](const LegalityQuery &Query) { 944 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 945 }); 946 947 getActionDefinitionsBuilder(G_PTRTOINT) 948 // List the common cases 949 .legalForCartesianProduct(AddrSpaces64, {S64}) 950 .legalForCartesianProduct(AddrSpaces32, {S32}) 951 .scalarize(0) 952 // Accept any address space as long as the size matches 953 .legalIf(sameSize(0, 1)) 954 .widenScalarIf(smallerThan(0, 1), 955 [](const LegalityQuery &Query) { 956 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 957 }) 958 .narrowScalarIf( 959 largerThan(0, 1), 960 [](const LegalityQuery &Query) { 961 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 962 }); 963 964 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 965 .scalarize(0) 966 .custom(); 967 968 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 969 bool IsLoad) -> bool { 970 const LLT DstTy = Query.Types[0]; 971 972 // Split vector extloads. 973 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 974 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 975 976 if (MemSize < DstTy.getSizeInBits()) 977 MemSize = std::max(MemSize, AlignBits); 978 979 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 980 return true; 981 982 const LLT PtrTy = Query.Types[1]; 983 unsigned AS = PtrTy.getAddressSpace(); 984 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 985 return true; 986 987 // Catch weird sized loads that don't evenly divide into the access sizes 988 // TODO: May be able to widen depending on alignment etc. 989 unsigned NumRegs = (MemSize + 31) / 32; 990 if (NumRegs == 3) { 991 if (!ST.hasDwordx3LoadStores()) 992 return true; 993 } else { 994 // If the alignment allows, these should have been widened. 995 if (!isPowerOf2_32(NumRegs)) 996 return true; 997 } 998 999 if (AlignBits < MemSize) { 1000 const SITargetLowering *TLI = ST.getTargetLowering(); 1001 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1002 Align(AlignBits / 8)); 1003 } 1004 1005 return false; 1006 }; 1007 1008 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 1009 unsigned Opc) -> bool { 1010 unsigned Size = Query.Types[0].getSizeInBits(); 1011 if (isPowerOf2_32(Size)) 1012 return false; 1013 1014 if (Size == 96 && ST.hasDwordx3LoadStores()) 1015 return false; 1016 1017 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 1018 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 1019 return false; 1020 1021 unsigned Align = Query.MMODescrs[0].AlignInBits; 1022 unsigned RoundedSize = NextPowerOf2(Size); 1023 return (Align >= RoundedSize); 1024 }; 1025 1026 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 1027 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 
0 : 16; 1028 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 1029 1030 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1031 // LDS 1032 // TODO: Unsupported flat for SI. 1033 1034 for (unsigned Op : {G_LOAD, G_STORE}) { 1035 const bool IsStore = Op == G_STORE; 1036 1037 auto &Actions = getActionDefinitionsBuilder(Op); 1038 // Explicitly list some common cases. 1039 // TODO: Does this help compile time at all? 1040 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1041 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1042 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1043 {S64, GlobalPtr, 64, GlobalAlign32}, 1044 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1045 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1046 {S32, GlobalPtr, 8, GlobalAlign8}, 1047 {S32, GlobalPtr, 16, GlobalAlign16}, 1048 1049 {S32, LocalPtr, 32, 32}, 1050 {S64, LocalPtr, 64, 32}, 1051 {V2S32, LocalPtr, 64, 32}, 1052 {S32, LocalPtr, 8, 8}, 1053 {S32, LocalPtr, 16, 16}, 1054 {V2S16, LocalPtr, 32, 32}, 1055 1056 {S32, PrivatePtr, 32, 32}, 1057 {S32, PrivatePtr, 8, 8}, 1058 {S32, PrivatePtr, 16, 16}, 1059 {V2S16, PrivatePtr, 32, 32}, 1060 1061 {S32, ConstantPtr, 32, GlobalAlign32}, 1062 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1063 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1064 {S64, ConstantPtr, 64, GlobalAlign32}, 1065 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1066 Actions.legalIf( 1067 [=](const LegalityQuery &Query) -> bool { 1068 return isLoadStoreLegal(ST, Query, Op); 1069 }); 1070 1071 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1072 // 64-bits. 1073 // 1074 // TODO: Should generalize bitcast action into coerce, which will also cover 1075 // inserting addrspacecasts. 1076 Actions.customIf(typeIs(1, Constant32Ptr)); 1077 1078 // Turn any illegal element vectors into something easier to deal 1079 // with. These will ultimately produce 32-bit scalar shifts to extract the 1080 // parts anyway. 1081 // 1082 // For odd 16-bit element vectors, prefer to split those into pieces with 1083 // 16-bit vector parts. 1084 Actions.bitcastIf( 1085 [=](const LegalityQuery &Query) -> bool { 1086 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1087 Query.MMODescrs[0].SizeInBits); 1088 }, bitcastToRegisterType(0)); 1089 1090 Actions 1091 .customIf(typeIs(1, Constant32Ptr)) 1092 // Widen suitably aligned loads by loading extra elements. 1093 .moreElementsIf([=](const LegalityQuery &Query) { 1094 const LLT Ty = Query.Types[0]; 1095 return Op == G_LOAD && Ty.isVector() && 1096 shouldWidenLoadResult(Query, Op); 1097 }, moreElementsToNextPow2(0)) 1098 .widenScalarIf([=](const LegalityQuery &Query) { 1099 const LLT Ty = Query.Types[0]; 1100 return Op == G_LOAD && !Ty.isVector() && 1101 shouldWidenLoadResult(Query, Op); 1102 }, widenScalarOrEltToNextPow2(0)) 1103 .narrowScalarIf( 1104 [=](const LegalityQuery &Query) -> bool { 1105 return !Query.Types[0].isVector() && 1106 needToSplitMemOp(Query, Op == G_LOAD); 1107 }, 1108 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1109 const LLT DstTy = Query.Types[0]; 1110 const LLT PtrTy = Query.Types[1]; 1111 1112 const unsigned DstSize = DstTy.getSizeInBits(); 1113 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1114 1115 // Split extloads. 1116 if (DstSize > MemSize) 1117 return std::make_pair(0, LLT::scalar(MemSize)); 1118 1119 if (!isPowerOf2_32(DstSize)) { 1120 // We're probably decomposing an odd sized store. Try to split 1121 // to the widest type. TODO: Account for alignment. 
As-is it 1122 // should be OK, since the new parts will be further legalized. 1123 unsigned FloorSize = PowerOf2Floor(DstSize); 1124 return std::make_pair(0, LLT::scalar(FloorSize)); 1125 } 1126 1127 if (DstSize > 32 && (DstSize % 32 != 0)) { 1128 // FIXME: Need a way to specify non-extload of larger size if 1129 // suitably aligned. 1130 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1131 } 1132 1133 unsigned MaxSize = maxSizeForAddrSpace(ST, 1134 PtrTy.getAddressSpace(), 1135 Op == G_LOAD); 1136 if (MemSize > MaxSize) 1137 return std::make_pair(0, LLT::scalar(MaxSize)); 1138 1139 unsigned Align = Query.MMODescrs[0].AlignInBits; 1140 return std::make_pair(0, LLT::scalar(Align)); 1141 }) 1142 .fewerElementsIf( 1143 [=](const LegalityQuery &Query) -> bool { 1144 return Query.Types[0].isVector() && 1145 needToSplitMemOp(Query, Op == G_LOAD); 1146 }, 1147 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1148 const LLT DstTy = Query.Types[0]; 1149 const LLT PtrTy = Query.Types[1]; 1150 1151 LLT EltTy = DstTy.getElementType(); 1152 unsigned MaxSize = maxSizeForAddrSpace(ST, 1153 PtrTy.getAddressSpace(), 1154 Op == G_LOAD); 1155 1156 // FIXME: Handle widened to power of 2 results better. This ends 1157 // up scalarizing. 1158 // FIXME: 3 element stores scalarized on SI 1159 1160 // Split if it's too large for the address space. 1161 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1162 unsigned NumElts = DstTy.getNumElements(); 1163 unsigned EltSize = EltTy.getSizeInBits(); 1164 1165 if (MaxSize % EltSize == 0) { 1166 return std::make_pair( 1167 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1168 } 1169 1170 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1171 1172 // FIXME: Refine when odd breakdowns handled 1173 // The scalars will need to be re-legalized. 1174 if (NumPieces == 1 || NumPieces >= NumElts || 1175 NumElts % NumPieces != 0) 1176 return std::make_pair(0, EltTy); 1177 1178 return std::make_pair(0, 1179 LLT::vector(NumElts / NumPieces, EltTy)); 1180 } 1181 1182 // FIXME: We could probably handle weird extending loads better. 1183 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1184 if (DstTy.getSizeInBits() > MemSize) 1185 return std::make_pair(0, EltTy); 1186 1187 unsigned EltSize = EltTy.getSizeInBits(); 1188 unsigned DstSize = DstTy.getSizeInBits(); 1189 if (!isPowerOf2_32(DstSize)) { 1190 // We're probably decomposing an odd sized store. Try to split 1191 // to the widest type. TODO: Account for alignment. As-is it 1192 // should be OK, since the new parts will be further legalized. 1193 unsigned FloorSize = PowerOf2Floor(DstSize); 1194 return std::make_pair( 1195 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1196 } 1197 1198 // Need to split because of alignment. 1199 unsigned Align = Query.MMODescrs[0].AlignInBits; 1200 if (EltSize > Align && 1201 (EltSize / Align < DstTy.getNumElements())) { 1202 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1203 } 1204 1205 // May need relegalization for the scalars. 1206 return std::make_pair(0, EltTy); 1207 }) 1208 .minScalar(0, S32); 1209 1210 if (IsStore) 1211 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1212 1213 // TODO: Need a bitcast lower option? 
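    // Catch any remaining cases by rounding scalars up to a power of 2 and
    // padding small vectors out to at least 32 bits, e.g. <2 x s8> becomes
    // <4 x s8> via moreEltsToNext32Bit.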
1214 Actions 1215 .widenScalarToNextPow2(0) 1216 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1217 } 1218 1219 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1220 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1221 {S32, GlobalPtr, 16, 2 * 8}, 1222 {S32, LocalPtr, 8, 8}, 1223 {S32, LocalPtr, 16, 16}, 1224 {S32, PrivatePtr, 8, 8}, 1225 {S32, PrivatePtr, 16, 16}, 1226 {S32, ConstantPtr, 8, 8}, 1227 {S32, ConstantPtr, 16, 2 * 8}}); 1228 if (ST.hasFlatAddressSpace()) { 1229 ExtLoads.legalForTypesWithMemDesc( 1230 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1231 } 1232 1233 ExtLoads.clampScalar(0, S32, S32) 1234 .widenScalarToNextPow2(0) 1235 .unsupportedIfMemSizeNotPow2() 1236 .lower(); 1237 1238 auto &Atomics = getActionDefinitionsBuilder( 1239 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1240 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1241 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1242 G_ATOMICRMW_UMIN}) 1243 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1244 {S64, GlobalPtr}, {S64, LocalPtr}, 1245 {S32, RegionPtr}, {S64, RegionPtr}}); 1246 if (ST.hasFlatAddressSpace()) { 1247 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1248 } 1249 1250 if (ST.hasLDSFPAtomics()) { 1251 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1252 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1253 } 1254 1255 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1256 // demarshalling 1257 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1258 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1259 {S32, FlatPtr}, {S64, FlatPtr}}) 1260 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1261 {S32, RegionPtr}, {S64, RegionPtr}}); 1262 // TODO: Pointer types, any 32-bit or 64-bit vector 1263 1264 // Condition should be s32 for scalar, s1 for vector. 1265 getActionDefinitionsBuilder(G_SELECT) 1266 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1267 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1268 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1269 .clampScalar(0, S16, S64) 1270 .scalarize(1) 1271 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1272 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1273 .clampMaxNumElements(0, S32, 2) 1274 .clampMaxNumElements(0, LocalPtr, 2) 1275 .clampMaxNumElements(0, PrivatePtr, 2) 1276 .scalarize(0) 1277 .widenScalarToNextPow2(0) 1278 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1279 1280 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1281 // be more flexible with the shift amount type. 1282 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1283 .legalFor({{S32, S32}, {S64, S32}}); 1284 if (ST.has16BitInsts()) { 1285 if (ST.hasVOP3PInsts()) { 1286 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1287 .clampMaxNumElements(0, S16, 2); 1288 } else 1289 Shifts.legalFor({{S16, S16}}); 1290 1291 // TODO: Support 16-bit shift amounts for all types 1292 Shifts.widenScalarIf( 1293 [=](const LegalityQuery &Query) { 1294 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1295 // 32-bit amount. 
1296 const LLT ValTy = Query.Types[0]; 1297 const LLT AmountTy = Query.Types[1]; 1298 return ValTy.getSizeInBits() <= 16 && 1299 AmountTy.getSizeInBits() < 16; 1300 }, changeTo(1, S16)); 1301 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1302 Shifts.clampScalar(1, S32, S32); 1303 Shifts.clampScalar(0, S16, S64); 1304 Shifts.widenScalarToNextPow2(0, 16); 1305 1306 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1307 .minScalar(0, S16) 1308 .scalarize(0) 1309 .lower(); 1310 } else { 1311 // Make sure we legalize the shift amount type first, as the general 1312 // expansion for the shifted type will produce much worse code if it hasn't 1313 // been truncated already. 1314 Shifts.clampScalar(1, S32, S32); 1315 Shifts.clampScalar(0, S32, S64); 1316 Shifts.widenScalarToNextPow2(0, 32); 1317 1318 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1319 .minScalar(0, S32) 1320 .scalarize(0) 1321 .lower(); 1322 } 1323 Shifts.scalarize(0); 1324 1325 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1326 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1327 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1328 unsigned IdxTypeIdx = 2; 1329 1330 getActionDefinitionsBuilder(Op) 1331 .customIf([=](const LegalityQuery &Query) { 1332 const LLT EltTy = Query.Types[EltTypeIdx]; 1333 const LLT VecTy = Query.Types[VecTypeIdx]; 1334 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1335 const unsigned EltSize = EltTy.getSizeInBits(); 1336 return (EltSize == 32 || EltSize == 64) && 1337 VecTy.getSizeInBits() % 32 == 0 && 1338 VecTy.getSizeInBits() <= MaxRegisterSize && 1339 IdxTy.getSizeInBits() == 32; 1340 }) 1341 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1342 bitcastToVectorElement32(VecTypeIdx)) 1343 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1344 .bitcastIf( 1345 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1346 [=](const LegalityQuery &Query) { 1347 // For > 64-bit element types, try to turn this into a 64-bit 1348 // element vector since we may be able to do better indexing 1349 // if this is scalar. If not, fall back to 32. 1350 const LLT EltTy = Query.Types[EltTypeIdx]; 1351 const LLT VecTy = Query.Types[VecTypeIdx]; 1352 const unsigned DstEltSize = EltTy.getSizeInBits(); 1353 const unsigned VecSize = VecTy.getSizeInBits(); 1354 1355 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1356 return std::make_pair( 1357 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1358 }) 1359 .clampScalar(EltTypeIdx, S32, S64) 1360 .clampScalar(VecTypeIdx, S32, S64) 1361 .clampScalar(IdxTypeIdx, S32, S32) 1362 .clampMaxNumElements(1, S32, 32) 1363 // TODO: Clamp elements for 64-bit vectors? 1364 // It should only be necessary with variable indexes. 1365 // As a last resort, lower to the stack 1366 .lower(); 1367 } 1368 1369 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1370 .unsupportedIf([=](const LegalityQuery &Query) { 1371 const LLT &EltTy = Query.Types[1].getElementType(); 1372 return Query.Types[0] != EltTy; 1373 }); 1374 1375 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1376 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1377 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1378 1379 // FIXME: Doesn't handle extract of illegal sizes. 1380 getActionDefinitionsBuilder(Op) 1381 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1382 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
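    // e.g. a G_SEXT_INREG from 8 bits is expanded as a 32-bit shl/ashr pair
    // instead of a chain of s16 operations.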
1555 SextInReg.lowerFor({{S32}, {S64}}); 1556 } 1557 1558 SextInReg 1559 .scalarize(0) 1560 .clampScalar(0, S32, S64) 1561 .lower(); 1562 1563 getActionDefinitionsBuilder(G_FSHR) 1564 .legalFor({{S32, S32}}) 1565 .scalarize(0) 1566 .lower(); 1567 1568 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1569 .legalFor({S64}); 1570 1571 getActionDefinitionsBuilder(G_FENCE) 1572 .alwaysLegal(); 1573 1574 getActionDefinitionsBuilder({ 1575 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1576 G_FCOPYSIGN, 1577 1578 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1579 G_ATOMICRMW_NAND, 1580 G_ATOMICRMW_FSUB, 1581 G_READ_REGISTER, 1582 G_WRITE_REGISTER, 1583 1584 G_SADDO, G_SSUBO, 1585 1586 // TODO: Implement 1587 G_FMINIMUM, G_FMAXIMUM, 1588 G_FSHL 1589 }).lower(); 1590 1591 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1592 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1593 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1594 .unsupported(); 1595 1596 computeTables(); 1597 verify(*ST.getInstrInfo()); 1598 } 1599 1600 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1601 MachineInstr &MI) const { 1602 MachineIRBuilder &B = Helper.MIRBuilder; 1603 MachineRegisterInfo &MRI = *B.getMRI(); 1604 1605 switch (MI.getOpcode()) { 1606 case TargetOpcode::G_ADDRSPACE_CAST: 1607 return legalizeAddrSpaceCast(MI, MRI, B); 1608 case TargetOpcode::G_FRINT: 1609 return legalizeFrint(MI, MRI, B); 1610 case TargetOpcode::G_FCEIL: 1611 return legalizeFceil(MI, MRI, B); 1612 case TargetOpcode::G_FREM: 1613 return legalizeFrem(MI, MRI, B); 1614 case TargetOpcode::G_INTRINSIC_TRUNC: 1615 return legalizeIntrinsicTrunc(MI, MRI, B); 1616 case TargetOpcode::G_SITOFP: 1617 return legalizeITOFP(MI, MRI, B, true); 1618 case TargetOpcode::G_UITOFP: 1619 return legalizeITOFP(MI, MRI, B, false); 1620 case TargetOpcode::G_FPTOSI: 1621 return legalizeFPTOI(MI, MRI, B, true); 1622 case TargetOpcode::G_FPTOUI: 1623 return legalizeFPTOI(MI, MRI, B, false); 1624 case TargetOpcode::G_FMINNUM: 1625 case TargetOpcode::G_FMAXNUM: 1626 case TargetOpcode::G_FMINNUM_IEEE: 1627 case TargetOpcode::G_FMAXNUM_IEEE: 1628 return legalizeMinNumMaxNum(Helper, MI); 1629 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1630 return legalizeExtractVectorElt(MI, MRI, B); 1631 case TargetOpcode::G_INSERT_VECTOR_ELT: 1632 return legalizeInsertVectorElt(MI, MRI, B); 1633 case TargetOpcode::G_SHUFFLE_VECTOR: 1634 return legalizeShuffleVector(MI, MRI, B); 1635 case TargetOpcode::G_FSIN: 1636 case TargetOpcode::G_FCOS: 1637 return legalizeSinCos(MI, MRI, B); 1638 case TargetOpcode::G_GLOBAL_VALUE: 1639 return legalizeGlobalValue(MI, MRI, B); 1640 case TargetOpcode::G_LOAD: 1641 return legalizeLoad(Helper, MI); 1642 case TargetOpcode::G_FMAD: 1643 return legalizeFMad(MI, MRI, B); 1644 case TargetOpcode::G_FDIV: 1645 return legalizeFDIV(MI, MRI, B); 1646 case TargetOpcode::G_UDIV: 1647 case TargetOpcode::G_UREM: 1648 return legalizeUDIV_UREM(MI, MRI, B); 1649 case TargetOpcode::G_SDIV: 1650 case TargetOpcode::G_SREM: 1651 return legalizeSDIV_SREM(MI, MRI, B); 1652 case TargetOpcode::G_ATOMIC_CMPXCHG: 1653 return legalizeAtomicCmpXChg(MI, MRI, B); 1654 case TargetOpcode::G_FLOG: 1655 return legalizeFlog(MI, B, numbers::ln2f); 1656 case TargetOpcode::G_FLOG10: 1657 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1658 case TargetOpcode::G_FEXP: 1659 return legalizeFExp(MI, B); 1660 case TargetOpcode::G_FPOW: 1661 return legalizeFPow(MI, B); 1662 case TargetOpcode::G_FFLOOR: 1663 return legalizeFFloor(MI, MRI, B); 1664 case 
TargetOpcode::G_BUILD_VECTOR: 1665 return legalizeBuildVector(MI, MRI, B); 1666 default: 1667 return false; 1668 } 1669 1670 llvm_unreachable("expected switch to return"); 1671 } 1672 1673 Register AMDGPULegalizerInfo::getSegmentAperture( 1674 unsigned AS, 1675 MachineRegisterInfo &MRI, 1676 MachineIRBuilder &B) const { 1677 MachineFunction &MF = B.getMF(); 1678 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1679 const LLT S32 = LLT::scalar(32); 1680 1681 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1682 1683 if (ST.hasApertureRegs()) { 1684 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1685 // getreg. 1686 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1687 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1688 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1689 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1690 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1691 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1692 unsigned Encoding = 1693 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1694 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1695 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1696 1697 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1698 1699 B.buildInstr(AMDGPU::S_GETREG_B32) 1700 .addDef(GetReg) 1701 .addImm(Encoding); 1702 MRI.setType(GetReg, S32); 1703 1704 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1705 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1706 } 1707 1708 Register QueuePtr = MRI.createGenericVirtualRegister( 1709 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1710 1711 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1712 return Register(); 1713 1714 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1715 // private_segment_aperture_base_hi. 1716 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1717 1718 // TODO: can we be smarter about machine pointer info? 1719 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1720 MachineMemOperand *MMO = MF.getMachineMemOperand( 1721 PtrInfo, 1722 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1723 MachineMemOperand::MOInvariant, 1724 4, commonAlignment(Align(64), StructOffset)); 1725 1726 Register LoadAddr; 1727 1728 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1729 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1730 } 1731 1732 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1733 MachineInstr &MI, MachineRegisterInfo &MRI, 1734 MachineIRBuilder &B) const { 1735 MachineFunction &MF = B.getMF(); 1736 1737 const LLT S32 = LLT::scalar(32); 1738 Register Dst = MI.getOperand(0).getReg(); 1739 Register Src = MI.getOperand(1).getReg(); 1740 1741 LLT DstTy = MRI.getType(Dst); 1742 LLT SrcTy = MRI.getType(Src); 1743 unsigned DestAS = DstTy.getAddressSpace(); 1744 unsigned SrcAS = SrcTy.getAddressSpace(); 1745 1746 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1747 // vector element. 1748 assert(!DstTy.isVector()); 1749 1750 const AMDGPUTargetMachine &TM 1751 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1752 1753 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1754 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1755 return true; 1756 } 1757 1758 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1759 // Truncate. 
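    // Casting into the 32-bit constant address space keeps only the low 32
    // bits of the source pointer; the reverse cast below reattaches the known
    // high bits from the function info.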
1760 B.buildExtract(Dst, Src, 0); 1761 MI.eraseFromParent(); 1762 return true; 1763 } 1764 1765 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1766 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1767 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1768 1769 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1770 // another. Merge operands are required to be the same type, but creating an 1771 // extra ptrtoint would be kind of pointless. 1772 auto HighAddr = B.buildConstant( 1773 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1774 B.buildMerge(Dst, {Src, HighAddr}); 1775 MI.eraseFromParent(); 1776 return true; 1777 } 1778 1779 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1780 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1781 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1782 unsigned NullVal = TM.getNullPointerValue(DestAS); 1783 1784 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1785 auto FlatNull = B.buildConstant(SrcTy, 0); 1786 1787 // Extract low 32-bits of the pointer. 1788 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1789 1790 auto CmpRes = 1791 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1792 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1793 1794 MI.eraseFromParent(); 1795 return true; 1796 } 1797 1798 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1799 return false; 1800 1801 if (!ST.hasFlatAddressSpace()) 1802 return false; 1803 1804 auto SegmentNull = 1805 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1806 auto FlatNull = 1807 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1808 1809 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1810 if (!ApertureReg.isValid()) 1811 return false; 1812 1813 auto CmpRes = 1814 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1815 1816 // Coerce the type of the low half of the result so we can use merge_values. 1817 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1818 1819 // TODO: Should we allow mismatched types but matching sizes in merges to 1820 // avoid the ptrtoint? 1821 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1822 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1823 1824 MI.eraseFromParent(); 1825 return true; 1826 } 1827 1828 bool AMDGPULegalizerInfo::legalizeFrint( 1829 MachineInstr &MI, MachineRegisterInfo &MRI, 1830 MachineIRBuilder &B) const { 1831 Register Src = MI.getOperand(1).getReg(); 1832 LLT Ty = MRI.getType(Src); 1833 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1834 1835 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1836 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1837 1838 auto C1 = B.buildFConstant(Ty, C1Val); 1839 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1840 1841 // TODO: Should this propagate fast-math-flags? 
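  // Classic 2^52 trick: adding and then subtracting copysign(2^52, src)
  // pushes the fraction bits out of the significand, so Tmp2 is src rounded
  // to an integer (nearest-even in the default mode). The magnitude check
  // against 0x1.fffffffffffffp+51 below leaves sources that are already too
  // large to have a fractional part unchanged.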
1842 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1843 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1844 1845 auto C2 = B.buildFConstant(Ty, C2Val); 1846 auto Fabs = B.buildFAbs(Ty, Src); 1847 1848 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1849 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1850 MI.eraseFromParent(); 1851 return true; 1852 } 1853 1854 bool AMDGPULegalizerInfo::legalizeFceil( 1855 MachineInstr &MI, MachineRegisterInfo &MRI, 1856 MachineIRBuilder &B) const { 1857 1858 const LLT S1 = LLT::scalar(1); 1859 const LLT S64 = LLT::scalar(64); 1860 1861 Register Src = MI.getOperand(1).getReg(); 1862 assert(MRI.getType(Src) == S64); 1863 1864 // result = trunc(src) 1865 // if (src > 0.0 && src != result) 1866 // result += 1.0 1867 1868 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1869 1870 const auto Zero = B.buildFConstant(S64, 0.0); 1871 const auto One = B.buildFConstant(S64, 1.0); 1872 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1873 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1874 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1875 auto Add = B.buildSelect(S64, And, One, Zero); 1876 1877 // TODO: Should this propagate fast-math-flags? 1878 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1879 return true; 1880 } 1881 1882 bool AMDGPULegalizerInfo::legalizeFrem( 1883 MachineInstr &MI, MachineRegisterInfo &MRI, 1884 MachineIRBuilder &B) const { 1885 Register DstReg = MI.getOperand(0).getReg(); 1886 Register Src0Reg = MI.getOperand(1).getReg(); 1887 Register Src1Reg = MI.getOperand(2).getReg(); 1888 auto Flags = MI.getFlags(); 1889 LLT Ty = MRI.getType(DstReg); 1890 1891 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1892 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1893 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1894 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1895 MI.eraseFromParent(); 1896 return true; 1897 } 1898 1899 static MachineInstrBuilder extractF64Exponent(Register Hi, 1900 MachineIRBuilder &B) { 1901 const unsigned FractBits = 52; 1902 const unsigned ExpBits = 11; 1903 LLT S32 = LLT::scalar(32); 1904 1905 auto Const0 = B.buildConstant(S32, FractBits - 32); 1906 auto Const1 = B.buildConstant(S32, ExpBits); 1907 1908 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1909 .addUse(Hi) 1910 .addUse(Const0.getReg(0)) 1911 .addUse(Const1.getReg(0)); 1912 1913 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1914 } 1915 1916 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1917 MachineInstr &MI, MachineRegisterInfo &MRI, 1918 MachineIRBuilder &B) const { 1919 const LLT S1 = LLT::scalar(1); 1920 const LLT S32 = LLT::scalar(32); 1921 const LLT S64 = LLT::scalar(64); 1922 1923 Register Src = MI.getOperand(1).getReg(); 1924 assert(MRI.getType(Src) == S64); 1925 1926 // TODO: Should this use extract since the low half is unused? 1927 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1928 Register Hi = Unmerge.getReg(1); 1929 1930 // Extract the upper half, since this is where we will find the sign and 1931 // exponent. 1932 auto Exp = extractF64Exponent(Hi, B); 1933 1934 const unsigned FractBits = 52; 1935 1936 // Extract the sign bit. 1937 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1938 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1939 1940 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1941 1942 const auto Zero32 = B.buildConstant(S32, 0); 1943 1944 // Extend back to 64-bits. 
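  // FractMask shifted right by the unbiased exponent has set bits exactly
  // where the sub-integer fraction bits of Src live; AND-ing Src with the
  // complement (Not/Tmp0 below) clears them, truncating toward zero.
  // Exponents < 0 produce a signed zero (SignBit64), and exponents > 51 mean
  // Src is already an integer.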
1945 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1946 1947 auto Shr = B.buildAShr(S64, FractMask, Exp); 1948 auto Not = B.buildNot(S64, Shr); 1949 auto Tmp0 = B.buildAnd(S64, Src, Not); 1950 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1951 1952 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1953 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1954 1955 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1956 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1957 MI.eraseFromParent(); 1958 return true; 1959 } 1960 1961 bool AMDGPULegalizerInfo::legalizeITOFP( 1962 MachineInstr &MI, MachineRegisterInfo &MRI, 1963 MachineIRBuilder &B, bool Signed) const { 1964 1965 Register Dst = MI.getOperand(0).getReg(); 1966 Register Src = MI.getOperand(1).getReg(); 1967 1968 const LLT S64 = LLT::scalar(64); 1969 const LLT S32 = LLT::scalar(32); 1970 1971 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1972 1973 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1974 1975 auto CvtHi = Signed ? 1976 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1977 B.buildUITOFP(S64, Unmerge.getReg(1)); 1978 1979 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1980 1981 auto ThirtyTwo = B.buildConstant(S32, 32); 1982 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1983 .addUse(CvtHi.getReg(0)) 1984 .addUse(ThirtyTwo.getReg(0)); 1985 1986 // TODO: Should this propagate fast-math-flags? 1987 B.buildFAdd(Dst, LdExp, CvtLo); 1988 MI.eraseFromParent(); 1989 return true; 1990 } 1991 1992 // TODO: Copied from DAG implementation. Verify logic and document how this 1993 // actually works. 1994 bool AMDGPULegalizerInfo::legalizeFPTOI( 1995 MachineInstr &MI, MachineRegisterInfo &MRI, 1996 MachineIRBuilder &B, bool Signed) const { 1997 1998 Register Dst = MI.getOperand(0).getReg(); 1999 Register Src = MI.getOperand(1).getReg(); 2000 2001 const LLT S64 = LLT::scalar(64); 2002 const LLT S32 = LLT::scalar(32); 2003 2004 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2005 2006 unsigned Flags = MI.getFlags(); 2007 2008 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2009 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2010 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2011 2012 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2013 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2014 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2015 2016 auto Hi = Signed ? 
2017 B.buildFPTOSI(S32, FloorMul) : 2018 B.buildFPTOUI(S32, FloorMul); 2019 auto Lo = B.buildFPTOUI(S32, Fma); 2020 2021 B.buildMerge(Dst, { Lo, Hi }); 2022 MI.eraseFromParent(); 2023 2024 return true; 2025 } 2026 2027 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2028 MachineInstr &MI) const { 2029 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2030 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2031 2032 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2033 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2034 2035 // With ieee_mode disabled, the instructions have the correct behavior 2036 // already for G_FMINNUM/G_FMAXNUM 2037 if (!MFI->getMode().IEEE) 2038 return !IsIEEEOp; 2039 2040 if (IsIEEEOp) 2041 return true; 2042 2043 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2044 } 2045 2046 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2047 MachineInstr &MI, MachineRegisterInfo &MRI, 2048 MachineIRBuilder &B) const { 2049 // TODO: Should move some of this into LegalizerHelper. 2050 2051 // TODO: Promote dynamic indexing of s16 to s32 2052 2053 // FIXME: Artifact combiner probably should have replaced the truncated 2054 // constant before this, so we shouldn't need 2055 // getConstantVRegValWithLookThrough. 2056 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2057 MI.getOperand(2).getReg(), MRI); 2058 if (!IdxVal) // Dynamic case will be selected to register indexing. 2059 return true; 2060 2061 Register Dst = MI.getOperand(0).getReg(); 2062 Register Vec = MI.getOperand(1).getReg(); 2063 2064 LLT VecTy = MRI.getType(Vec); 2065 LLT EltTy = VecTy.getElementType(); 2066 assert(EltTy == MRI.getType(Dst)); 2067 2068 if (IdxVal->Value < VecTy.getNumElements()) 2069 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2070 else 2071 B.buildUndef(Dst); 2072 2073 MI.eraseFromParent(); 2074 return true; 2075 } 2076 2077 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2078 MachineInstr &MI, MachineRegisterInfo &MRI, 2079 MachineIRBuilder &B) const { 2080 // TODO: Should move some of this into LegalizerHelper. 2081 2082 // TODO: Promote dynamic indexing of s16 to s32 2083 2084 // FIXME: Artifact combiner probably should have replaced the truncated 2085 // constant before this, so we shouldn't need 2086 // getConstantVRegValWithLookThrough. 2087 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2088 MI.getOperand(3).getReg(), MRI); 2089 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2090 return true; 2091 2092 Register Dst = MI.getOperand(0).getReg(); 2093 Register Vec = MI.getOperand(1).getReg(); 2094 Register Ins = MI.getOperand(2).getReg(); 2095 2096 LLT VecTy = MRI.getType(Vec); 2097 LLT EltTy = VecTy.getElementType(); 2098 assert(EltTy == MRI.getType(Ins)); 2099 2100 if (IdxVal->Value < VecTy.getNumElements()) 2101 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2102 else 2103 B.buildUndef(Dst); 2104 2105 MI.eraseFromParent(); 2106 return true; 2107 } 2108 2109 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2110 MachineInstr &MI, MachineRegisterInfo &MRI, 2111 MachineIRBuilder &B) const { 2112 const LLT V2S16 = LLT::vector(2, 16); 2113 2114 Register Dst = MI.getOperand(0).getReg(); 2115 Register Src0 = MI.getOperand(1).getReg(); 2116 LLT DstTy = MRI.getType(Dst); 2117 LLT SrcTy = MRI.getType(Src0); 2118 2119 if (SrcTy == V2S16 && DstTy == V2S16 && 2120 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2121 return true; 2122 2123 MachineIRBuilder HelperBuilder(MI); 2124 GISelObserverWrapper DummyObserver; 2125 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2126 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2127 } 2128 2129 bool AMDGPULegalizerInfo::legalizeSinCos( 2130 MachineInstr &MI, MachineRegisterInfo &MRI, 2131 MachineIRBuilder &B) const { 2132 2133 Register DstReg = MI.getOperand(0).getReg(); 2134 Register SrcReg = MI.getOperand(1).getReg(); 2135 LLT Ty = MRI.getType(DstReg); 2136 unsigned Flags = MI.getFlags(); 2137 2138 Register TrigVal; 2139 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2140 if (ST.hasTrigReducedRange()) { 2141 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2142 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2143 .addUse(MulVal.getReg(0)) 2144 .setMIFlags(Flags).getReg(0); 2145 } else 2146 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2147 2148 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2149 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2150 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2151 .addUse(TrigVal) 2152 .setMIFlags(Flags); 2153 MI.eraseFromParent(); 2154 return true; 2155 } 2156 2157 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2158 MachineIRBuilder &B, 2159 const GlobalValue *GV, 2160 int64_t Offset, 2161 unsigned GAFlags) const { 2162 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2163 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2164 // to the following code sequence: 2165 // 2166 // For constant address space: 2167 // s_getpc_b64 s[0:1] 2168 // s_add_u32 s0, s0, $symbol 2169 // s_addc_u32 s1, s1, 0 2170 // 2171 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2172 // a fixup or relocation is emitted to replace $symbol with a literal 2173 // constant, which is a pc-relative offset from the encoding of the $symbol 2174 // operand to the global variable. 
2175 // 2176 // For global address space: 2177 // s_getpc_b64 s[0:1] 2178 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2179 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2180 // 2181 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2182 // fixups or relocations are emitted to replace $symbol@*@lo and 2183 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2184 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2185 // operand to the global variable. 2186 // 2187 // What we want here is an offset from the value returned by s_getpc 2188 // (which is the address of the s_add_u32 instruction) to the global 2189 // variable, but since the encoding of $symbol starts 4 bytes after the start 2190 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2191 // small. This requires us to add 4 to the global variable offset in order to 2192 // compute the correct address. 2193 2194 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2195 2196 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2197 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2198 2199 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2200 .addDef(PCReg); 2201 2202 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2203 if (GAFlags == SIInstrInfo::MO_NONE) 2204 MIB.addImm(0); 2205 else 2206 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2207 2208 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2209 2210 if (PtrTy.getSizeInBits() == 32) 2211 B.buildExtract(DstReg, PCReg, 0); 2212 return true; 2213 } 2214 2215 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2216 MachineInstr &MI, MachineRegisterInfo &MRI, 2217 MachineIRBuilder &B) const { 2218 Register DstReg = MI.getOperand(0).getReg(); 2219 LLT Ty = MRI.getType(DstReg); 2220 unsigned AS = Ty.getAddressSpace(); 2221 2222 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2223 MachineFunction &MF = B.getMF(); 2224 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2225 2226 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2227 if (!MFI->isEntryFunction()) { 2228 const Function &Fn = MF.getFunction(); 2229 DiagnosticInfoUnsupported BadLDSDecl( 2230 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2231 DS_Warning); 2232 Fn.getContext().diagnose(BadLDSDecl); 2233 2234 // We currently don't have a way to correctly allocate LDS objects that 2235 // aren't directly associated with a kernel. We do force inlining of 2236 // functions that use local objects. However, if these dead functions are 2237 // not eliminated, we don't want a compile time error. Just emit a warning 2238 // and a trap, since there should be no callable path here. 2239 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2240 B.buildUndef(DstReg); 2241 MI.eraseFromParent(); 2242 return true; 2243 } 2244 2245 // TODO: We could emit code to handle the initialization somewhere. 
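    // Only globals without a meaningful initializer can be lowered to an
    // absolute LDS offset here; anything carrying a real initializer falls
    // through to the "unsupported initializer" diagnostic below.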
2246 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2247 const SITargetLowering *TLI = ST.getTargetLowering(); 2248 if (!TLI->shouldUseLDSConstAddress(GV)) { 2249 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2250 return true; // Leave in place; 2251 } 2252 2253 B.buildConstant( 2254 DstReg, 2255 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2256 MI.eraseFromParent(); 2257 return true; 2258 } 2259 2260 const Function &Fn = MF.getFunction(); 2261 DiagnosticInfoUnsupported BadInit( 2262 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2263 Fn.getContext().diagnose(BadInit); 2264 return true; 2265 } 2266 2267 const SITargetLowering *TLI = ST.getTargetLowering(); 2268 2269 if (TLI->shouldEmitFixup(GV)) { 2270 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2271 MI.eraseFromParent(); 2272 return true; 2273 } 2274 2275 if (TLI->shouldEmitPCReloc(GV)) { 2276 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2277 MI.eraseFromParent(); 2278 return true; 2279 } 2280 2281 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2282 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2283 2284 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2285 MachinePointerInfo::getGOT(MF), 2286 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2287 MachineMemOperand::MOInvariant, 2288 8 /*Size*/, Align(8)); 2289 2290 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2291 2292 if (Ty.getSizeInBits() == 32) { 2293 // Truncate if this is a 32-bit constant adrdess. 2294 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2295 B.buildExtract(DstReg, Load, 0); 2296 } else 2297 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2298 2299 MI.eraseFromParent(); 2300 return true; 2301 } 2302 2303 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2304 MachineInstr &MI) const { 2305 MachineIRBuilder &B = Helper.MIRBuilder; 2306 MachineRegisterInfo &MRI = *B.getMRI(); 2307 GISelChangeObserver &Observer = Helper.Observer; 2308 2309 Register PtrReg = MI.getOperand(1).getReg(); 2310 LLT PtrTy = MRI.getType(PtrReg); 2311 unsigned AddrSpace = PtrTy.getAddressSpace(); 2312 2313 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2314 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2315 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2316 Observer.changingInstr(MI); 2317 MI.getOperand(1).setReg(Cast.getReg(0)); 2318 Observer.changedInstr(MI); 2319 return true; 2320 } 2321 2322 return false; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeFMad( 2326 MachineInstr &MI, MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2329 assert(Ty.isScalar()); 2330 2331 MachineFunction &MF = B.getMF(); 2332 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2333 2334 // TODO: Always legal with future ftz flag. 2335 // FIXME: Do we need just output? 
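  // G_FMAD is only kept legal when the corresponding denormal mode flushes,
  // since the mad/mac instructions it selects to do not handle denormal
  // values; otherwise it is expanded to fmul + fadd via lowerFMad below.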
2336 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2337 return true; 2338 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2339 return true; 2340 2341 MachineIRBuilder HelperBuilder(MI); 2342 GISelObserverWrapper DummyObserver; 2343 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2344 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2345 } 2346 2347 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2348 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2349 Register DstReg = MI.getOperand(0).getReg(); 2350 Register PtrReg = MI.getOperand(1).getReg(); 2351 Register CmpVal = MI.getOperand(2).getReg(); 2352 Register NewVal = MI.getOperand(3).getReg(); 2353 2354 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2355 "this should not have been custom lowered"); 2356 2357 LLT ValTy = MRI.getType(CmpVal); 2358 LLT VecTy = LLT::vector(2, ValTy); 2359 2360 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2361 2362 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2363 .addDef(DstReg) 2364 .addUse(PtrReg) 2365 .addUse(PackedVal) 2366 .setMemRefs(MI.memoperands()); 2367 2368 MI.eraseFromParent(); 2369 return true; 2370 } 2371 2372 bool AMDGPULegalizerInfo::legalizeFlog( 2373 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2374 Register Dst = MI.getOperand(0).getReg(); 2375 Register Src = MI.getOperand(1).getReg(); 2376 LLT Ty = B.getMRI()->getType(Dst); 2377 unsigned Flags = MI.getFlags(); 2378 2379 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2380 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2381 2382 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2383 MI.eraseFromParent(); 2384 return true; 2385 } 2386 2387 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2388 MachineIRBuilder &B) const { 2389 Register Dst = MI.getOperand(0).getReg(); 2390 Register Src = MI.getOperand(1).getReg(); 2391 unsigned Flags = MI.getFlags(); 2392 LLT Ty = B.getMRI()->getType(Dst); 2393 2394 auto K = B.buildFConstant(Ty, numbers::log2e); 2395 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2396 B.buildFExp2(Dst, Mul, Flags); 2397 MI.eraseFromParent(); 2398 return true; 2399 } 2400 2401 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2402 MachineIRBuilder &B) const { 2403 Register Dst = MI.getOperand(0).getReg(); 2404 Register Src0 = MI.getOperand(1).getReg(); 2405 Register Src1 = MI.getOperand(2).getReg(); 2406 unsigned Flags = MI.getFlags(); 2407 LLT Ty = B.getMRI()->getType(Dst); 2408 const LLT S16 = LLT::scalar(16); 2409 const LLT S32 = LLT::scalar(32); 2410 2411 if (Ty == S32) { 2412 auto Log = B.buildFLog2(S32, Src0, Flags); 2413 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2414 .addUse(Log.getReg(0)) 2415 .addUse(Src1) 2416 .setMIFlags(Flags); 2417 B.buildFExp2(Dst, Mul, Flags); 2418 } else if (Ty == S16) { 2419 // There's no f16 fmul_legacy, so we need to convert for it. 
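    // Same pow(x, y) = exp2(y * log2(x)) expansion as the f32 case above,
    // but the legacy multiply only exists in f32, so extend, multiply, and
    // truncate the result back to f16.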
2420 auto Log = B.buildFLog2(S16, Src0, Flags); 2421 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2422 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2423 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2424 .addUse(Ext0.getReg(0)) 2425 .addUse(Ext1.getReg(0)) 2426 .setMIFlags(Flags); 2427 2428 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2429 } else 2430 return false; 2431 2432 MI.eraseFromParent(); 2433 return true; 2434 } 2435 2436 // Find a source register, ignoring any possible source modifiers. 2437 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2438 Register ModSrc = OrigSrc; 2439 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2440 ModSrc = SrcFNeg->getOperand(1).getReg(); 2441 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2442 ModSrc = SrcFAbs->getOperand(1).getReg(); 2443 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2444 ModSrc = SrcFAbs->getOperand(1).getReg(); 2445 return ModSrc; 2446 } 2447 2448 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2449 MachineRegisterInfo &MRI, 2450 MachineIRBuilder &B) const { 2451 2452 const LLT S1 = LLT::scalar(1); 2453 const LLT S64 = LLT::scalar(64); 2454 Register Dst = MI.getOperand(0).getReg(); 2455 Register OrigSrc = MI.getOperand(1).getReg(); 2456 unsigned Flags = MI.getFlags(); 2457 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2458 "this should not have been custom lowered"); 2459 2460 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2461 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2462 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2463 // V_FRACT bug is: 2464 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2465 // 2466 // Convert floor(x) to (x - fract(x)) 2467 2468 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2469 .addUse(OrigSrc) 2470 .setMIFlags(Flags); 2471 2472 // Give source modifier matching some assistance before obscuring a foldable 2473 // pattern. 2474 2475 // TODO: We can avoid the neg on the fract? The input sign to fract 2476 // shouldn't matter? 2477 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2478 2479 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2480 2481 Register Min = MRI.createGenericVirtualRegister(S64); 2482 2483 // We don't need to concern ourselves with the snan handling difference, so 2484 // use the one which will directly select. 2485 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2486 if (MFI->getMode().IEEE) 2487 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2488 else 2489 B.buildFMinNum(Min, Fract, Const, Flags); 2490 2491 Register CorrectedFract = Min; 2492 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2493 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2494 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2495 } 2496 2497 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2498 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2499 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 // Turn an illegal packed v2s16 build vector into bit operations. 2505 // TODO: This should probably be a bitcast action in LegalizerHelper. 
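//
// Roughly (register names illustrative only):
//   %packed:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %packed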
2506 bool AMDGPULegalizerInfo::legalizeBuildVector( 2507 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2508 Register Dst = MI.getOperand(0).getReg(); 2509 const LLT S32 = LLT::scalar(32); 2510 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2511 2512 Register Src0 = MI.getOperand(1).getReg(); 2513 Register Src1 = MI.getOperand(2).getReg(); 2514 assert(MRI.getType(Src0) == LLT::scalar(16)); 2515 2516 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2517 B.buildBitcast(Dst, Merge); 2518 2519 MI.eraseFromParent(); 2520 return true; 2521 } 2522 2523 // Return the use branch instruction, otherwise null if the usage is invalid. 2524 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2525 MachineRegisterInfo &MRI, 2526 MachineInstr *&Br, 2527 MachineBasicBlock *&UncondBrTarget) { 2528 Register CondDef = MI.getOperand(0).getReg(); 2529 if (!MRI.hasOneNonDBGUse(CondDef)) 2530 return nullptr; 2531 2532 MachineBasicBlock *Parent = MI.getParent(); 2533 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2534 if (UseMI.getParent() != Parent || 2535 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2536 return nullptr; 2537 2538 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2539 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2540 if (Next == Parent->end()) { 2541 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2542 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2543 return nullptr; 2544 UncondBrTarget = &*NextMBB; 2545 } else { 2546 if (Next->getOpcode() != AMDGPU::G_BR) 2547 return nullptr; 2548 Br = &*Next; 2549 UncondBrTarget = Br->getOperand(0).getMBB(); 2550 } 2551 2552 return &UseMI; 2553 } 2554 2555 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2556 const ArgDescriptor *Arg, 2557 const TargetRegisterClass *ArgRC, 2558 LLT ArgTy) const { 2559 MCRegister SrcReg = Arg->getRegister(); 2560 assert(SrcReg.isPhysical() && "Physical register expected"); 2561 assert(DstReg.isVirtual() && "Virtual register expected"); 2562 2563 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2564 ArgTy); 2565 if (Arg->isMasked()) { 2566 // TODO: Should we try to emit this once in the entry block? 
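    // A masked argument is a bitfield packed into a single register (for
    // example the packed workitem ID components); recover it with an optional
    // right shift followed by an AND with the shifted-down mask.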
2567 const LLT S32 = LLT::scalar(32); 2568 const unsigned Mask = Arg->getMask(); 2569 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2570 2571 Register AndMaskSrc = LiveIn; 2572 2573 if (Shift != 0) { 2574 auto ShiftAmt = B.buildConstant(S32, Shift); 2575 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2576 } 2577 2578 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2579 } else { 2580 B.buildCopy(DstReg, LiveIn); 2581 } 2582 2583 return true; 2584 } 2585 2586 bool AMDGPULegalizerInfo::loadInputValue( 2587 Register DstReg, MachineIRBuilder &B, 2588 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2589 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2590 const ArgDescriptor *Arg; 2591 const TargetRegisterClass *ArgRC; 2592 LLT ArgTy; 2593 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2594 2595 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2596 return false; // TODO: Handle these 2597 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2598 } 2599 2600 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2601 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2602 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2603 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2604 return false; 2605 2606 MI.eraseFromParent(); 2607 return true; 2608 } 2609 2610 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2611 MachineRegisterInfo &MRI, 2612 MachineIRBuilder &B) const { 2613 Register Dst = MI.getOperand(0).getReg(); 2614 LLT DstTy = MRI.getType(Dst); 2615 LLT S16 = LLT::scalar(16); 2616 LLT S32 = LLT::scalar(32); 2617 LLT S64 = LLT::scalar(64); 2618 2619 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2620 return true; 2621 2622 if (DstTy == S16) 2623 return legalizeFDIV16(MI, MRI, B); 2624 if (DstTy == S32) 2625 return legalizeFDIV32(MI, MRI, B); 2626 if (DstTy == S64) 2627 return legalizeFDIV64(MI, MRI, B); 2628 2629 return false; 2630 } 2631 2632 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2633 Register DstReg, 2634 Register X, 2635 Register Y, 2636 bool IsDiv) const { 2637 const LLT S1 = LLT::scalar(1); 2638 const LLT S32 = LLT::scalar(32); 2639 2640 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2641 // algorithm used here. 2642 2643 // Initial estimate of inv(y). 2644 auto FloatY = B.buildUITOFP(S32, Y); 2645 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2646 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2647 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2648 auto Z = B.buildFPTOUI(S32, ScaledY); 2649 2650 // One round of UNR. 2651 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2652 auto NegYZ = B.buildMul(S32, NegY, Z); 2653 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2654 2655 // Quotient/remainder estimate. 2656 auto Q = B.buildUMulH(S32, X, Z); 2657 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2658 2659 // First quotient/remainder refinement. 2660 auto One = B.buildConstant(S32, 1); 2661 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2662 if (IsDiv) 2663 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2664 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2665 2666 // Second quotient/remainder refinement. 
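  // The estimate after one Newton-Raphson step can still be slightly short,
  // so (as in AMDGPUCodeGenPrepare::expandDivRem32) two conditional
  // correction steps are enough to reach the exact quotient/remainder.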
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 =
B.buildConstant(S32, 0); 2765 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2766 auto Add2_HiC = 2767 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2768 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2769 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2770 2771 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2772 Register NumerLo = UnmergeNumer.getReg(0); 2773 Register NumerHi = UnmergeNumer.getReg(1); 2774 2775 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2776 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2777 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2778 Register Mul3_Lo = UnmergeMul3.getReg(0); 2779 Register Mul3_Hi = UnmergeMul3.getReg(1); 2780 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2781 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2782 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2783 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2784 2785 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2786 Register DenomLo = UnmergeDenom.getReg(0); 2787 Register DenomHi = UnmergeDenom.getReg(1); 2788 2789 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2790 auto C1 = B.buildSExt(S32, CmpHi); 2791 2792 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2793 auto C2 = B.buildSExt(S32, CmpLo); 2794 2795 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2796 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2797 2798 // TODO: Here and below portions of the code can be enclosed into if/endif. 2799 // Currently control flow is unconditional and we have 4 selects after 2800 // potential endif to substitute PHIs. 2801 2802 // if C3 != 0 ... 2803 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2804 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2805 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2806 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2807 2808 auto One64 = B.buildConstant(S64, 1); 2809 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2810 2811 auto C4 = 2812 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2813 auto C5 = 2814 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2815 auto C6 = B.buildSelect( 2816 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2817 2818 // if (C6 != 0) 2819 auto Add4 = B.buildAdd(S64, Add3, One64); 2820 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2821 2822 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2823 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2824 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2825 2826 // endif C6 2827 // endif C3 2828 2829 if (IsDiv) { 2830 auto Sel1 = B.buildSelect( 2831 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2832 B.buildSelect(DstReg, 2833 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2834 } else { 2835 auto Sel2 = B.buildSelect( 2836 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2837 B.buildSelect(DstReg, 2838 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2839 } 2840 } 2841 2842 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2843 MachineRegisterInfo &MRI, 2844 MachineIRBuilder &B) const { 2845 const LLT S64 = LLT::scalar(64); 2846 const LLT S32 = LLT::scalar(32); 2847 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2848 Register DstReg = MI.getOperand(0).getReg(); 2849 Register Num 
= MI.getOperand(1).getReg(); 2850 Register Den = MI.getOperand(2).getReg(); 2851 LLT Ty = MRI.getType(DstReg); 2852 2853 if (Ty == S32) 2854 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2855 else if (Ty == S64) 2856 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2857 else 2858 return false; 2859 2860 MI.eraseFromParent(); 2861 return true; 2862 2863 } 2864 2865 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2866 MachineRegisterInfo &MRI, 2867 MachineIRBuilder &B) const { 2868 const LLT S64 = LLT::scalar(64); 2869 const LLT S32 = LLT::scalar(32); 2870 2871 Register DstReg = MI.getOperand(0).getReg(); 2872 const LLT Ty = MRI.getType(DstReg); 2873 if (Ty != S32 && Ty != S64) 2874 return false; 2875 2876 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2877 2878 Register LHS = MI.getOperand(1).getReg(); 2879 Register RHS = MI.getOperand(2).getReg(); 2880 2881 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2882 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2883 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2884 2885 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2886 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2887 2888 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2889 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2890 2891 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2892 if (Ty == S32) 2893 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2894 else 2895 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2896 2897 Register Sign; 2898 if (IsDiv) 2899 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2900 else 2901 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2902 2903 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2904 B.buildSub(DstReg, UDivRem, Sign); 2905 2906 MI.eraseFromParent(); 2907 return true; 2908 } 2909 2910 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2911 MachineRegisterInfo &MRI, 2912 MachineIRBuilder &B) const { 2913 Register Res = MI.getOperand(0).getReg(); 2914 Register LHS = MI.getOperand(1).getReg(); 2915 Register RHS = MI.getOperand(2).getReg(); 2916 2917 uint16_t Flags = MI.getFlags(); 2918 2919 LLT ResTy = MRI.getType(Res); 2920 LLT S32 = LLT::scalar(32); 2921 LLT S64 = LLT::scalar(64); 2922 2923 const MachineFunction &MF = B.getMF(); 2924 bool Unsafe = 2925 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2926 2927 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2928 return false; 2929 2930 if (!Unsafe && ResTy == S32 && 2931 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2932 return false; 2933 2934 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2935 // 1 / x -> RCP(x) 2936 if (CLHS->isExactlyValue(1.0)) { 2937 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2938 .addUse(RHS) 2939 .setMIFlags(Flags); 2940 2941 MI.eraseFromParent(); 2942 return true; 2943 } 2944 2945 // -1 / x -> RCP( FNEG(x) ) 2946 if (CLHS->isExactlyValue(-1.0)) { 2947 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2948 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2949 .addUse(FNeg.getReg(0)) 2950 .setMIFlags(Flags); 2951 2952 MI.eraseFromParent(); 2953 return true; 2954 } 2955 } 2956 2957 // x / y -> x * (1.0 / y) 2958 if (Unsafe) { 2959 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2960 .addUse(RHS) 2961 .setMIFlags(Flags); 2962 B.buildFMul(Res, LHS, RCP, Flags); 2963 2964 MI.eraseFromParent(); 2965 return true; 2966 } 2967 2968 return false; 2969 } 2970 2971 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2972 MachineRegisterInfo &MRI, 2973 MachineIRBuilder &B) const { 2974 Register Res = MI.getOperand(0).getReg(); 2975 Register LHS = MI.getOperand(1).getReg(); 2976 Register RHS = MI.getOperand(2).getReg(); 2977 2978 uint16_t Flags = MI.getFlags(); 2979 2980 LLT S16 = LLT::scalar(16); 2981 LLT S32 = LLT::scalar(32); 2982 2983 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2984 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2985 2986 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2987 .addUse(RHSExt.getReg(0)) 2988 .setMIFlags(Flags); 2989 2990 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2991 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2992 2993 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2994 .addUse(RDst.getReg(0)) 2995 .addUse(RHS) 2996 .addUse(LHS) 2997 .setMIFlags(Flags); 2998 2999 MI.eraseFromParent(); 3000 return true; 3001 } 3002 3003 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3004 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3005 static void toggleSPDenormMode(bool Enable, 3006 MachineIRBuilder &B, 3007 const GCNSubtarget &ST, 3008 AMDGPU::SIModeRegisterDefaults Mode) { 3009 // Set SP denorm mode to this value. 3010 unsigned SPDenormMode = 3011 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3012 3013 if (ST.hasDenormModeInst()) { 3014 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3015 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3016 3017 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3018 B.buildInstr(AMDGPU::S_DENORM_MODE) 3019 .addImm(NewDenormModeValue); 3020 3021 } else { 3022 // Select FP32 bit field in mode register. 3023 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3024 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3025 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3026 3027 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3028 .addImm(SPDenormMode) 3029 .addImm(SPDenormModeBitField); 3030 } 3031 } 3032 3033 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3034 MachineRegisterInfo &MRI, 3035 MachineIRBuilder &B) const { 3036 Register Res = MI.getOperand(0).getReg(); 3037 Register LHS = MI.getOperand(1).getReg(); 3038 Register RHS = MI.getOperand(2).getReg(); 3039 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3040 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3041 3042 uint16_t Flags = MI.getFlags(); 3043 3044 LLT S32 = LLT::scalar(32); 3045 LLT S1 = LLT::scalar(1); 3046 3047 auto One = B.buildFConstant(S32, 1.0f); 3048 3049 auto DenominatorScaled = 3050 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3051 .addUse(LHS) 3052 .addUse(RHS) 3053 .addImm(0) 3054 .setMIFlags(Flags); 3055 auto NumeratorScaled = 3056 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3057 .addUse(LHS) 3058 .addUse(RHS) 3059 .addImm(1) 3060 .setMIFlags(Flags); 3061 3062 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3063 .addUse(DenominatorScaled.getReg(0)) 3064 .setMIFlags(Flags); 3065 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3066 3067 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3068 // aren't modeled as reading it. 
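  // The reciprocal refinement below depends on denormal results not being
  // flushed, so when the function runs with FP32 denormals disabled the
  // denorm mode is switched on just around this fma sequence.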
3069 if (!Mode.allFP32Denormals()) 3070 toggleSPDenormMode(true, B, ST, Mode); 3071 3072 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3073 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3074 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3075 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3076 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3077 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3078 3079 if (!Mode.allFP32Denormals()) 3080 toggleSPDenormMode(false, B, ST, Mode); 3081 3082 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3083 .addUse(Fma4.getReg(0)) 3084 .addUse(Fma1.getReg(0)) 3085 .addUse(Fma3.getReg(0)) 3086 .addUse(NumeratorScaled.getReg(1)) 3087 .setMIFlags(Flags); 3088 3089 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3090 .addUse(Fmas.getReg(0)) 3091 .addUse(RHS) 3092 .addUse(LHS) 3093 .setMIFlags(Flags); 3094 3095 MI.eraseFromParent(); 3096 return true; 3097 } 3098 3099 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3100 MachineRegisterInfo &MRI, 3101 MachineIRBuilder &B) const { 3102 Register Res = MI.getOperand(0).getReg(); 3103 Register LHS = MI.getOperand(1).getReg(); 3104 Register RHS = MI.getOperand(2).getReg(); 3105 3106 uint16_t Flags = MI.getFlags(); 3107 3108 LLT S64 = LLT::scalar(64); 3109 LLT S1 = LLT::scalar(1); 3110 3111 auto One = B.buildFConstant(S64, 1.0); 3112 3113 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3114 .addUse(LHS) 3115 .addUse(RHS) 3116 .addImm(0) 3117 .setMIFlags(Flags); 3118 3119 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3120 3121 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3122 .addUse(DivScale0.getReg(0)) 3123 .setMIFlags(Flags); 3124 3125 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3126 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3127 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3128 3129 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3130 .addUse(LHS) 3131 .addUse(RHS) 3132 .addImm(1) 3133 .setMIFlags(Flags); 3134 3135 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3136 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3137 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3138 3139 Register Scale; 3140 if (!ST.hasUsableDivScaleConditionOutput()) { 3141 // Workaround a hardware bug on SI where the condition output from div_scale 3142 // is not usable. 
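    // Reconstruct the missing condition bit by comparing the high halves of
    // the original operands against the div_scale results and combining the
    // two tests with xor.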
3143 3144 LLT S32 = LLT::scalar(32); 3145 3146 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3147 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3148 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3149 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3150 3151 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3152 Scale1Unmerge.getReg(1)); 3153 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3154 Scale0Unmerge.getReg(1)); 3155 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3156 } else { 3157 Scale = DivScale1.getReg(1); 3158 } 3159 3160 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3161 .addUse(Fma4.getReg(0)) 3162 .addUse(Fma3.getReg(0)) 3163 .addUse(Mul.getReg(0)) 3164 .addUse(Scale) 3165 .setMIFlags(Flags); 3166 3167 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3168 .addUse(Fmas.getReg(0)) 3169 .addUse(RHS) 3170 .addUse(LHS) 3171 .setMIFlags(Flags); 3172 3173 MI.eraseFromParent(); 3174 return true; 3175 } 3176 3177 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3178 MachineRegisterInfo &MRI, 3179 MachineIRBuilder &B) const { 3180 Register Res = MI.getOperand(0).getReg(); 3181 Register LHS = MI.getOperand(2).getReg(); 3182 Register RHS = MI.getOperand(3).getReg(); 3183 uint16_t Flags = MI.getFlags(); 3184 3185 LLT S32 = LLT::scalar(32); 3186 LLT S1 = LLT::scalar(1); 3187 3188 auto Abs = B.buildFAbs(S32, RHS, Flags); 3189 const APFloat C0Val(1.0f); 3190 3191 auto C0 = B.buildConstant(S32, 0x6f800000); 3192 auto C1 = B.buildConstant(S32, 0x2f800000); 3193 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3194 3195 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3196 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3197 3198 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3199 3200 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3201 .addUse(Mul0.getReg(0)) 3202 .setMIFlags(Flags); 3203 3204 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3205 3206 B.buildFMul(Res, Sel, Mul1, Flags); 3207 3208 MI.eraseFromParent(); 3209 return true; 3210 } 3211 3212 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3213 // FIXME: Why do we handle this one but not other removed instructions? 3214 // 3215 // Reciprocal square root. The clamp prevents infinite results, clamping 3216 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3217 // +-max_float. 3218 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3219 MachineRegisterInfo &MRI, 3220 MachineIRBuilder &B) const { 3221 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3222 return true; 3223 3224 Register Dst = MI.getOperand(0).getReg(); 3225 Register Src = MI.getOperand(2).getReg(); 3226 auto Flags = MI.getFlags(); 3227 3228 LLT Ty = MRI.getType(Dst); 3229 3230 const fltSemantics *FltSemantics; 3231 if (Ty == LLT::scalar(32)) 3232 FltSemantics = &APFloat::IEEEsingle(); 3233 else if (Ty == LLT::scalar(64)) 3234 FltSemantics = &APFloat::IEEEdouble(); 3235 else 3236 return false; 3237 3238 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3239 .addUse(Src) 3240 .setMIFlags(Flags); 3241 3242 // We don't need to concern ourselves with the snan handling difference, since 3243 // the rsq quieted (or not) so use the one which will directly select. 
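  // Clamp the result to the largest finite value of the type:
  //   clamp(rsq(x)) = max(min(rsq(x), +max_float), -max_float)
  // picking the IEEE or non-IEEE min/max forms to match the function's mode.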
3244 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3245 const bool UseIEEE = MFI->getMode().IEEE; 3246 3247 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3248 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3249 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3250 3251 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3252 3253 if (UseIEEE) 3254 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3255 else 3256 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3257 MI.eraseFromParent(); 3258 return true; 3259 } 3260 3261 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3262 switch (IID) { 3263 case Intrinsic::amdgcn_ds_fadd: 3264 return AMDGPU::G_ATOMICRMW_FADD; 3265 case Intrinsic::amdgcn_ds_fmin: 3266 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3267 case Intrinsic::amdgcn_ds_fmax: 3268 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3269 default: 3270 llvm_unreachable("not a DS FP intrinsic"); 3271 } 3272 } 3273 3274 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3275 MachineInstr &MI, 3276 Intrinsic::ID IID) const { 3277 GISelChangeObserver &Observer = Helper.Observer; 3278 Observer.changingInstr(MI); 3279 3280 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3281 3282 // The remaining operands were used to set fields in the MemOperand on 3283 // construction. 3284 for (int I = 6; I > 3; --I) 3285 MI.RemoveOperand(I); 3286 3287 MI.RemoveOperand(1); // Remove the intrinsic ID. 3288 Observer.changedInstr(MI); 3289 return true; 3290 } 3291 3292 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3293 MachineRegisterInfo &MRI, 3294 MachineIRBuilder &B) const { 3295 uint64_t Offset = 3296 ST.getTargetLowering()->getImplicitParameterOffset( 3297 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3298 LLT DstTy = MRI.getType(DstReg); 3299 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3300 3301 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3302 if (!loadInputValue(KernargPtrReg, B, 3303 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3304 return false; 3305 3306 // FIXME: This should be nuw 3307 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3308 return true; 3309 } 3310 3311 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3312 MachineRegisterInfo &MRI, 3313 MachineIRBuilder &B) const { 3314 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3315 if (!MFI->isEntryFunction()) { 3316 return legalizePreloadedArgIntrin(MI, MRI, B, 3317 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3318 } 3319 3320 Register DstReg = MI.getOperand(0).getReg(); 3321 if (!getImplicitArgPtr(DstReg, MRI, B)) 3322 return false; 3323 3324 MI.eraseFromParent(); 3325 return true; 3326 } 3327 3328 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3329 MachineRegisterInfo &MRI, 3330 MachineIRBuilder &B, 3331 unsigned AddrSpace) const { 3332 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3333 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3334 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3335 MI.eraseFromParent(); 3336 return true; 3337 } 3338 3339 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3340 // offset (the offset that is included in bounds checking and swizzling, to be 3341 // split between the instruction's voffset and immoffset fields) and soffset 3342 // (the offset that is 
excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, TotalConstOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits (a value in [0, 4095]) in the immoffset field, so that the
  // part that is copied/added for the voffset field is a multiple of 4096.
  // That way it stands a better chance of being CSEd with the copy/add for
  // another similar load/store. For example, an offset of 8196 becomes an
  // immoffset of 4 plus a voffset add of 8192.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

Register AMDGPULegalizerInfo::fixStoreSourceType(
  MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
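  // For example, an s8 or s16 store value is any-extended to s32 here, since
  // the buffer store pseudos operate on 32-bit (or packed 16-bit) registers.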
3419 if (Ty == LLT::scalar(8) || Ty == S16) { 3420 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3421 return AnyExt; 3422 } 3423 3424 if (Ty.isVector()) { 3425 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3426 if (IsFormat) 3427 return handleD16VData(B, *MRI, VData); 3428 } 3429 } 3430 3431 return VData; 3432 } 3433 3434 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3435 MachineRegisterInfo &MRI, 3436 MachineIRBuilder &B, 3437 bool IsTyped, 3438 bool IsFormat) const { 3439 Register VData = MI.getOperand(1).getReg(); 3440 LLT Ty = MRI.getType(VData); 3441 LLT EltTy = Ty.getScalarType(); 3442 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3443 const LLT S32 = LLT::scalar(32); 3444 3445 VData = fixStoreSourceType(B, VData, IsFormat); 3446 Register RSrc = MI.getOperand(2).getReg(); 3447 3448 MachineMemOperand *MMO = *MI.memoperands_begin(); 3449 const int MemSize = MMO->getSize(); 3450 3451 unsigned ImmOffset; 3452 unsigned TotalOffset; 3453 3454 // The typed intrinsics add an immediate after the registers. 3455 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3456 3457 // The struct intrinsic variants add one additional operand over raw. 3458 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3459 Register VIndex; 3460 int OpOffset = 0; 3461 if (HasVIndex) { 3462 VIndex = MI.getOperand(3).getReg(); 3463 OpOffset = 1; 3464 } 3465 3466 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3467 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3468 3469 unsigned Format = 0; 3470 if (IsTyped) { 3471 Format = MI.getOperand(5 + OpOffset).getImm(); 3472 ++OpOffset; 3473 } 3474 3475 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3476 3477 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3478 if (TotalOffset != 0) 3479 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3480 3481 unsigned Opc; 3482 if (IsTyped) { 3483 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3484 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3485 } else if (IsFormat) { 3486 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3487 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3488 } else { 3489 switch (MemSize) { 3490 case 1: 3491 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3492 break; 3493 case 2: 3494 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3495 break; 3496 default: 3497 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3498 break; 3499 } 3500 } 3501 3502 if (!VIndex) 3503 VIndex = B.buildConstant(S32, 0).getReg(0); 3504 3505 auto MIB = B.buildInstr(Opc) 3506 .addUse(VData) // vdata 3507 .addUse(RSrc) // rsrc 3508 .addUse(VIndex) // vindex 3509 .addUse(VOffset) // voffset 3510 .addUse(SOffset) // soffset 3511 .addImm(ImmOffset); // offset(imm) 3512 3513 if (IsTyped) 3514 MIB.addImm(Format); 3515 3516 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3517 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3518 .addMemOperand(MMO); 3519 3520 MI.eraseFromParent(); 3521 return true; 3522 } 3523 3524 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3525 MachineRegisterInfo &MRI, 3526 MachineIRBuilder &B, 3527 bool IsFormat, 3528 bool IsTyped) const { 3529 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
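  // Operand layout assumed by the index math below (a sketch; the struct
  // variants insert vindex after rsrc, and the typed variants insert a format
  // immediate before the aux operand):
  //   raw:    dst, intrinsic_id, rsrc, voffset, soffset, aux
  //   struct: dst, intrinsic_id, rsrc, vindex, voffset, soffset, aux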
3530 MachineMemOperand *MMO = *MI.memoperands_begin(); 3531 const int MemSize = MMO->getSize(); 3532 const LLT S32 = LLT::scalar(32); 3533 3534 Register Dst = MI.getOperand(0).getReg(); 3535 Register RSrc = MI.getOperand(2).getReg(); 3536 3537 // The typed intrinsics add an immediate after the registers. 3538 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3539 3540 // The struct intrinsic variants add one additional operand over raw. 3541 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3542 Register VIndex; 3543 int OpOffset = 0; 3544 if (HasVIndex) { 3545 VIndex = MI.getOperand(3).getReg(); 3546 OpOffset = 1; 3547 } 3548 3549 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3550 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3551 3552 unsigned Format = 0; 3553 if (IsTyped) { 3554 Format = MI.getOperand(5 + OpOffset).getImm(); 3555 ++OpOffset; 3556 } 3557 3558 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3559 unsigned ImmOffset; 3560 unsigned TotalOffset; 3561 3562 LLT Ty = MRI.getType(Dst); 3563 LLT EltTy = Ty.getScalarType(); 3564 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3565 const bool Unpacked = ST.hasUnpackedD16VMem(); 3566 3567 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3568 if (TotalOffset != 0) 3569 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3570 3571 unsigned Opc; 3572 3573 if (IsTyped) { 3574 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3575 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3576 } else if (IsFormat) { 3577 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3578 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3579 } else { 3580 switch (MemSize) { 3581 case 1: 3582 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3583 break; 3584 case 2: 3585 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3586 break; 3587 default: 3588 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3589 break; 3590 } 3591 } 3592 3593 Register LoadDstReg; 3594 3595 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3596 LLT UnpackedTy = Ty.changeElementSize(32); 3597 3598 if (IsExtLoad) 3599 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3600 else if (Unpacked && IsD16 && Ty.isVector()) 3601 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3602 else 3603 LoadDstReg = Dst; 3604 3605 if (!VIndex) 3606 VIndex = B.buildConstant(S32, 0).getReg(0); 3607 3608 auto MIB = B.buildInstr(Opc) 3609 .addDef(LoadDstReg) // vdata 3610 .addUse(RSrc) // rsrc 3611 .addUse(VIndex) // vindex 3612 .addUse(VOffset) // voffset 3613 .addUse(SOffset) // soffset 3614 .addImm(ImmOffset); // offset(imm) 3615 3616 if (IsTyped) 3617 MIB.addImm(Format); 3618 3619 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3620 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3621 .addMemOperand(MMO); 3622 3623 if (LoadDstReg != Dst) { 3624 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3625 3626 // Widen result for extending loads was widened. 
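    // The extending-load result was widened to s32 above; truncate it back.
    // Otherwise (unpacked D16 vectors) each s32 element is truncated to s16
    // and re-merged into the original vector type.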
3627 if (IsExtLoad) 3628 B.buildTrunc(Dst, LoadDstReg); 3629 else { 3630 // Repack to original 16-bit vector result 3631 // FIXME: G_TRUNC should work, but legalization currently fails 3632 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3633 SmallVector<Register, 4> Repack; 3634 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3635 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3636 B.buildMerge(Dst, Repack); 3637 } 3638 } 3639 3640 MI.eraseFromParent(); 3641 return true; 3642 } 3643 3644 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3645 MachineIRBuilder &B, 3646 bool IsInc) const { 3647 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3648 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3649 B.buildInstr(Opc) 3650 .addDef(MI.getOperand(0).getReg()) 3651 .addUse(MI.getOperand(2).getReg()) 3652 .addUse(MI.getOperand(3).getReg()) 3653 .cloneMemRefs(MI); 3654 MI.eraseFromParent(); 3655 return true; 3656 } 3657 3658 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3659 switch (IntrID) { 3660 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3661 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3662 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3663 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3664 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3665 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3666 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3667 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3668 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3669 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3670 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3671 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3672 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3673 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3674 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3675 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3676 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3677 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3678 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3679 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3680 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3681 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3682 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3683 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3684 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3685 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3686 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3687 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3688 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3689 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3690 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3691 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3692 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3693 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3694 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3695 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3696 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3697 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3698 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3699 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3700 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3701 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3702 default: 3703 llvm_unreachable("unhandled atomic opcode"); 3704 } 3705 } 3706 3707 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3708 MachineIRBuilder &B, 3709 Intrinsic::ID IID) const { 3710 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3711 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3712 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3713 3714 Register Dst; 3715 3716 int OpOffset = 0; 3717 if (HasReturn) { 3718 // A few FP atomics do not support return values. 3719 Dst = MI.getOperand(0).getReg(); 3720 } else { 3721 OpOffset = -1; 3722 } 3723 3724 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3725 Register CmpVal; 3726 3727 if (IsCmpSwap) { 3728 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3729 ++OpOffset; 3730 } 3731 3732 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3733 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3734 3735 // The struct intrinsic variants add one additional operand over raw. 3736 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3737 Register VIndex; 3738 if (HasVIndex) { 3739 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3740 ++OpOffset; 3741 } 3742 3743 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3744 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3745 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3746 3747 MachineMemOperand *MMO = *MI.memoperands_begin(); 3748 3749 unsigned ImmOffset; 3750 unsigned TotalOffset; 3751 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3752 if (TotalOffset != 0) 3753 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3754 3755 if (!VIndex) 3756 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3757 3758 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3759 3760 if (HasReturn) 3761 MIB.addDef(Dst); 3762 3763 MIB.addUse(VData); // vdata 3764 3765 if (IsCmpSwap) 3766 MIB.addReg(CmpVal); 3767 3768 MIB.addUse(RSrc) // rsrc 3769 .addUse(VIndex) // vindex 3770 .addUse(VOffset) // voffset 3771 .addUse(SOffset) // soffset 3772 .addImm(ImmOffset) // offset(imm) 3773 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3774 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3775 .addMemOperand(MMO); 3776 3777 MI.eraseFromParent(); 3778 return true; 3779 } 3780 3781 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3782 /// vector with s16 typed elements. 3783 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3784 SmallVectorImpl<Register> &PackedAddrs, 3785 int AddrIdx, int DimIdx, int EndIdx, 3786 int NumGradients) { 3787 const LLT S16 = LLT::scalar(16); 3788 const LLT V2S16 = LLT::vector(2, 16); 3789 3790 for (int I = AddrIdx; I < EndIdx; ++I) { 3791 MachineOperand &SrcOp = MI.getOperand(I); 3792 if (!SrcOp.isReg()) 3793 continue; // _L to _LZ may have eliminated this. 3794 3795 Register AddrReg = SrcOp.getReg(); 3796 3797 if (I < DimIdx) { 3798 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3799 PackedAddrs.push_back(AddrReg); 3800 } else { 3801 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3802 // derivatives dx/dh and dx/dv are packed with undef. 
      if (((I + 1) >= EndIdx) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
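  // Rough flow of the rewrite below: look up the MIMG base opcode info, fold
  // the _L -> _LZ and _mip lod-zero forms where possible, repack 16-bit
  // addresses/gradients into dwords, and finally adjust the result registers
  // for the subtarget's D16/TFE register layout.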
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect to get an error flag since TFC is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
            LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
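        // Erasing the now-constant lod operand drops one vaddr, so the
        // address counts are adjusted before removing it.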
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
            static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // Target must support the feature and gradients need to be 16 bit too
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16
      const int PackEndIdx =
          IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
                                  PackEndIdx, NumGradients);

      if (!IsA16) {
        // Add uncompressed address
        for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          int AddrReg = MI.getOperand(I).getReg();
          assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
          PackedRegs.push_back(AddrReg);
        }
      }

      // See also below in the non-a16 branch
      const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
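    // Only the data operand may need repacking for stores: on unpacked-D16
    // subtargets each 16-bit element is widened into its own 32-bit register
    // by handleD16VData().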
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4173 4174 if (ResultNumRegs == 1) { 4175 assert(!IsTFE); 4176 ResultRegs[0] = NewResultReg; 4177 } else { 4178 // We have to repack into a new vector of some kind. 4179 for (int I = 0; I != NumDataRegs; ++I) 4180 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4181 B.buildUnmerge(ResultRegs, NewResultReg); 4182 4183 // Drop the final TFE element to get the data part. The TFE result is 4184 // directly written to the right place already. 4185 if (IsTFE) 4186 ResultRegs.resize(NumDataRegs); 4187 } 4188 4189 // For an s16 scalar result, we form an s32 result with a truncate regardless 4190 // of packed vs. unpacked. 4191 if (IsD16 && !Ty.isVector()) { 4192 B.buildTrunc(DstReg, ResultRegs[0]); 4193 return true; 4194 } 4195 4196 // Avoid a build/concat_vector of 1 entry. 4197 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4198 B.buildBitcast(DstReg, ResultRegs[0]); 4199 return true; 4200 } 4201 4202 assert(Ty.isVector()); 4203 4204 if (IsD16) { 4205 // For packed D16 results with TFE enabled, all the data components are 4206 // S32. Cast back to the expected type. 4207 // 4208 // TODO: We don't really need to use load s32 elements. We would only need one 4209 // cast for the TFE result if a multiple of v2s16 was used. 4210 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4211 for (Register &Reg : ResultRegs) 4212 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4213 } else if (ST.hasUnpackedD16VMem()) { 4214 for (Register &Reg : ResultRegs) 4215 Reg = B.buildTrunc(S16, Reg).getReg(0); 4216 } 4217 } 4218 4219 auto padWithUndef = [&](LLT Ty, int NumElts) { 4220 if (NumElts == 0) 4221 return; 4222 Register Undef = B.buildUndef(Ty).getReg(0); 4223 for (int I = 0; I != NumElts; ++I) 4224 ResultRegs.push_back(Undef); 4225 }; 4226 4227 // Pad out any elements eliminated due to the dmask. 4228 LLT ResTy = MRI->getType(ResultRegs[0]); 4229 if (!ResTy.isVector()) { 4230 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4231 B.buildBuildVector(DstReg, ResultRegs); 4232 return true; 4233 } 4234 4235 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4236 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4237 4238 // Deal with the one annoying legal case. 4239 const LLT V3S16 = LLT::vector(3, 16); 4240 if (Ty == V3S16) { 4241 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4242 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4243 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4244 return true; 4245 } 4246 4247 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4248 B.buildConcatVectors(DstReg, ResultRegs); 4249 return true; 4250 } 4251 4252 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4253 LegalizerHelper &Helper, MachineInstr &MI) const { 4254 MachineIRBuilder &B = Helper.MIRBuilder; 4255 GISelChangeObserver &Observer = Helper.Observer; 4256 4257 Register Dst = MI.getOperand(0).getReg(); 4258 LLT Ty = B.getMRI()->getType(Dst); 4259 unsigned Size = Ty.getSizeInBits(); 4260 MachineFunction &MF = B.getMF(); 4261 4262 Observer.changingInstr(MI); 4263 4264 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4265 Ty = getBitcastRegisterType(Ty); 4266 Helper.bitcastDst(MI, Ty, 0); 4267 Dst = MI.getOperand(0).getReg(); 4268 B.setInsertPt(B.getMBB(), MI); 4269 } 4270 4271 // FIXME: We don't really need this intermediate instruction. The intrinsic 4272 // should be fixed to have a memory operand. Since it's readnone, we're not 4273 // allowed to add one. 
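  // Rewrite the intrinsic in place into the s_buffer_load pseudo and attach a
  // constant, invariant MMO so later passes can reason about the access.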
4274 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4275 MI.RemoveOperand(1); // Remove intrinsic ID 4276 4277 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4278 // TODO: Should this use datalayout alignment? 4279 const unsigned MemSize = (Size + 7) / 8; 4280 const Align MemAlign(4); 4281 MachineMemOperand *MMO = MF.getMachineMemOperand( 4282 MachinePointerInfo(), 4283 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4284 MachineMemOperand::MOInvariant, 4285 MemSize, MemAlign); 4286 MI.addMemOperand(MF, MMO); 4287 4288 // There are no 96-bit result scalar loads, but widening to 128-bit should 4289 // always be legal. We may need to restore this to a 96-bit result if it turns 4290 // out this needs to be converted to a vector load during RegBankSelect. 4291 if (!isPowerOf2_32(Size)) { 4292 if (Ty.isVector()) 4293 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4294 else 4295 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4296 } 4297 4298 Observer.changedInstr(MI); 4299 return true; 4300 } 4301 4302 // TODO: Move to selection 4303 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4304 MachineRegisterInfo &MRI, 4305 MachineIRBuilder &B) const { 4306 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4307 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4308 !ST.isTrapHandlerEnabled()) { 4309 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4310 } else { 4311 // Pass queue pointer to trap handler as input, and insert trap instruction 4312 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4313 MachineRegisterInfo &MRI = *B.getMRI(); 4314 4315 Register LiveIn = 4316 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4317 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4318 return false; 4319 4320 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4321 B.buildCopy(SGPR01, LiveIn); 4322 B.buildInstr(AMDGPU::S_TRAP) 4323 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4324 .addReg(SGPR01, RegState::Implicit); 4325 } 4326 4327 MI.eraseFromParent(); 4328 return true; 4329 } 4330 4331 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4332 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4333 // Is non-HSA path or trap-handler disabled? then, report a warning 4334 // accordingly 4335 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4336 !ST.isTrapHandlerEnabled()) { 4337 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4338 "debugtrap handler not supported", 4339 MI.getDebugLoc(), DS_Warning); 4340 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4341 Ctx.diagnose(NoTrap); 4342 } else { 4343 // Insert debug-trap instruction 4344 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4345 } 4346 4347 MI.eraseFromParent(); 4348 return true; 4349 } 4350 4351 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4352 MachineInstr &MI) const { 4353 MachineIRBuilder &B = Helper.MIRBuilder; 4354 MachineRegisterInfo &MRI = *B.getMRI(); 4355 4356 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
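  // For amdgcn.if/else/loop, the matched G_BRCOND is rewritten into SI_IF,
  // SI_ELSE or SI_LOOP: the unconditional successor becomes the pseudo's
  // branch target, and the conditional target is restored on the G_BR.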
4357 auto IntrID = MI.getIntrinsicID(); 4358 switch (IntrID) { 4359 case Intrinsic::amdgcn_if: 4360 case Intrinsic::amdgcn_else: { 4361 MachineInstr *Br = nullptr; 4362 MachineBasicBlock *UncondBrTarget = nullptr; 4363 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4364 const SIRegisterInfo *TRI 4365 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4366 4367 Register Def = MI.getOperand(1).getReg(); 4368 Register Use = MI.getOperand(3).getReg(); 4369 4370 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4371 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4372 if (IntrID == Intrinsic::amdgcn_if) { 4373 B.buildInstr(AMDGPU::SI_IF) 4374 .addDef(Def) 4375 .addUse(Use) 4376 .addMBB(UncondBrTarget); 4377 } else { 4378 B.buildInstr(AMDGPU::SI_ELSE) 4379 .addDef(Def) 4380 .addUse(Use) 4381 .addMBB(UncondBrTarget) 4382 .addImm(0); 4383 } 4384 4385 if (Br) { 4386 Br->getOperand(0).setMBB(CondBrTarget); 4387 } else { 4388 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4389 // since we're swapping branch targets it needs to be reinserted. 4390 // FIXME: IRTranslator should probably not do this 4391 B.buildBr(*CondBrTarget); 4392 } 4393 4394 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4395 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4396 MI.eraseFromParent(); 4397 BrCond->eraseFromParent(); 4398 return true; 4399 } 4400 4401 return false; 4402 } 4403 case Intrinsic::amdgcn_loop: { 4404 MachineInstr *Br = nullptr; 4405 MachineBasicBlock *UncondBrTarget = nullptr; 4406 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4407 const SIRegisterInfo *TRI 4408 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4409 4410 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4411 Register Reg = MI.getOperand(2).getReg(); 4412 4413 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4414 B.buildInstr(AMDGPU::SI_LOOP) 4415 .addUse(Reg) 4416 .addMBB(UncondBrTarget); 4417 4418 if (Br) 4419 Br->getOperand(0).setMBB(CondBrTarget); 4420 else 4421 B.buildBr(*CondBrTarget); 4422 4423 MI.eraseFromParent(); 4424 BrCond->eraseFromParent(); 4425 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4426 return true; 4427 } 4428 4429 return false; 4430 } 4431 case Intrinsic::amdgcn_kernarg_segment_ptr: 4432 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4433 // This only makes sense to call in a kernel, so just lower to null. 
4434 B.buildConstant(MI.getOperand(0).getReg(), 0); 4435 MI.eraseFromParent(); 4436 return true; 4437 } 4438 4439 return legalizePreloadedArgIntrin( 4440 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4441 case Intrinsic::amdgcn_implicitarg_ptr: 4442 return legalizeImplicitArgPtr(MI, MRI, B); 4443 case Intrinsic::amdgcn_workitem_id_x: 4444 return legalizePreloadedArgIntrin(MI, MRI, B, 4445 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4446 case Intrinsic::amdgcn_workitem_id_y: 4447 return legalizePreloadedArgIntrin(MI, MRI, B, 4448 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4449 case Intrinsic::amdgcn_workitem_id_z: 4450 return legalizePreloadedArgIntrin(MI, MRI, B, 4451 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4452 case Intrinsic::amdgcn_workgroup_id_x: 4453 return legalizePreloadedArgIntrin(MI, MRI, B, 4454 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4455 case Intrinsic::amdgcn_workgroup_id_y: 4456 return legalizePreloadedArgIntrin(MI, MRI, B, 4457 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4458 case Intrinsic::amdgcn_workgroup_id_z: 4459 return legalizePreloadedArgIntrin(MI, MRI, B, 4460 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4461 case Intrinsic::amdgcn_dispatch_ptr: 4462 return legalizePreloadedArgIntrin(MI, MRI, B, 4463 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4464 case Intrinsic::amdgcn_queue_ptr: 4465 return legalizePreloadedArgIntrin(MI, MRI, B, 4466 AMDGPUFunctionArgInfo::QUEUE_PTR); 4467 case Intrinsic::amdgcn_implicit_buffer_ptr: 4468 return legalizePreloadedArgIntrin( 4469 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4470 case Intrinsic::amdgcn_dispatch_id: 4471 return legalizePreloadedArgIntrin(MI, MRI, B, 4472 AMDGPUFunctionArgInfo::DISPATCH_ID); 4473 case Intrinsic::amdgcn_fdiv_fast: 4474 return legalizeFDIVFastIntrin(MI, MRI, B); 4475 case Intrinsic::amdgcn_is_shared: 4476 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4477 case Intrinsic::amdgcn_is_private: 4478 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4479 case Intrinsic::amdgcn_wavefrontsize: { 4480 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4481 MI.eraseFromParent(); 4482 return true; 4483 } 4484 case Intrinsic::amdgcn_s_buffer_load: 4485 return legalizeSBufferLoad(Helper, MI); 4486 case Intrinsic::amdgcn_raw_buffer_store: 4487 case Intrinsic::amdgcn_struct_buffer_store: 4488 return legalizeBufferStore(MI, MRI, B, false, false); 4489 case Intrinsic::amdgcn_raw_buffer_store_format: 4490 case Intrinsic::amdgcn_struct_buffer_store_format: 4491 return legalizeBufferStore(MI, MRI, B, false, true); 4492 case Intrinsic::amdgcn_raw_tbuffer_store: 4493 case Intrinsic::amdgcn_struct_tbuffer_store: 4494 return legalizeBufferStore(MI, MRI, B, true, true); 4495 case Intrinsic::amdgcn_raw_buffer_load: 4496 case Intrinsic::amdgcn_struct_buffer_load: 4497 return legalizeBufferLoad(MI, MRI, B, false, false); 4498 case Intrinsic::amdgcn_raw_buffer_load_format: 4499 case Intrinsic::amdgcn_struct_buffer_load_format: 4500 return legalizeBufferLoad(MI, MRI, B, true, false); 4501 case Intrinsic::amdgcn_raw_tbuffer_load: 4502 case Intrinsic::amdgcn_struct_tbuffer_load: 4503 return legalizeBufferLoad(MI, MRI, B, true, true); 4504 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4505 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4506 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4507 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4508 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4509 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4510 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4511 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4512 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4513 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4514 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4515 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4516 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4517 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4518 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4519 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4520 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4521 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4522 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4523 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4524 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4525 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4526 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4527 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4528 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4529 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4530 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4531 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4532 return legalizeBufferAtomic(MI, B, IntrID); 4533 case Intrinsic::amdgcn_atomic_inc: 4534 return legalizeAtomicIncDec(MI, B, true); 4535 case Intrinsic::amdgcn_atomic_dec: 4536 return legalizeAtomicIncDec(MI, B, false); 4537 case Intrinsic::trap: 4538 return legalizeTrapIntrinsic(MI, MRI, B); 4539 case Intrinsic::debugtrap: 4540 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4541 case Intrinsic::amdgcn_rsq_clamp: 4542 return legalizeRsqClampIntrinsic(MI, MRI, B); 4543 case Intrinsic::amdgcn_ds_fadd: 4544 case Intrinsic::amdgcn_ds_fmin: 4545 case Intrinsic::amdgcn_ds_fmax: 4546 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4547 default: { 4548 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4549 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4550 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4551 return true; 4552 } 4553 } 4554 4555 return true; 4556 } 4557