//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ?
    S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                 S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ?
    0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    Actions
      .customIf(typeIs(1, Constant32Ptr))
      // Widen suitably aligned loads by loading extra elements.
      .moreElementsIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, moreElementsToNextPow2(0))
      .widenScalarIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && !Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, widenScalarOrEltToNextPow2(0))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment.
            // As-is it should be OK, since the new parts will be further
            // legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
                 bitcastToVectorElement32(1))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(1, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2.
      // It's not worth considering the multiples of 64 since 2*192 and 2*384
      // are not valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
1645 return legalizeBuildVector(MI, MRI, B); 1646 default: 1647 return false; 1648 } 1649 1650 llvm_unreachable("expected switch to return"); 1651 } 1652 1653 Register AMDGPULegalizerInfo::getSegmentAperture( 1654 unsigned AS, 1655 MachineRegisterInfo &MRI, 1656 MachineIRBuilder &B) const { 1657 MachineFunction &MF = B.getMF(); 1658 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1659 const LLT S32 = LLT::scalar(32); 1660 1661 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1662 1663 if (ST.hasApertureRegs()) { 1664 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1665 // getreg. 1666 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1667 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1668 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1669 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1670 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1671 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1672 unsigned Encoding = 1673 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1674 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1675 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1676 1677 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1678 1679 B.buildInstr(AMDGPU::S_GETREG_B32) 1680 .addDef(GetReg) 1681 .addImm(Encoding); 1682 MRI.setType(GetReg, S32); 1683 1684 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1685 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1686 } 1687 1688 Register QueuePtr = MRI.createGenericVirtualRegister( 1689 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1690 1691 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1692 return Register(); 1693 1694 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1695 // private_segment_aperture_base_hi. 1696 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1697 1698 // TODO: can we be smarter about machine pointer info? 1699 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1700 MachineMemOperand *MMO = MF.getMachineMemOperand( 1701 PtrInfo, 1702 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1703 MachineMemOperand::MOInvariant, 1704 4, commonAlignment(Align(64), StructOffset)); 1705 1706 Register LoadAddr; 1707 1708 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1709 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1710 } 1711 1712 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1713 MachineInstr &MI, MachineRegisterInfo &MRI, 1714 MachineIRBuilder &B) const { 1715 MachineFunction &MF = B.getMF(); 1716 1717 const LLT S32 = LLT::scalar(32); 1718 Register Dst = MI.getOperand(0).getReg(); 1719 Register Src = MI.getOperand(1).getReg(); 1720 1721 LLT DstTy = MRI.getType(Dst); 1722 LLT SrcTy = MRI.getType(Src); 1723 unsigned DestAS = DstTy.getAddressSpace(); 1724 unsigned SrcAS = SrcTy.getAddressSpace(); 1725 1726 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1727 // vector element. 1728 assert(!DstTy.isVector()); 1729 1730 const AMDGPUTargetMachine &TM 1731 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1732 1733 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1734 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1735 return true; 1736 } 1737 1738 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1739 // Truncate. 
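// A 32-bit constant pointer is just the low 32 bits of the full constant
// pointer, so extracting bits [31:0] of the source is all that is needed here.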
1740 B.buildExtract(Dst, Src, 0); 1741 MI.eraseFromParent(); 1742 return true; 1743 } 1744 1745 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1746 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1747 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1748 1749 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1750 // another. Merge operands are required to be the same type, but creating an 1751 // extra ptrtoint would be kind of pointless. 1752 auto HighAddr = B.buildConstant( 1753 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1754 B.buildMerge(Dst, {Src, HighAddr}); 1755 MI.eraseFromParent(); 1756 return true; 1757 } 1758 1759 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1760 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1761 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1762 unsigned NullVal = TM.getNullPointerValue(DestAS); 1763 1764 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1765 auto FlatNull = B.buildConstant(SrcTy, 0); 1766 1767 // Extract low 32-bits of the pointer. 1768 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1769 1770 auto CmpRes = 1771 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1772 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1773 1774 MI.eraseFromParent(); 1775 return true; 1776 } 1777 1778 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1779 return false; 1780 1781 if (!ST.hasFlatAddressSpace()) 1782 return false; 1783 1784 auto SegmentNull = 1785 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1786 auto FlatNull = 1787 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1788 1789 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1790 if (!ApertureReg.isValid()) 1791 return false; 1792 1793 auto CmpRes = 1794 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1795 1796 // Coerce the type of the low half of the result so we can use merge_values. 1797 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1798 1799 // TODO: Should we allow mismatched types but matching sizes in merges to 1800 // avoid the ptrtoint? 1801 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1802 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1803 1804 MI.eraseFromParent(); 1805 return true; 1806 } 1807 1808 bool AMDGPULegalizerInfo::legalizeFrint( 1809 MachineInstr &MI, MachineRegisterInfo &MRI, 1810 MachineIRBuilder &B) const { 1811 Register Src = MI.getOperand(1).getReg(); 1812 LLT Ty = MRI.getType(Src); 1813 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1814 1815 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1816 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1817 1818 auto C1 = B.buildFConstant(Ty, C1Val); 1819 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1820 1821 // TODO: Should this propagate fast-math-flags? 
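// Adding and then subtracting copysign(2^52, Src) forces the f64 addition to
// round away the fraction bits, i.e.
//   tmp = (Src + copysign(2^52, Src)) - copysign(2^52, Src)  == rint(Src)
// Values with |Src| > 0x1.fffffffffffffp+51 are already integral and are
// passed through unchanged by the select below.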
1822 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1823 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1824 1825 auto C2 = B.buildFConstant(Ty, C2Val); 1826 auto Fabs = B.buildFAbs(Ty, Src); 1827 1828 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1829 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1830 MI.eraseFromParent(); 1831 return true; 1832 } 1833 1834 bool AMDGPULegalizerInfo::legalizeFceil( 1835 MachineInstr &MI, MachineRegisterInfo &MRI, 1836 MachineIRBuilder &B) const { 1837 1838 const LLT S1 = LLT::scalar(1); 1839 const LLT S64 = LLT::scalar(64); 1840 1841 Register Src = MI.getOperand(1).getReg(); 1842 assert(MRI.getType(Src) == S64); 1843 1844 // result = trunc(src) 1845 // if (src > 0.0 && src != result) 1846 // result += 1.0 1847 1848 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1849 1850 const auto Zero = B.buildFConstant(S64, 0.0); 1851 const auto One = B.buildFConstant(S64, 1.0); 1852 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1853 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1854 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1855 auto Add = B.buildSelect(S64, And, One, Zero); 1856 1857 // TODO: Should this propagate fast-math-flags? 1858 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1859 return true; 1860 } 1861 1862 static MachineInstrBuilder extractF64Exponent(Register Hi, 1863 MachineIRBuilder &B) { 1864 const unsigned FractBits = 52; 1865 const unsigned ExpBits = 11; 1866 LLT S32 = LLT::scalar(32); 1867 1868 auto Const0 = B.buildConstant(S32, FractBits - 32); 1869 auto Const1 = B.buildConstant(S32, ExpBits); 1870 1871 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1872 .addUse(Hi) 1873 .addUse(Const0.getReg(0)) 1874 .addUse(Const1.getReg(0)); 1875 1876 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1877 } 1878 1879 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1880 MachineInstr &MI, MachineRegisterInfo &MRI, 1881 MachineIRBuilder &B) const { 1882 const LLT S1 = LLT::scalar(1); 1883 const LLT S32 = LLT::scalar(32); 1884 const LLT S64 = LLT::scalar(64); 1885 1886 Register Src = MI.getOperand(1).getReg(); 1887 assert(MRI.getType(Src) == S64); 1888 1889 // TODO: Should this use extract since the low half is unused? 1890 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1891 Register Hi = Unmerge.getReg(1); 1892 1893 // Extract the upper half, since this is where we will find the sign and 1894 // exponent. 1895 auto Exp = extractF64Exponent(Hi, B); 1896 1897 const unsigned FractBits = 52; 1898 1899 // Extract the sign bit. 1900 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1901 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1902 1903 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1904 1905 const auto Zero32 = B.buildConstant(S32, 0); 1906 1907 // Extend back to 64-bits. 
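// The unbiased exponent tells how many of the 52 fraction bits are actually
// fractional; they are cleared by masking Src with ~(FractMask >> Exp). An
// exponent below zero means |Src| < 1.0, so only the sign bit (widened to 64
// bits below) survives; an exponent above 51 means Src is already integral
// and is returned as-is.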
1908 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1909 1910 auto Shr = B.buildAShr(S64, FractMask, Exp); 1911 auto Not = B.buildNot(S64, Shr); 1912 auto Tmp0 = B.buildAnd(S64, Src, Not); 1913 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1914 1915 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1916 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1917 1918 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1919 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1920 MI.eraseFromParent(); 1921 return true; 1922 } 1923 1924 bool AMDGPULegalizerInfo::legalizeITOFP( 1925 MachineInstr &MI, MachineRegisterInfo &MRI, 1926 MachineIRBuilder &B, bool Signed) const { 1927 1928 Register Dst = MI.getOperand(0).getReg(); 1929 Register Src = MI.getOperand(1).getReg(); 1930 1931 const LLT S64 = LLT::scalar(64); 1932 const LLT S32 = LLT::scalar(32); 1933 1934 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1935 1936 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1937 1938 auto CvtHi = Signed ? 1939 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1940 B.buildUITOFP(S64, Unmerge.getReg(1)); 1941 1942 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1943 1944 auto ThirtyTwo = B.buildConstant(S32, 32); 1945 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1946 .addUse(CvtHi.getReg(0)) 1947 .addUse(ThirtyTwo.getReg(0)); 1948 1949 // TODO: Should this propagate fast-math-flags? 1950 B.buildFAdd(Dst, LdExp, CvtLo); 1951 MI.eraseFromParent(); 1952 return true; 1953 } 1954 1955 // TODO: Copied from DAG implementation. Verify logic and document how this 1956 // actually works. 1957 bool AMDGPULegalizerInfo::legalizeFPTOI( 1958 MachineInstr &MI, MachineRegisterInfo &MRI, 1959 MachineIRBuilder &B, bool Signed) const { 1960 1961 Register Dst = MI.getOperand(0).getReg(); 1962 Register Src = MI.getOperand(1).getReg(); 1963 1964 const LLT S64 = LLT::scalar(64); 1965 const LLT S32 = LLT::scalar(32); 1966 1967 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1968 1969 unsigned Flags = MI.getFlags(); 1970 1971 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1972 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1973 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1974 1975 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1976 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1977 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1978 1979 auto Hi = Signed ? 
1980 B.buildFPTOSI(S32, FloorMul) : 1981 B.buildFPTOUI(S32, FloorMul); 1982 auto Lo = B.buildFPTOUI(S32, Fma); 1983 1984 B.buildMerge(Dst, { Lo, Hi }); 1985 MI.eraseFromParent(); 1986 1987 return true; 1988 } 1989 1990 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1991 MachineInstr &MI) const { 1992 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1993 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1994 1995 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1996 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1997 1998 // With ieee_mode disabled, the instructions have the correct behavior 1999 // already for G_FMINNUM/G_FMAXNUM 2000 if (!MFI->getMode().IEEE) 2001 return !IsIEEEOp; 2002 2003 if (IsIEEEOp) 2004 return true; 2005 2006 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2007 } 2008 2009 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2010 MachineInstr &MI, MachineRegisterInfo &MRI, 2011 MachineIRBuilder &B) const { 2012 // TODO: Should move some of this into LegalizerHelper. 2013 2014 // TODO: Promote dynamic indexing of s16 to s32 2015 2016 // FIXME: Artifact combiner probably should have replaced the truncated 2017 // constant before this, so we shouldn't need 2018 // getConstantVRegValWithLookThrough. 2019 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2020 MI.getOperand(2).getReg(), MRI); 2021 if (!IdxVal) // Dynamic case will be selected to register indexing. 2022 return true; 2023 2024 Register Dst = MI.getOperand(0).getReg(); 2025 Register Vec = MI.getOperand(1).getReg(); 2026 2027 LLT VecTy = MRI.getType(Vec); 2028 LLT EltTy = VecTy.getElementType(); 2029 assert(EltTy == MRI.getType(Dst)); 2030 2031 if (IdxVal->Value < VecTy.getNumElements()) 2032 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2033 else 2034 B.buildUndef(Dst); 2035 2036 MI.eraseFromParent(); 2037 return true; 2038 } 2039 2040 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2041 MachineInstr &MI, MachineRegisterInfo &MRI, 2042 MachineIRBuilder &B) const { 2043 // TODO: Should move some of this into LegalizerHelper. 2044 2045 // TODO: Promote dynamic indexing of s16 to s32 2046 2047 // FIXME: Artifact combiner probably should have replaced the truncated 2048 // constant before this, so we shouldn't need 2049 // getConstantVRegValWithLookThrough. 2050 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2051 MI.getOperand(3).getReg(), MRI); 2052 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2053 return true; 2054 2055 Register Dst = MI.getOperand(0).getReg(); 2056 Register Vec = MI.getOperand(1).getReg(); 2057 Register Ins = MI.getOperand(2).getReg(); 2058 2059 LLT VecTy = MRI.getType(Vec); 2060 LLT EltTy = VecTy.getElementType(); 2061 assert(EltTy == MRI.getType(Ins)); 2062 2063 if (IdxVal->Value < VecTy.getNumElements()) 2064 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2065 else 2066 B.buildUndef(Dst); 2067 2068 MI.eraseFromParent(); 2069 return true; 2070 } 2071 2072 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2073 MachineInstr &MI, MachineRegisterInfo &MRI, 2074 MachineIRBuilder &B) const { 2075 const LLT V2S16 = LLT::vector(2, 16); 2076 2077 Register Dst = MI.getOperand(0).getReg(); 2078 Register Src0 = MI.getOperand(1).getReg(); 2079 LLT DstTy = MRI.getType(Dst); 2080 LLT SrcTy = MRI.getType(Src0); 2081 2082 if (SrcTy == V2S16 && DstTy == V2S16 && 2083 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2084 return true; 2085 2086 MachineIRBuilder HelperBuilder(MI); 2087 GISelObserverWrapper DummyObserver; 2088 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2089 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2090 } 2091 2092 bool AMDGPULegalizerInfo::legalizeSinCos( 2093 MachineInstr &MI, MachineRegisterInfo &MRI, 2094 MachineIRBuilder &B) const { 2095 2096 Register DstReg = MI.getOperand(0).getReg(); 2097 Register SrcReg = MI.getOperand(1).getReg(); 2098 LLT Ty = MRI.getType(DstReg); 2099 unsigned Flags = MI.getFlags(); 2100 2101 Register TrigVal; 2102 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2103 if (ST.hasTrigReducedRange()) { 2104 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2105 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2106 .addUse(MulVal.getReg(0)) 2107 .setMIFlags(Flags).getReg(0); 2108 } else 2109 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2110 2111 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2112 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2113 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2114 .addUse(TrigVal) 2115 .setMIFlags(Flags); 2116 MI.eraseFromParent(); 2117 return true; 2118 } 2119 2120 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2121 MachineIRBuilder &B, 2122 const GlobalValue *GV, 2123 int64_t Offset, 2124 unsigned GAFlags) const { 2125 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2126 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2127 // to the following code sequence: 2128 // 2129 // For constant address space: 2130 // s_getpc_b64 s[0:1] 2131 // s_add_u32 s0, s0, $symbol 2132 // s_addc_u32 s1, s1, 0 2133 // 2134 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2135 // a fixup or relocation is emitted to replace $symbol with a literal 2136 // constant, which is a pc-relative offset from the encoding of the $symbol 2137 // operand to the global variable. 
2138 // 2139 // For global address space: 2140 // s_getpc_b64 s[0:1] 2141 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2142 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2143 // 2144 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2145 // fixups or relocations are emitted to replace $symbol@*@lo and 2146 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2147 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2148 // operand to the global variable. 2149 // 2150 // What we want here is an offset from the value returned by s_getpc 2151 // (which is the address of the s_add_u32 instruction) to the global 2152 // variable, but since the encoding of $symbol starts 4 bytes after the start 2153 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2154 // small. This requires us to add 4 to the global variable offset in order to 2155 // compute the correct address. 2156 2157 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2158 2159 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2160 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2161 2162 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2163 .addDef(PCReg); 2164 2165 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2166 if (GAFlags == SIInstrInfo::MO_NONE) 2167 MIB.addImm(0); 2168 else 2169 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2170 2171 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2172 2173 if (PtrTy.getSizeInBits() == 32) 2174 B.buildExtract(DstReg, PCReg, 0); 2175 return true; 2176 } 2177 2178 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2179 MachineInstr &MI, MachineRegisterInfo &MRI, 2180 MachineIRBuilder &B) const { 2181 Register DstReg = MI.getOperand(0).getReg(); 2182 LLT Ty = MRI.getType(DstReg); 2183 unsigned AS = Ty.getAddressSpace(); 2184 2185 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2186 MachineFunction &MF = B.getMF(); 2187 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2188 2189 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2190 if (!MFI->isEntryFunction()) { 2191 const Function &Fn = MF.getFunction(); 2192 DiagnosticInfoUnsupported BadLDSDecl( 2193 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2194 DS_Warning); 2195 Fn.getContext().diagnose(BadLDSDecl); 2196 2197 // We currently don't have a way to correctly allocate LDS objects that 2198 // aren't directly associated with a kernel. We do force inlining of 2199 // functions that use local objects. However, if these dead functions are 2200 // not eliminated, we don't want a compile time error. Just emit a warning 2201 // and a trap, since there should be no callable path here. 2202 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2203 B.buildUndef(DstReg); 2204 MI.eraseFromParent(); 2205 return true; 2206 } 2207 2208 // TODO: We could emit code to handle the initialization somewhere. 
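// An LDS global without a usable initializer is lowered either to an
// absolute-address operand (MO_ABS32_LO) when a constant LDS address should
// not be used, or directly to its statically allocated LDS offset.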
2209 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2210 const SITargetLowering *TLI = ST.getTargetLowering(); 2211 if (!TLI->shouldUseLDSConstAddress(GV)) { 2212 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2213 return true; // Leave in place; 2214 } 2215 2216 B.buildConstant( 2217 DstReg, 2218 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2219 MI.eraseFromParent(); 2220 return true; 2221 } 2222 2223 const Function &Fn = MF.getFunction(); 2224 DiagnosticInfoUnsupported BadInit( 2225 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2226 Fn.getContext().diagnose(BadInit); 2227 return true; 2228 } 2229 2230 const SITargetLowering *TLI = ST.getTargetLowering(); 2231 2232 if (TLI->shouldEmitFixup(GV)) { 2233 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2234 MI.eraseFromParent(); 2235 return true; 2236 } 2237 2238 if (TLI->shouldEmitPCReloc(GV)) { 2239 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2240 MI.eraseFromParent(); 2241 return true; 2242 } 2243 2244 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2245 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2246 2247 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2248 MachinePointerInfo::getGOT(MF), 2249 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2250 MachineMemOperand::MOInvariant, 2251 8 /*Size*/, Align(8)); 2252 2253 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2254 2255 if (Ty.getSizeInBits() == 32) { 2256 // Truncate if this is a 32-bit constant adrdess. 2257 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2258 B.buildExtract(DstReg, Load, 0); 2259 } else 2260 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2261 2262 MI.eraseFromParent(); 2263 return true; 2264 } 2265 2266 bool AMDGPULegalizerInfo::legalizeLoad( 2267 MachineInstr &MI, MachineRegisterInfo &MRI, 2268 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2269 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2270 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2271 Observer.changingInstr(MI); 2272 MI.getOperand(1).setReg(Cast.getReg(0)); 2273 Observer.changedInstr(MI); 2274 return true; 2275 } 2276 2277 bool AMDGPULegalizerInfo::legalizeFMad( 2278 MachineInstr &MI, MachineRegisterInfo &MRI, 2279 MachineIRBuilder &B) const { 2280 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2281 assert(Ty.isScalar()); 2282 2283 MachineFunction &MF = B.getMF(); 2284 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2285 2286 // TODO: Always legal with future ftz flag. 2287 // FIXME: Do we need just output? 
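// G_FMAD is only kept legal when the matching denormal mode is flushing; with
// denormals enabled it is expanded by the generic lowerFMad helper below.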
2288 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2289 return true; 2290 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2291 return true; 2292 2293 MachineIRBuilder HelperBuilder(MI); 2294 GISelObserverWrapper DummyObserver; 2295 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2296 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2297 } 2298 2299 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2300 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2301 Register DstReg = MI.getOperand(0).getReg(); 2302 Register PtrReg = MI.getOperand(1).getReg(); 2303 Register CmpVal = MI.getOperand(2).getReg(); 2304 Register NewVal = MI.getOperand(3).getReg(); 2305 2306 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2307 "this should not have been custom lowered"); 2308 2309 LLT ValTy = MRI.getType(CmpVal); 2310 LLT VecTy = LLT::vector(2, ValTy); 2311 2312 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2313 2314 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2315 .addDef(DstReg) 2316 .addUse(PtrReg) 2317 .addUse(PackedVal) 2318 .setMemRefs(MI.memoperands()); 2319 2320 MI.eraseFromParent(); 2321 return true; 2322 } 2323 2324 bool AMDGPULegalizerInfo::legalizeFlog( 2325 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2326 Register Dst = MI.getOperand(0).getReg(); 2327 Register Src = MI.getOperand(1).getReg(); 2328 LLT Ty = B.getMRI()->getType(Dst); 2329 unsigned Flags = MI.getFlags(); 2330 2331 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2332 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2333 2334 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2335 MI.eraseFromParent(); 2336 return true; 2337 } 2338 2339 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2340 MachineIRBuilder &B) const { 2341 Register Dst = MI.getOperand(0).getReg(); 2342 Register Src = MI.getOperand(1).getReg(); 2343 unsigned Flags = MI.getFlags(); 2344 LLT Ty = B.getMRI()->getType(Dst); 2345 2346 auto K = B.buildFConstant(Ty, numbers::log2e); 2347 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2348 B.buildFExp2(Dst, Mul, Flags); 2349 MI.eraseFromParent(); 2350 return true; 2351 } 2352 2353 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2354 MachineIRBuilder &B) const { 2355 Register Dst = MI.getOperand(0).getReg(); 2356 Register Src0 = MI.getOperand(1).getReg(); 2357 Register Src1 = MI.getOperand(2).getReg(); 2358 unsigned Flags = MI.getFlags(); 2359 LLT Ty = B.getMRI()->getType(Dst); 2360 const LLT S16 = LLT::scalar(16); 2361 const LLT S32 = LLT::scalar(32); 2362 2363 if (Ty == S32) { 2364 auto Log = B.buildFLog2(S32, Src0, Flags); 2365 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2366 .addUse(Log.getReg(0)) 2367 .addUse(Src1) 2368 .setMIFlags(Flags); 2369 B.buildFExp2(Dst, Mul, Flags); 2370 } else if (Ty == S16) { 2371 // There's no f16 fmul_legacy, so we need to convert for it. 
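// The expansion is pow(x, y) == exp2(y * log2(x)); for f16 the multiply is
// done in f32 through fmul_legacy and the product truncated back to f16.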
2372 auto Log = B.buildFLog2(S16, Src0, Flags); 2373 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2374 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2375 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2376 .addUse(Ext0.getReg(0)) 2377 .addUse(Ext1.getReg(0)) 2378 .setMIFlags(Flags); 2379 2380 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2381 } else 2382 return false; 2383 2384 MI.eraseFromParent(); 2385 return true; 2386 } 2387 2388 // Find a source register, ignoring any possible source modifiers. 2389 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2390 Register ModSrc = OrigSrc; 2391 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2392 ModSrc = SrcFNeg->getOperand(1).getReg(); 2393 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2394 ModSrc = SrcFAbs->getOperand(1).getReg(); 2395 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2396 ModSrc = SrcFAbs->getOperand(1).getReg(); 2397 return ModSrc; 2398 } 2399 2400 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2401 MachineRegisterInfo &MRI, 2402 MachineIRBuilder &B) const { 2403 2404 const LLT S1 = LLT::scalar(1); 2405 const LLT S64 = LLT::scalar(64); 2406 Register Dst = MI.getOperand(0).getReg(); 2407 Register OrigSrc = MI.getOperand(1).getReg(); 2408 unsigned Flags = MI.getFlags(); 2409 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2410 "this should not have been custom lowered"); 2411 2412 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2413 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2414 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2415 // V_FRACT bug is: 2416 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2417 // 2418 // Convert floor(x) to (x - fract(x)) 2419 2420 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2421 .addUse(OrigSrc) 2422 .setMIFlags(Flags); 2423 2424 // Give source modifier matching some assistance before obscuring a foldable 2425 // pattern. 2426 2427 // TODO: We can avoid the neg on the fract? The input sign to fract 2428 // shouldn't matter? 2429 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2430 2431 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2432 2433 Register Min = MRI.createGenericVirtualRegister(S64); 2434 2435 // We don't need to concern ourselves with the snan handling difference, so 2436 // use the one which will directly select. 2437 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2438 if (MFI->getMode().IEEE) 2439 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2440 else 2441 B.buildFMinNum(Min, Fract, Const, Flags); 2442 2443 Register CorrectedFract = Min; 2444 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2445 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2446 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2447 } 2448 2449 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2450 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2451 2452 MI.eraseFromParent(); 2453 return true; 2454 } 2455 2456 // Turn an illegal packed v2s16 build vector into bit operations. 2457 // TODO: This should probably be a bitcast action in LegalizerHelper. 
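// The two s16 sources are merged into a single s32 and the result bitcast to
// <2 x s16>, e.g.:
//   %merge:_(s32) = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge(s32)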
2458 bool AMDGPULegalizerInfo::legalizeBuildVector( 2459 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2460 Register Dst = MI.getOperand(0).getReg(); 2461 const LLT S32 = LLT::scalar(32); 2462 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2463 2464 Register Src0 = MI.getOperand(1).getReg(); 2465 Register Src1 = MI.getOperand(2).getReg(); 2466 assert(MRI.getType(Src0) == LLT::scalar(16)); 2467 2468 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2469 B.buildBitcast(Dst, Merge); 2470 2471 MI.eraseFromParent(); 2472 return true; 2473 } 2474 2475 // Return the use branch instruction, otherwise null if the usage is invalid. 2476 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2477 MachineRegisterInfo &MRI, 2478 MachineInstr *&Br, 2479 MachineBasicBlock *&UncondBrTarget) { 2480 Register CondDef = MI.getOperand(0).getReg(); 2481 if (!MRI.hasOneNonDBGUse(CondDef)) 2482 return nullptr; 2483 2484 MachineBasicBlock *Parent = MI.getParent(); 2485 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2486 if (UseMI.getParent() != Parent || 2487 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2488 return nullptr; 2489 2490 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2491 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2492 if (Next == Parent->end()) { 2493 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2494 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2495 return nullptr; 2496 UncondBrTarget = &*NextMBB; 2497 } else { 2498 if (Next->getOpcode() != AMDGPU::G_BR) 2499 return nullptr; 2500 Br = &*Next; 2501 UncondBrTarget = Br->getOperand(0).getMBB(); 2502 } 2503 2504 return &UseMI; 2505 } 2506 2507 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2508 const ArgDescriptor *Arg, 2509 const TargetRegisterClass *ArgRC, 2510 LLT ArgTy) const { 2511 MCRegister SrcReg = Arg->getRegister(); 2512 assert(SrcReg.isPhysical() && "Physical register expected"); 2513 assert(DstReg.isVirtual() && "Virtual register expected"); 2514 2515 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2516 ArgTy); 2517 if (Arg->isMasked()) { 2518 // TODO: Should we try to emit this once in the entry block? 
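// A masked argument occupies only a bit-field of its physical register:
// shift the live-in right by the mask's trailing zero count and then AND with
// the shifted mask to extract the field.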
2519 const LLT S32 = LLT::scalar(32); 2520 const unsigned Mask = Arg->getMask(); 2521 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2522 2523 Register AndMaskSrc = LiveIn; 2524 2525 if (Shift != 0) { 2526 auto ShiftAmt = B.buildConstant(S32, Shift); 2527 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2528 } 2529 2530 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2531 } else { 2532 B.buildCopy(DstReg, LiveIn); 2533 } 2534 2535 return true; 2536 } 2537 2538 bool AMDGPULegalizerInfo::loadInputValue( 2539 Register DstReg, MachineIRBuilder &B, 2540 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2541 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2542 const ArgDescriptor *Arg; 2543 const TargetRegisterClass *ArgRC; 2544 LLT ArgTy; 2545 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2546 2547 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2548 return false; // TODO: Handle these 2549 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2550 } 2551 2552 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2553 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2554 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2555 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2556 return false; 2557 2558 MI.eraseFromParent(); 2559 return true; 2560 } 2561 2562 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2563 MachineRegisterInfo &MRI, 2564 MachineIRBuilder &B) const { 2565 Register Dst = MI.getOperand(0).getReg(); 2566 LLT DstTy = MRI.getType(Dst); 2567 LLT S16 = LLT::scalar(16); 2568 LLT S32 = LLT::scalar(32); 2569 LLT S64 = LLT::scalar(64); 2570 2571 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2572 return true; 2573 2574 if (DstTy == S16) 2575 return legalizeFDIV16(MI, MRI, B); 2576 if (DstTy == S32) 2577 return legalizeFDIV32(MI, MRI, B); 2578 if (DstTy == S64) 2579 return legalizeFDIV64(MI, MRI, B); 2580 2581 return false; 2582 } 2583 2584 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2585 Register DstReg, 2586 Register X, 2587 Register Y, 2588 bool IsDiv) const { 2589 const LLT S1 = LLT::scalar(1); 2590 const LLT S32 = LLT::scalar(32); 2591 2592 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2593 // algorithm used here. 2594 2595 // Initial estimate of inv(y). 2596 auto FloatY = B.buildUITOFP(S32, Y); 2597 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2598 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2599 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2600 auto Z = B.buildFPTOUI(S32, ScaledY); 2601 2602 // One round of UNR. 2603 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2604 auto NegYZ = B.buildMul(S32, NegY, Z); 2605 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2606 2607 // Quotient/remainder estimate. 2608 auto Q = B.buildUMulH(S32, X, Z); 2609 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2610 2611 // First quotient/remainder refinement. 2612 auto One = B.buildConstant(S32, 1); 2613 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2614 if (IsDiv) 2615 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2616 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2617 2618 // Second quotient/remainder refinement. 
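// After the single Newton-Raphson round above the quotient estimate can be
// short by up to two, so two conditional corrections (Q += 1, R -= Y while
// R >= Y) are enough to produce the exact quotient/remainder.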
2619 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2620 if (IsDiv) 2621 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2622 else 2623 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2624 } 2625 2626 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2627 MachineRegisterInfo &MRI, 2628 MachineIRBuilder &B) const { 2629 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2630 Register DstReg = MI.getOperand(0).getReg(); 2631 Register Num = MI.getOperand(1).getReg(); 2632 Register Den = MI.getOperand(2).getReg(); 2633 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2634 MI.eraseFromParent(); 2635 return true; 2636 } 2637 2638 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2639 // 2640 // Return lo, hi of result 2641 // 2642 // %cvt.lo = G_UITOFP Val.lo 2643 // %cvt.hi = G_UITOFP Val.hi 2644 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2645 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2646 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2647 // %mul2 = G_FMUL %mul1, 2**(-32) 2648 // %trunc = G_INTRINSIC_TRUNC %mul2 2649 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2650 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2651 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2652 Register Val) { 2653 const LLT S32 = LLT::scalar(32); 2654 auto Unmerge = B.buildUnmerge(S32, Val); 2655 2656 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2657 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2658 2659 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2660 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2661 2662 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2663 auto Mul1 = 2664 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2665 2666 // 2**(-32) 2667 auto Mul2 = 2668 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2669 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2670 2671 // -(2**32) 2672 auto Mad2 = B.buildFMAD(S32, Trunc, 2673 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2674 2675 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2676 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2677 2678 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2679 } 2680 2681 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2682 Register DstReg, 2683 Register Numer, 2684 Register Denom, 2685 bool IsDiv) const { 2686 const LLT S32 = LLT::scalar(32); 2687 const LLT S64 = LLT::scalar(64); 2688 const LLT S1 = LLT::scalar(1); 2689 Register RcpLo, RcpHi; 2690 2691 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2692 2693 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2694 2695 auto Zero64 = B.buildConstant(S64, 0); 2696 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2697 2698 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2699 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2700 2701 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2702 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2703 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2704 2705 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2706 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2707 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2708 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2709 2710 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2711 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2712 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2713 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2714 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2715 2716 auto Zero32 = 
B.buildConstant(S32, 0); 2717 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2718 auto Add2_HiC = 2719 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2720 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2721 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2722 2723 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2724 Register NumerLo = UnmergeNumer.getReg(0); 2725 Register NumerHi = UnmergeNumer.getReg(1); 2726 2727 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2728 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2729 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2730 Register Mul3_Lo = UnmergeMul3.getReg(0); 2731 Register Mul3_Hi = UnmergeMul3.getReg(1); 2732 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2733 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2734 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2735 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2736 2737 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2738 Register DenomLo = UnmergeDenom.getReg(0); 2739 Register DenomHi = UnmergeDenom.getReg(1); 2740 2741 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2742 auto C1 = B.buildSExt(S32, CmpHi); 2743 2744 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2745 auto C2 = B.buildSExt(S32, CmpLo); 2746 2747 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2748 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2749 2750 // TODO: Here and below portions of the code can be enclosed into if/endif. 2751 // Currently control flow is unconditional and we have 4 selects after 2752 // potential endif to substitute PHIs. 2753 2754 // if C3 != 0 ... 2755 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2756 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2757 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2758 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2759 2760 auto One64 = B.buildConstant(S64, 1); 2761 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2762 2763 auto C4 = 2764 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2765 auto C5 = 2766 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2767 auto C6 = B.buildSelect( 2768 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2769 2770 // if (C6 != 0) 2771 auto Add4 = B.buildAdd(S64, Add3, One64); 2772 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2773 2774 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2775 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2776 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2777 2778 // endif C6 2779 // endif C3 2780 2781 if (IsDiv) { 2782 auto Sel1 = B.buildSelect( 2783 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2784 B.buildSelect(DstReg, 2785 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2786 } else { 2787 auto Sel2 = B.buildSelect( 2788 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2789 B.buildSelect(DstReg, 2790 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2791 } 2792 } 2793 2794 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2795 MachineRegisterInfo &MRI, 2796 MachineIRBuilder &B) const { 2797 const LLT S64 = LLT::scalar(64); 2798 const LLT S32 = LLT::scalar(32); 2799 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2800 Register DstReg = MI.getOperand(0).getReg(); 2801 Register Num 
= MI.getOperand(1).getReg(); 2802 Register Den = MI.getOperand(2).getReg(); 2803 LLT Ty = MRI.getType(DstReg); 2804 2805 if (Ty == S32) 2806 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2807 else if (Ty == S64) 2808 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2809 else 2810 return false; 2811 2812 MI.eraseFromParent(); 2813 return true; 2814 2815 } 2816 2817 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2818 MachineRegisterInfo &MRI, 2819 MachineIRBuilder &B) const { 2820 const LLT S64 = LLT::scalar(64); 2821 const LLT S32 = LLT::scalar(32); 2822 2823 Register DstReg = MI.getOperand(0).getReg(); 2824 const LLT Ty = MRI.getType(DstReg); 2825 if (Ty != S32 && Ty != S64) 2826 return false; 2827 2828 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2829 2830 Register LHS = MI.getOperand(1).getReg(); 2831 Register RHS = MI.getOperand(2).getReg(); 2832 2833 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2834 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2835 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2836 2837 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2838 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2839 2840 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2841 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2842 2843 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2844 if (Ty == S32) 2845 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2846 else 2847 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2848 2849 Register Sign; 2850 if (IsDiv) 2851 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2852 else 2853 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2854 2855 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2856 B.buildSub(DstReg, UDivRem, Sign); 2857 2858 MI.eraseFromParent(); 2859 return true; 2860 } 2861 2862 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2863 MachineRegisterInfo &MRI, 2864 MachineIRBuilder &B) const { 2865 Register Res = MI.getOperand(0).getReg(); 2866 Register LHS = MI.getOperand(1).getReg(); 2867 Register RHS = MI.getOperand(2).getReg(); 2868 2869 uint16_t Flags = MI.getFlags(); 2870 2871 LLT ResTy = MRI.getType(Res); 2872 LLT S32 = LLT::scalar(32); 2873 LLT S64 = LLT::scalar(64); 2874 2875 const MachineFunction &MF = B.getMF(); 2876 bool Unsafe = 2877 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2878 2879 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2880 return false; 2881 2882 if (!Unsafe && ResTy == S32 && 2883 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2884 return false; 2885 2886 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2887 // 1 / x -> RCP(x) 2888 if (CLHS->isExactlyValue(1.0)) { 2889 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2890 .addUse(RHS) 2891 .setMIFlags(Flags); 2892 2893 MI.eraseFromParent(); 2894 return true; 2895 } 2896 2897 // -1 / x -> RCP( FNEG(x) ) 2898 if (CLHS->isExactlyValue(-1.0)) { 2899 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2900 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2901 .addUse(FNeg.getReg(0)) 2902 .setMIFlags(Flags); 2903 2904 MI.eraseFromParent(); 2905 return true; 2906 } 2907 } 2908 2909 // x / y -> x * (1.0 / y) 2910 if (Unsafe) { 2911 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2912 .addUse(RHS) 2913 .setMIFlags(Flags); 2914 B.buildFMul(Res, LHS, RCP, Flags); 2915 2916 MI.eraseFromParent(); 2917 return true; 2918 } 2919 2920 return false; 2921 } 2922 2923 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2924 MachineRegisterInfo &MRI, 2925 MachineIRBuilder &B) const { 2926 Register Res = MI.getOperand(0).getReg(); 2927 Register LHS = MI.getOperand(1).getReg(); 2928 Register RHS = MI.getOperand(2).getReg(); 2929 2930 uint16_t Flags = MI.getFlags(); 2931 2932 LLT S16 = LLT::scalar(16); 2933 LLT S32 = LLT::scalar(32); 2934 2935 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2936 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2937 2938 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2939 .addUse(RHSExt.getReg(0)) 2940 .setMIFlags(Flags); 2941 2942 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2943 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2944 2945 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2946 .addUse(RDst.getReg(0)) 2947 .addUse(RHS) 2948 .addUse(LHS) 2949 .setMIFlags(Flags); 2950 2951 MI.eraseFromParent(); 2952 return true; 2953 } 2954 2955 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2956 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2957 static void toggleSPDenormMode(bool Enable, 2958 MachineIRBuilder &B, 2959 const GCNSubtarget &ST, 2960 AMDGPU::SIModeRegisterDefaults Mode) { 2961 // Set SP denorm mode to this value. 2962 unsigned SPDenormMode = 2963 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2964 2965 if (ST.hasDenormModeInst()) { 2966 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2967 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2968 2969 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2970 B.buildInstr(AMDGPU::S_DENORM_MODE) 2971 .addImm(NewDenormModeValue); 2972 2973 } else { 2974 // Select FP32 bit field in mode register. 2975 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2976 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2977 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2978 2979 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2980 .addImm(SPDenormMode) 2981 .addImm(SPDenormModeBitField); 2982 } 2983 } 2984 2985 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2986 MachineRegisterInfo &MRI, 2987 MachineIRBuilder &B) const { 2988 Register Res = MI.getOperand(0).getReg(); 2989 Register LHS = MI.getOperand(1).getReg(); 2990 Register RHS = MI.getOperand(2).getReg(); 2991 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2992 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2993 2994 uint16_t Flags = MI.getFlags(); 2995 2996 LLT S32 = LLT::scalar(32); 2997 LLT S1 = LLT::scalar(1); 2998 2999 auto One = B.buildFConstant(S32, 1.0f); 3000 3001 auto DenominatorScaled = 3002 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3003 .addUse(LHS) 3004 .addUse(RHS) 3005 .addImm(0) 3006 .setMIFlags(Flags); 3007 auto NumeratorScaled = 3008 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3009 .addUse(LHS) 3010 .addUse(RHS) 3011 .addImm(1) 3012 .setMIFlags(Flags); 3013 3014 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3015 .addUse(DenominatorScaled.getReg(0)) 3016 .setMIFlags(Flags); 3017 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3018 3019 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3020 // aren't modeled as reading it. 
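// If FP32 denormals are flushed by default, temporarily enable them around
// the FMA refinement chain below (its intermediate results may be denormal
// and must not be flushed), then restore the original mode before div_fmas.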
3021 if (!Mode.allFP32Denormals()) 3022 toggleSPDenormMode(true, B, ST, Mode); 3023 3024 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3025 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3026 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3027 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3028 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3029 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3030 3031 if (!Mode.allFP32Denormals()) 3032 toggleSPDenormMode(false, B, ST, Mode); 3033 3034 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3035 .addUse(Fma4.getReg(0)) 3036 .addUse(Fma1.getReg(0)) 3037 .addUse(Fma3.getReg(0)) 3038 .addUse(NumeratorScaled.getReg(1)) 3039 .setMIFlags(Flags); 3040 3041 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3042 .addUse(Fmas.getReg(0)) 3043 .addUse(RHS) 3044 .addUse(LHS) 3045 .setMIFlags(Flags); 3046 3047 MI.eraseFromParent(); 3048 return true; 3049 } 3050 3051 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3052 MachineRegisterInfo &MRI, 3053 MachineIRBuilder &B) const { 3054 Register Res = MI.getOperand(0).getReg(); 3055 Register LHS = MI.getOperand(1).getReg(); 3056 Register RHS = MI.getOperand(2).getReg(); 3057 3058 uint16_t Flags = MI.getFlags(); 3059 3060 LLT S64 = LLT::scalar(64); 3061 LLT S1 = LLT::scalar(1); 3062 3063 auto One = B.buildFConstant(S64, 1.0); 3064 3065 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3066 .addUse(LHS) 3067 .addUse(RHS) 3068 .addImm(0) 3069 .setMIFlags(Flags); 3070 3071 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3072 3073 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3074 .addUse(DivScale0.getReg(0)) 3075 .setMIFlags(Flags); 3076 3077 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3078 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3079 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3080 3081 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3082 .addUse(LHS) 3083 .addUse(RHS) 3084 .addImm(1) 3085 .setMIFlags(Flags); 3086 3087 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3088 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3089 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3090 3091 Register Scale; 3092 if (!ST.hasUsableDivScaleConditionOutput()) { 3093 // Workaround a hardware bug on SI where the condition output from div_scale 3094 // is not usable. 
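// Recompute the condition div_fmas needs: compare the high halves of the
// scaled values with the original operands to see which input was scaled,
// and xor the two results.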
3095 3096 LLT S32 = LLT::scalar(32); 3097 3098 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3099 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3100 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3101 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3102 3103 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3104 Scale1Unmerge.getReg(1)); 3105 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3106 Scale0Unmerge.getReg(1)); 3107 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3108 } else { 3109 Scale = DivScale1.getReg(1); 3110 } 3111 3112 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3113 .addUse(Fma4.getReg(0)) 3114 .addUse(Fma3.getReg(0)) 3115 .addUse(Mul.getReg(0)) 3116 .addUse(Scale) 3117 .setMIFlags(Flags); 3118 3119 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3120 .addUse(Fmas.getReg(0)) 3121 .addUse(RHS) 3122 .addUse(LHS) 3123 .setMIFlags(Flags); 3124 3125 MI.eraseFromParent(); 3126 return true; 3127 } 3128 3129 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3130 MachineRegisterInfo &MRI, 3131 MachineIRBuilder &B) const { 3132 Register Res = MI.getOperand(0).getReg(); 3133 Register LHS = MI.getOperand(2).getReg(); 3134 Register RHS = MI.getOperand(3).getReg(); 3135 uint16_t Flags = MI.getFlags(); 3136 3137 LLT S32 = LLT::scalar(32); 3138 LLT S1 = LLT::scalar(1); 3139 3140 auto Abs = B.buildFAbs(S32, RHS, Flags); 3141 const APFloat C0Val(1.0f); 3142 3143 auto C0 = B.buildConstant(S32, 0x6f800000); 3144 auto C1 = B.buildConstant(S32, 0x2f800000); 3145 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3146 3147 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3148 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3149 3150 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3151 3152 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3153 .addUse(Mul0.getReg(0)) 3154 .setMIFlags(Flags); 3155 3156 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3157 3158 B.buildFMul(Res, Sel, Mul1, Flags); 3159 3160 MI.eraseFromParent(); 3161 return true; 3162 } 3163 3164 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3165 // FIXME: Why do we handle this one but not other removed instructions? 3166 // 3167 // Reciprocal square root. The clamp prevents infinite results, clamping 3168 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3169 // +-max_float. 3170 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3171 MachineRegisterInfo &MRI, 3172 MachineIRBuilder &B) const { 3173 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3174 return true; 3175 3176 Register Dst = MI.getOperand(0).getReg(); 3177 Register Src = MI.getOperand(2).getReg(); 3178 auto Flags = MI.getFlags(); 3179 3180 LLT Ty = MRI.getType(Dst); 3181 3182 const fltSemantics *FltSemantics; 3183 if (Ty == LLT::scalar(32)) 3184 FltSemantics = &APFloat::IEEEsingle(); 3185 else if (Ty == LLT::scalar(64)) 3186 FltSemantics = &APFloat::IEEEdouble(); 3187 else 3188 return false; 3189 3190 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3191 .addUse(Src) 3192 .setMIFlags(Flags); 3193 3194 // We don't need to concern ourselves with the snan handling difference, since 3195 // the rsq quieted (or not) so use the one which will directly select. 
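// Clamp the rsq result to [-max_float, +max_float] with a min/max pair,
// using the IEEE or non-IEEE variants to match the function's FP mode so
// they select directly.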
3196 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3197 const bool UseIEEE = MFI->getMode().IEEE; 3198 3199 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3200 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3201 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3202 3203 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3204 3205 if (UseIEEE) 3206 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3207 else 3208 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3209 MI.eraseFromParent(); 3210 return true; 3211 } 3212 3213 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3214 switch (IID) { 3215 case Intrinsic::amdgcn_ds_fadd: 3216 return AMDGPU::G_ATOMICRMW_FADD; 3217 case Intrinsic::amdgcn_ds_fmin: 3218 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3219 case Intrinsic::amdgcn_ds_fmax: 3220 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3221 default: 3222 llvm_unreachable("not a DS FP intrinsic"); 3223 } 3224 } 3225 3226 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3227 MachineInstr &MI, 3228 Intrinsic::ID IID) const { 3229 GISelChangeObserver &Observer = Helper.Observer; 3230 Observer.changingInstr(MI); 3231 3232 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3233 3234 // The remaining operands were used to set fields in the MemOperand on 3235 // construction. 3236 for (int I = 6; I > 3; --I) 3237 MI.RemoveOperand(I); 3238 3239 MI.RemoveOperand(1); // Remove the intrinsic ID. 3240 Observer.changedInstr(MI); 3241 return true; 3242 } 3243 3244 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3245 MachineRegisterInfo &MRI, 3246 MachineIRBuilder &B) const { 3247 uint64_t Offset = 3248 ST.getTargetLowering()->getImplicitParameterOffset( 3249 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3250 LLT DstTy = MRI.getType(DstReg); 3251 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3252 3253 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3254 if (!loadInputValue(KernargPtrReg, B, 3255 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3256 return false; 3257 3258 // FIXME: This should be nuw 3259 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3260 return true; 3261 } 3262 3263 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3264 MachineRegisterInfo &MRI, 3265 MachineIRBuilder &B) const { 3266 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3267 if (!MFI->isEntryFunction()) { 3268 return legalizePreloadedArgIntrin(MI, MRI, B, 3269 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3270 } 3271 3272 Register DstReg = MI.getOperand(0).getReg(); 3273 if (!getImplicitArgPtr(DstReg, MRI, B)) 3274 return false; 3275 3276 MI.eraseFromParent(); 3277 return true; 3278 } 3279 3280 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3281 MachineRegisterInfo &MRI, 3282 MachineIRBuilder &B, 3283 unsigned AddrSpace) const { 3284 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3285 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3286 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3287 MI.eraseFromParent(); 3288 return true; 3289 } 3290 3291 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3292 // offset (the offset that is included in bounds checking and swizzling, to be 3293 // split between the instruction's voffset and immoffset fields) and soffset 3294 // (the offset that is 
excluded from bounds checking and swizzling, to go in 3295 // the instruction's soffset field). This function takes the first kind of 3296 // offset and figures out how to split it between voffset and immoffset. 3297 std::tuple<Register, unsigned, unsigned> 3298 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3299 Register OrigOffset) const { 3300 const unsigned MaxImm = 4095; 3301 Register BaseReg; 3302 unsigned TotalConstOffset; 3303 MachineInstr *OffsetDef; 3304 const LLT S32 = LLT::scalar(32); 3305 3306 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3307 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3308 3309 unsigned ImmOffset = TotalConstOffset; 3310 3311 // If the immediate value is too big for the immoffset field, put the value 3312 // and -4096 into the immoffset field so that the value that is copied/added 3313 // for the voffset field is a multiple of 4096, and it stands more chance 3314 // of being CSEd with the copy/add for another similar load/store. 3315 // However, do not do that rounding down to a multiple of 4096 if that is a 3316 // negative number, as it appears to be illegal to have a negative offset 3317 // in the vgpr, even if adding the immediate offset makes it positive. 3318 unsigned Overflow = ImmOffset & ~MaxImm; 3319 ImmOffset -= Overflow; 3320 if ((int32_t)Overflow < 0) { 3321 Overflow += ImmOffset; 3322 ImmOffset = 0; 3323 } 3324 3325 if (Overflow != 0) { 3326 if (!BaseReg) { 3327 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3328 } else { 3329 auto OverflowVal = B.buildConstant(S32, Overflow); 3330 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3331 } 3332 } 3333 3334 if (!BaseReg) 3335 BaseReg = B.buildConstant(S32, 0).getReg(0); 3336 3337 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3338 } 3339 3340 /// Handle register layout difference for f16 images for some subtargets. 3341 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3342 MachineRegisterInfo &MRI, 3343 Register Reg) const { 3344 if (!ST.hasUnpackedD16VMem()) 3345 return Reg; 3346 3347 const LLT S16 = LLT::scalar(16); 3348 const LLT S32 = LLT::scalar(32); 3349 LLT StoreVT = MRI.getType(Reg); 3350 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3351 3352 auto Unmerge = B.buildUnmerge(S16, Reg); 3353 3354 SmallVector<Register, 4> WideRegs; 3355 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3356 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3357 3358 int NumElts = StoreVT.getNumElements(); 3359 3360 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3361 } 3362 3363 Register AMDGPULegalizerInfo::fixStoreSourceType( 3364 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3365 MachineRegisterInfo *MRI = B.getMRI(); 3366 LLT Ty = MRI->getType(VData); 3367 3368 const LLT S16 = LLT::scalar(16); 3369 3370 // Fixup illegal register types for i8 stores. 
3371 if (Ty == LLT::scalar(8) || Ty == S16) { 3372 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3373 return AnyExt; 3374 } 3375 3376 if (Ty.isVector()) { 3377 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3378 if (IsFormat) 3379 return handleD16VData(B, *MRI, VData); 3380 } 3381 } 3382 3383 return VData; 3384 } 3385 3386 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3387 MachineRegisterInfo &MRI, 3388 MachineIRBuilder &B, 3389 bool IsTyped, 3390 bool IsFormat) const { 3391 Register VData = MI.getOperand(1).getReg(); 3392 LLT Ty = MRI.getType(VData); 3393 LLT EltTy = Ty.getScalarType(); 3394 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3395 const LLT S32 = LLT::scalar(32); 3396 3397 VData = fixStoreSourceType(B, VData, IsFormat); 3398 Register RSrc = MI.getOperand(2).getReg(); 3399 3400 MachineMemOperand *MMO = *MI.memoperands_begin(); 3401 const int MemSize = MMO->getSize(); 3402 3403 unsigned ImmOffset; 3404 unsigned TotalOffset; 3405 3406 // The typed intrinsics add an immediate after the registers. 3407 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3408 3409 // The struct intrinsic variants add one additional operand over raw. 3410 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3411 Register VIndex; 3412 int OpOffset = 0; 3413 if (HasVIndex) { 3414 VIndex = MI.getOperand(3).getReg(); 3415 OpOffset = 1; 3416 } 3417 3418 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3419 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3420 3421 unsigned Format = 0; 3422 if (IsTyped) { 3423 Format = MI.getOperand(5 + OpOffset).getImm(); 3424 ++OpOffset; 3425 } 3426 3427 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3428 3429 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3430 if (TotalOffset != 0) 3431 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3432 3433 unsigned Opc; 3434 if (IsTyped) { 3435 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3436 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3437 } else if (IsFormat) { 3438 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3439 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3440 } else { 3441 switch (MemSize) { 3442 case 1: 3443 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3444 break; 3445 case 2: 3446 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3447 break; 3448 default: 3449 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3450 break; 3451 } 3452 } 3453 3454 if (!VIndex) 3455 VIndex = B.buildConstant(S32, 0).getReg(0); 3456 3457 auto MIB = B.buildInstr(Opc) 3458 .addUse(VData) // vdata 3459 .addUse(RSrc) // rsrc 3460 .addUse(VIndex) // vindex 3461 .addUse(VOffset) // voffset 3462 .addUse(SOffset) // soffset 3463 .addImm(ImmOffset); // offset(imm) 3464 3465 if (IsTyped) 3466 MIB.addImm(Format); 3467 3468 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3469 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3470 .addMemOperand(MMO); 3471 3472 MI.eraseFromParent(); 3473 return true; 3474 } 3475 3476 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3477 MachineRegisterInfo &MRI, 3478 MachineIRBuilder &B, 3479 bool IsFormat, 3480 bool IsTyped) const { 3481 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3482 MachineMemOperand *MMO = *MI.memoperands_begin(); 3483 const int MemSize = MMO->getSize(); 3484 const LLT S32 = LLT::scalar(32); 3485 3486 Register Dst = MI.getOperand(0).getReg(); 3487 Register RSrc = MI.getOperand(2).getReg(); 3488 3489 // The typed intrinsics add an immediate after the registers. 3490 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3491 3492 // The struct intrinsic variants add one additional operand over raw. 3493 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3494 Register VIndex; 3495 int OpOffset = 0; 3496 if (HasVIndex) { 3497 VIndex = MI.getOperand(3).getReg(); 3498 OpOffset = 1; 3499 } 3500 3501 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3502 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3503 3504 unsigned Format = 0; 3505 if (IsTyped) { 3506 Format = MI.getOperand(5 + OpOffset).getImm(); 3507 ++OpOffset; 3508 } 3509 3510 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3511 unsigned ImmOffset; 3512 unsigned TotalOffset; 3513 3514 LLT Ty = MRI.getType(Dst); 3515 LLT EltTy = Ty.getScalarType(); 3516 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3517 const bool Unpacked = ST.hasUnpackedD16VMem(); 3518 3519 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3520 if (TotalOffset != 0) 3521 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3522 3523 unsigned Opc; 3524 3525 if (IsTyped) { 3526 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3527 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3528 } else if (IsFormat) { 3529 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3530 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3531 } else { 3532 switch (MemSize) { 3533 case 1: 3534 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3535 break; 3536 case 2: 3537 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3538 break; 3539 default: 3540 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3541 break; 3542 } 3543 } 3544 3545 Register LoadDstReg; 3546 3547 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3548 LLT UnpackedTy = Ty.changeElementSize(32); 3549 3550 if (IsExtLoad) 3551 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3552 else if (Unpacked && IsD16 && Ty.isVector()) 3553 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3554 else 3555 LoadDstReg = Dst; 3556 3557 if (!VIndex) 3558 VIndex = B.buildConstant(S32, 0).getReg(0); 3559 3560 auto MIB = B.buildInstr(Opc) 3561 .addDef(LoadDstReg) // vdata 3562 .addUse(RSrc) // rsrc 3563 .addUse(VIndex) // vindex 3564 .addUse(VOffset) // voffset 3565 .addUse(SOffset) // soffset 3566 .addImm(ImmOffset); // offset(imm) 3567 3568 if (IsTyped) 3569 MIB.addImm(Format); 3570 3571 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3572 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3573 .addMemOperand(MMO); 3574 3575 if (LoadDstReg != Dst) { 3576 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3577 3578 // Widen result for extending loads was widened. 
3579 if (IsExtLoad) 3580 B.buildTrunc(Dst, LoadDstReg); 3581 else { 3582 // Repack to original 16-bit vector result 3583 // FIXME: G_TRUNC should work, but legalization currently fails 3584 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3585 SmallVector<Register, 4> Repack; 3586 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3587 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3588 B.buildMerge(Dst, Repack); 3589 } 3590 } 3591 3592 MI.eraseFromParent(); 3593 return true; 3594 } 3595 3596 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3597 MachineIRBuilder &B, 3598 bool IsInc) const { 3599 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3600 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3601 B.buildInstr(Opc) 3602 .addDef(MI.getOperand(0).getReg()) 3603 .addUse(MI.getOperand(2).getReg()) 3604 .addUse(MI.getOperand(3).getReg()) 3605 .cloneMemRefs(MI); 3606 MI.eraseFromParent(); 3607 return true; 3608 } 3609 3610 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3611 switch (IntrID) { 3612 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3613 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3614 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3615 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3616 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3617 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3618 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3619 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3620 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3621 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3622 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3623 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3624 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3625 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3626 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3627 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3628 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3629 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3630 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3631 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3632 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3633 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3634 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3635 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3636 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3637 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3638 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3639 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3640 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3641 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3642 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3643 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3644 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3645 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3646 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3647 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3648 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3649 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3650 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3651 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3652 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3653 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3654 default: 3655 llvm_unreachable("unhandled atomic opcode"); 3656 } 3657 } 3658 3659 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3660 MachineIRBuilder &B, 3661 Intrinsic::ID IID) const { 3662 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3663 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3664 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3665 3666 Register Dst; 3667 3668 int OpOffset = 0; 3669 if (HasReturn) { 3670 // A few FP atomics do not support return values. 3671 Dst = MI.getOperand(0).getReg(); 3672 } else { 3673 OpOffset = -1; 3674 } 3675 3676 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3677 Register CmpVal; 3678 3679 if (IsCmpSwap) { 3680 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3681 ++OpOffset; 3682 } 3683 3684 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3685 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3686 3687 // The struct intrinsic variants add one additional operand over raw. 3688 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3689 Register VIndex; 3690 if (HasVIndex) { 3691 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3692 ++OpOffset; 3693 } 3694 3695 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3696 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3697 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3698 3699 MachineMemOperand *MMO = *MI.memoperands_begin(); 3700 3701 unsigned ImmOffset; 3702 unsigned TotalOffset; 3703 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3704 if (TotalOffset != 0) 3705 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3706 3707 if (!VIndex) 3708 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3709 3710 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3711 3712 if (HasReturn) 3713 MIB.addDef(Dst); 3714 3715 MIB.addUse(VData); // vdata 3716 3717 if (IsCmpSwap) 3718 MIB.addReg(CmpVal); 3719 3720 MIB.addUse(RSrc) // rsrc 3721 .addUse(VIndex) // vindex 3722 .addUse(VOffset) // voffset 3723 .addUse(SOffset) // soffset 3724 .addImm(ImmOffset) // offset(imm) 3725 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3726 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3727 .addMemOperand(MMO); 3728 3729 MI.eraseFromParent(); 3730 return true; 3731 } 3732 3733 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3734 /// vector with s16 typed elements. 3735 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3736 SmallVectorImpl<Register> &PackedAddrs, 3737 int AddrIdx, int DimIdx, int EndIdx, 3738 int NumGradients) { 3739 const LLT S16 = LLT::scalar(16); 3740 const LLT V2S16 = LLT::vector(2, 16); 3741 3742 for (int I = AddrIdx; I < EndIdx; ++I) { 3743 MachineOperand &SrcOp = MI.getOperand(I); 3744 if (!SrcOp.isReg()) 3745 continue; // _L to _LZ may have eliminated this. 3746 3747 Register AddrReg = SrcOp.getReg(); 3748 3749 if (I < DimIdx) { 3750 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3751 PackedAddrs.push_back(AddrReg); 3752 } else { 3753 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3754 // derivatives dx/dh and dx/dv are packed with undef. 
3755 if (((I + 1) >= EndIdx) || 3756 ((NumGradients / 2) % 2 == 1 && 3757 (I == DimIdx + (NumGradients / 2) - 1 || 3758 I == DimIdx + NumGradients - 1)) || 3759 // Check for _L to _LZ optimization 3760 !MI.getOperand(I + 1).isReg()) { 3761 PackedAddrs.push_back( 3762 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3763 .getReg(0)); 3764 } else { 3765 PackedAddrs.push_back( 3766 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3767 .getReg(0)); 3768 ++I; 3769 } 3770 } 3771 } 3772 } 3773 3774 /// Convert from separate vaddr components to a single vector address register, 3775 /// and replace the remaining operands with $noreg. 3776 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3777 int DimIdx, int NumVAddrs) { 3778 const LLT S32 = LLT::scalar(32); 3779 3780 SmallVector<Register, 8> AddrRegs; 3781 for (int I = 0; I != NumVAddrs; ++I) { 3782 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3783 if (SrcOp.isReg()) { 3784 AddrRegs.push_back(SrcOp.getReg()); 3785 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3786 } 3787 } 3788 3789 int NumAddrRegs = AddrRegs.size(); 3790 if (NumAddrRegs != 1) { 3791 // Round up to 8 elements for v5-v7 3792 // FIXME: Missing intermediate sized register classes and instructions. 3793 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3794 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3795 auto Undef = B.buildUndef(S32); 3796 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3797 NumAddrRegs = RoundedNumRegs; 3798 } 3799 3800 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3801 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3802 } 3803 3804 for (int I = 1; I != NumVAddrs; ++I) { 3805 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3806 if (SrcOp.isReg()) 3807 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3808 } 3809 } 3810 3811 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3812 /// 3813 /// Depending on the subtarget, load/store with 16-bit element data need to be 3814 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3815 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3816 /// registers. 3817 /// 3818 /// We don't want to directly select image instructions just yet, but also want 3819 /// to exposes all register repacking to the legalizer/combiners. We also don't 3820 /// want a selected instrution entering RegBankSelect. In order to avoid 3821 /// defining a multitude of intermediate image instructions, directly hack on 3822 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3823 /// now unnecessary arguments with $noreg. 3824 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3825 MachineInstr &MI, MachineIRBuilder &B, 3826 GISelChangeObserver &Observer, 3827 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3828 3829 const int NumDefs = MI.getNumExplicitDefs(); 3830 bool IsTFE = NumDefs == 2; 3831 // We are only processing the operands of d16 image operations on subtargets 3832 // that use the unpacked register layout, or need to repack the TFE result. 3833 3834 // TODO: Do we need to guard against already legalized intrinsics? 
3835 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3836 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3837 3838 MachineRegisterInfo *MRI = B.getMRI(); 3839 const LLT S32 = LLT::scalar(32); 3840 const LLT S16 = LLT::scalar(16); 3841 const LLT V2S16 = LLT::vector(2, 16); 3842 3843 // Index of first address argument 3844 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3845 3846 int NumVAddrs, NumGradients; 3847 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3848 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3849 getDMaskIdx(BaseOpcode, NumDefs); 3850 unsigned DMask = 0; 3851 3852 // Check for 16 bit addresses and pack if true. 3853 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3854 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3855 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3856 const bool IsG16 = GradTy == S16; 3857 const bool IsA16 = AddrTy == S16; 3858 3859 int DMaskLanes = 0; 3860 if (!BaseOpcode->Atomic) { 3861 DMask = MI.getOperand(DMaskIdx).getImm(); 3862 if (BaseOpcode->Gather4) { 3863 DMaskLanes = 4; 3864 } else if (DMask != 0) { 3865 DMaskLanes = countPopulation(DMask); 3866 } else if (!IsTFE && !BaseOpcode->Store) { 3867 // If dmask is 0, this is a no-op load. This can be eliminated. 3868 B.buildUndef(MI.getOperand(0)); 3869 MI.eraseFromParent(); 3870 return true; 3871 } 3872 } 3873 3874 Observer.changingInstr(MI); 3875 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3876 3877 unsigned NewOpcode = NumDefs == 0 ? 3878 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3879 3880 // Track that we legalized this 3881 MI.setDesc(B.getTII().get(NewOpcode)); 3882 3883 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3884 // dmask to be at least 1 otherwise the instruction will fail 3885 if (IsTFE && DMask == 0) { 3886 DMask = 0x1; 3887 DMaskLanes = 1; 3888 MI.getOperand(DMaskIdx).setImm(DMask); 3889 } 3890 3891 if (BaseOpcode->Atomic) { 3892 Register VData0 = MI.getOperand(2).getReg(); 3893 LLT Ty = MRI->getType(VData0); 3894 3895 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3896 if (Ty.isVector()) 3897 return false; 3898 3899 if (BaseOpcode->AtomicX2) { 3900 Register VData1 = MI.getOperand(3).getReg(); 3901 // The two values are packed in one register. 3902 LLT PackedTy = LLT::vector(2, Ty); 3903 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3904 MI.getOperand(2).setReg(Concat.getReg(0)); 3905 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3906 } 3907 } 3908 3909 int CorrectedNumVAddrs = NumVAddrs; 3910 3911 // Optimize _L to _LZ when _L is zero 3912 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3913 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3914 const ConstantFP *ConstantLod; 3915 const int LodIdx = AddrIdx + NumVAddrs - 1; 3916 3917 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3918 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3919 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3920 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3921 LZMappingInfo->LZ, ImageDimIntr->Dim); 3922 3923 // The starting indexes should remain in the same place. 
3924 --NumVAddrs; 3925 --CorrectedNumVAddrs; 3926 3927 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3928 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3929 MI.RemoveOperand(LodIdx); 3930 } 3931 } 3932 } 3933 3934 // Optimize _mip away, when 'lod' is zero 3935 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3936 int64_t ConstantLod; 3937 const int LodIdx = AddrIdx + NumVAddrs - 1; 3938 3939 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3940 if (ConstantLod == 0) { 3941 // TODO: Change intrinsic opcode and remove operand instead or replacing 3942 // it with 0, as the _L to _LZ handling is done above. 3943 MI.getOperand(LodIdx).ChangeToImmediate(0); 3944 --CorrectedNumVAddrs; 3945 } 3946 } 3947 } 3948 3949 // Rewrite the addressing register layout before doing anything else. 3950 if (IsA16 || IsG16) { 3951 if (IsA16) { 3952 // Target must support the feature and gradients need to be 16 bit too 3953 if (!ST.hasA16() || !IsG16) 3954 return false; 3955 } else if (!ST.hasG16()) 3956 return false; 3957 3958 if (NumVAddrs > 1) { 3959 SmallVector<Register, 4> PackedRegs; 3960 // Don't compress addresses for G16 3961 const int PackEndIdx = 3962 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3963 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3964 PackEndIdx, NumGradients); 3965 3966 if (!IsA16) { 3967 // Add uncompressed address 3968 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3969 int AddrReg = MI.getOperand(I).getReg(); 3970 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3971 PackedRegs.push_back(AddrReg); 3972 } 3973 } 3974 3975 // See also below in the non-a16 branch 3976 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3977 3978 if (!UseNSA && PackedRegs.size() > 1) { 3979 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3980 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3981 PackedRegs[0] = Concat.getReg(0); 3982 PackedRegs.resize(1); 3983 } 3984 3985 const int NumPacked = PackedRegs.size(); 3986 for (int I = 0; I != NumVAddrs; ++I) { 3987 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3988 if (!SrcOp.isReg()) { 3989 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3990 continue; 3991 } 3992 3993 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3994 3995 if (I < NumPacked) 3996 SrcOp.setReg(PackedRegs[I]); 3997 else 3998 SrcOp.setReg(AMDGPU::NoRegister); 3999 } 4000 } 4001 } else { 4002 // If the register allocator cannot place the address registers contiguously 4003 // without introducing moves, then using the non-sequential address encoding 4004 // is always preferable, since it saves VALU instructions and is usually a 4005 // wash in terms of code size or even better. 4006 // 4007 // However, we currently have no way of hinting to the register allocator 4008 // that MIMG addresses should be placed contiguously when it is possible to 4009 // do so, so force non-NSA for the common 2-address case as a heuristic. 4010 // 4011 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4012 // allocation when possible. 4013 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4014 4015 if (!UseNSA && NumVAddrs > 1) 4016 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4017 } 4018 4019 int Flags = 0; 4020 if (IsA16) 4021 Flags |= 1; 4022 if (IsG16) 4023 Flags |= 2; 4024 MI.addOperand(MachineOperand::CreateImm(Flags)); 4025 4026 if (BaseOpcode->Store) { // No TFE for stores? 
4027 // TODO: Handle dmask trim 4028 Register VData = MI.getOperand(1).getReg(); 4029 LLT Ty = MRI->getType(VData); 4030 if (!Ty.isVector() || Ty.getElementType() != S16) 4031 return true; 4032 4033 Register RepackedReg = handleD16VData(B, *MRI, VData); 4034 if (RepackedReg != VData) { 4035 MI.getOperand(1).setReg(RepackedReg); 4036 } 4037 4038 return true; 4039 } 4040 4041 Register DstReg = MI.getOperand(0).getReg(); 4042 LLT Ty = MRI->getType(DstReg); 4043 const LLT EltTy = Ty.getScalarType(); 4044 const bool IsD16 = Ty.getScalarType() == S16; 4045 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4046 4047 // Confirm that the return type is large enough for the dmask specified 4048 if (NumElts < DMaskLanes) 4049 return false; 4050 4051 if (NumElts > 4 || DMaskLanes > 4) 4052 return false; 4053 4054 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4055 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4056 4057 // The raw dword aligned data component of the load. The only legal cases 4058 // where this matters should be when using the packed D16 format, for 4059 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4060 LLT RoundedTy; 4061 4062 // S32 vector to to cover all data, plus TFE result element. 4063 LLT TFETy; 4064 4065 // Register type to use for each loaded component. Will be S32 or V2S16. 4066 LLT RegTy; 4067 4068 if (IsD16 && ST.hasUnpackedD16VMem()) { 4069 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4070 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4071 RegTy = S32; 4072 } else { 4073 unsigned EltSize = EltTy.getSizeInBits(); 4074 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4075 unsigned RoundedSize = 32 * RoundedElts; 4076 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4077 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4078 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4079 } 4080 4081 // The return type does not need adjustment. 4082 // TODO: Should we change s16 case to s32 or <2 x s16>? 4083 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4084 return true; 4085 4086 Register Dst1Reg; 4087 4088 // Insert after the instruction. 4089 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4090 4091 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4092 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4093 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4094 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4095 4096 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4097 4098 MI.getOperand(0).setReg(NewResultReg); 4099 4100 // In the IR, TFE is supposed to be used with a 2 element struct return 4101 // type. The intruction really returns these two values in one contiguous 4102 // register, with one additional dword beyond the loaded data. Rewrite the 4103 // return type to use a single register result. 4104 4105 if (IsTFE) { 4106 Dst1Reg = MI.getOperand(1).getReg(); 4107 if (MRI->getType(Dst1Reg) != S32) 4108 return false; 4109 4110 // TODO: Make sure the TFE operand bit is set. 4111 MI.RemoveOperand(1); 4112 4113 // Handle the easy case that requires no repack instructions. 4114 if (Ty == S32) { 4115 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4116 return true; 4117 } 4118 } 4119 4120 // Now figure out how to copy the new result register back into the old 4121 // result. 4122 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4123 4124 const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 4125 4126 if (ResultNumRegs == 1) { 4127 assert(!IsTFE); 4128 ResultRegs[0] = NewResultReg; 4129 } else { 4130 // We have to repack into a new vector of some kind. 4131 for (int I = 0; I != NumDataRegs; ++I) 4132 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4133 B.buildUnmerge(ResultRegs, NewResultReg); 4134 4135 // Drop the final TFE element to get the data part. The TFE result is 4136 // directly written to the right place already. 4137 if (IsTFE) 4138 ResultRegs.resize(NumDataRegs); 4139 } 4140 4141 // For an s16 scalar result, we form an s32 result with a truncate regardless 4142 // of packed vs. unpacked. 4143 if (IsD16 && !Ty.isVector()) { 4144 B.buildTrunc(DstReg, ResultRegs[0]); 4145 return true; 4146 } 4147 4148 // Avoid a build/concat_vector of 1 entry. 4149 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4150 B.buildBitcast(DstReg, ResultRegs[0]); 4151 return true; 4152 } 4153 4154 assert(Ty.isVector()); 4155 4156 if (IsD16) { 4157 // For packed D16 results with TFE enabled, all the data components are 4158 // S32. Cast back to the expected type. 4159 // 4160 // TODO: We don't really need to use load s32 elements. We would only need one 4161 // cast for the TFE result if a multiple of v2s16 was used. 4162 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4163 for (Register &Reg : ResultRegs) 4164 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4165 } else if (ST.hasUnpackedD16VMem()) { 4166 for (Register &Reg : ResultRegs) 4167 Reg = B.buildTrunc(S16, Reg).getReg(0); 4168 } 4169 } 4170 4171 auto padWithUndef = [&](LLT Ty, int NumElts) { 4172 if (NumElts == 0) 4173 return; 4174 Register Undef = B.buildUndef(Ty).getReg(0); 4175 for (int I = 0; I != NumElts; ++I) 4176 ResultRegs.push_back(Undef); 4177 }; 4178 4179 // Pad out any elements eliminated due to the dmask. 4180 LLT ResTy = MRI->getType(ResultRegs[0]); 4181 if (!ResTy.isVector()) { 4182 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4183 B.buildBuildVector(DstReg, ResultRegs); 4184 return true; 4185 } 4186 4187 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4188 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4189 4190 // Deal with the one annoying legal case. 4191 const LLT V3S16 = LLT::vector(3, 16); 4192 if (Ty == V3S16) { 4193 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4194 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4195 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4196 return true; 4197 } 4198 4199 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4200 B.buildConcatVectors(DstReg, ResultRegs); 4201 return true; 4202 } 4203 4204 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4205 LegalizerHelper &Helper, MachineInstr &MI) const { 4206 MachineIRBuilder &B = Helper.MIRBuilder; 4207 GISelChangeObserver &Observer = Helper.Observer; 4208 4209 Register Dst = MI.getOperand(0).getReg(); 4210 LLT Ty = B.getMRI()->getType(Dst); 4211 unsigned Size = Ty.getSizeInBits(); 4212 MachineFunction &MF = B.getMF(); 4213 4214 Observer.changingInstr(MI); 4215 4216 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4217 Ty = getBitcastRegisterType(Ty); 4218 Helper.bitcastDst(MI, Ty, 0); 4219 Dst = MI.getOperand(0).getReg(); 4220 B.setInsertPt(B.getMBB(), MI); 4221 } 4222 4223 // FIXME: We don't really need this intermediate instruction. The intrinsic 4224 // should be fixed to have a memory operand. Since it's readnone, we're not 4225 // allowed to add one. 
4226 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4227 MI.RemoveOperand(1); // Remove intrinsic ID 4228 4229 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4230 // TODO: Should this use datalayout alignment? 4231 const unsigned MemSize = (Size + 7) / 8; 4232 const Align MemAlign(4); 4233 MachineMemOperand *MMO = MF.getMachineMemOperand( 4234 MachinePointerInfo(), 4235 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4236 MachineMemOperand::MOInvariant, 4237 MemSize, MemAlign); 4238 MI.addMemOperand(MF, MMO); 4239 4240 // There are no 96-bit result scalar loads, but widening to 128-bit should 4241 // always be legal. We may need to restore this to a 96-bit result if it turns 4242 // out this needs to be converted to a vector load during RegBankSelect. 4243 if (!isPowerOf2_32(Size)) { 4244 if (Ty.isVector()) 4245 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4246 else 4247 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4248 } 4249 4250 Observer.changedInstr(MI); 4251 return true; 4252 } 4253 4254 // TODO: Move to selection 4255 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4256 MachineRegisterInfo &MRI, 4257 MachineIRBuilder &B) const { 4258 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4259 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4260 !ST.isTrapHandlerEnabled()) { 4261 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4262 } else { 4263 // Pass queue pointer to trap handler as input, and insert trap instruction 4264 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4265 MachineRegisterInfo &MRI = *B.getMRI(); 4266 4267 Register LiveIn = 4268 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4269 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4270 return false; 4271 4272 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4273 B.buildCopy(SGPR01, LiveIn); 4274 B.buildInstr(AMDGPU::S_TRAP) 4275 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4276 .addReg(SGPR01, RegState::Implicit); 4277 } 4278 4279 MI.eraseFromParent(); 4280 return true; 4281 } 4282 4283 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4284 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4285 // Is non-HSA path or trap-handler disabled? then, report a warning 4286 // accordingly 4287 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4288 !ST.isTrapHandlerEnabled()) { 4289 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4290 "debugtrap handler not supported", 4291 MI.getDebugLoc(), DS_Warning); 4292 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4293 Ctx.diagnose(NoTrap); 4294 } else { 4295 // Insert debug-trap instruction 4296 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4297 } 4298 4299 MI.eraseFromParent(); 4300 return true; 4301 } 4302 4303 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4304 MachineInstr &MI) const { 4305 MachineIRBuilder &B = Helper.MIRBuilder; 4306 MachineRegisterInfo &MRI = *B.getMRI(); 4307 4308 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
4309 auto IntrID = MI.getIntrinsicID(); 4310 switch (IntrID) { 4311 case Intrinsic::amdgcn_if: 4312 case Intrinsic::amdgcn_else: { 4313 MachineInstr *Br = nullptr; 4314 MachineBasicBlock *UncondBrTarget = nullptr; 4315 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4316 const SIRegisterInfo *TRI 4317 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4318 4319 Register Def = MI.getOperand(1).getReg(); 4320 Register Use = MI.getOperand(3).getReg(); 4321 4322 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4323 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4324 if (IntrID == Intrinsic::amdgcn_if) { 4325 B.buildInstr(AMDGPU::SI_IF) 4326 .addDef(Def) 4327 .addUse(Use) 4328 .addMBB(UncondBrTarget); 4329 } else { 4330 B.buildInstr(AMDGPU::SI_ELSE) 4331 .addDef(Def) 4332 .addUse(Use) 4333 .addMBB(UncondBrTarget) 4334 .addImm(0); 4335 } 4336 4337 if (Br) { 4338 Br->getOperand(0).setMBB(CondBrTarget); 4339 } else { 4340 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4341 // since we're swapping branch targets it needs to be reinserted. 4342 // FIXME: IRTranslator should probably not do this 4343 B.buildBr(*CondBrTarget); 4344 } 4345 4346 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4347 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4348 MI.eraseFromParent(); 4349 BrCond->eraseFromParent(); 4350 return true; 4351 } 4352 4353 return false; 4354 } 4355 case Intrinsic::amdgcn_loop: { 4356 MachineInstr *Br = nullptr; 4357 MachineBasicBlock *UncondBrTarget = nullptr; 4358 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4359 const SIRegisterInfo *TRI 4360 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4361 4362 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4363 Register Reg = MI.getOperand(2).getReg(); 4364 4365 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4366 B.buildInstr(AMDGPU::SI_LOOP) 4367 .addUse(Reg) 4368 .addMBB(UncondBrTarget); 4369 4370 if (Br) 4371 Br->getOperand(0).setMBB(CondBrTarget); 4372 else 4373 B.buildBr(*CondBrTarget); 4374 4375 MI.eraseFromParent(); 4376 BrCond->eraseFromParent(); 4377 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4378 return true; 4379 } 4380 4381 return false; 4382 } 4383 case Intrinsic::amdgcn_kernarg_segment_ptr: 4384 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4385 // This only makes sense to call in a kernel, so just lower to null. 
4386 B.buildConstant(MI.getOperand(0).getReg(), 0); 4387 MI.eraseFromParent(); 4388 return true; 4389 } 4390 4391 return legalizePreloadedArgIntrin( 4392 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4393 case Intrinsic::amdgcn_implicitarg_ptr: 4394 return legalizeImplicitArgPtr(MI, MRI, B); 4395 case Intrinsic::amdgcn_workitem_id_x: 4396 return legalizePreloadedArgIntrin(MI, MRI, B, 4397 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4398 case Intrinsic::amdgcn_workitem_id_y: 4399 return legalizePreloadedArgIntrin(MI, MRI, B, 4400 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4401 case Intrinsic::amdgcn_workitem_id_z: 4402 return legalizePreloadedArgIntrin(MI, MRI, B, 4403 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4404 case Intrinsic::amdgcn_workgroup_id_x: 4405 return legalizePreloadedArgIntrin(MI, MRI, B, 4406 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4407 case Intrinsic::amdgcn_workgroup_id_y: 4408 return legalizePreloadedArgIntrin(MI, MRI, B, 4409 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4410 case Intrinsic::amdgcn_workgroup_id_z: 4411 return legalizePreloadedArgIntrin(MI, MRI, B, 4412 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4413 case Intrinsic::amdgcn_dispatch_ptr: 4414 return legalizePreloadedArgIntrin(MI, MRI, B, 4415 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4416 case Intrinsic::amdgcn_queue_ptr: 4417 return legalizePreloadedArgIntrin(MI, MRI, B, 4418 AMDGPUFunctionArgInfo::QUEUE_PTR); 4419 case Intrinsic::amdgcn_implicit_buffer_ptr: 4420 return legalizePreloadedArgIntrin( 4421 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4422 case Intrinsic::amdgcn_dispatch_id: 4423 return legalizePreloadedArgIntrin(MI, MRI, B, 4424 AMDGPUFunctionArgInfo::DISPATCH_ID); 4425 case Intrinsic::amdgcn_fdiv_fast: 4426 return legalizeFDIVFastIntrin(MI, MRI, B); 4427 case Intrinsic::amdgcn_is_shared: 4428 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4429 case Intrinsic::amdgcn_is_private: 4430 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4431 case Intrinsic::amdgcn_wavefrontsize: { 4432 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4433 MI.eraseFromParent(); 4434 return true; 4435 } 4436 case Intrinsic::amdgcn_s_buffer_load: 4437 return legalizeSBufferLoad(Helper, MI); 4438 case Intrinsic::amdgcn_raw_buffer_store: 4439 case Intrinsic::amdgcn_struct_buffer_store: 4440 return legalizeBufferStore(MI, MRI, B, false, false); 4441 case Intrinsic::amdgcn_raw_buffer_store_format: 4442 case Intrinsic::amdgcn_struct_buffer_store_format: 4443 return legalizeBufferStore(MI, MRI, B, false, true); 4444 case Intrinsic::amdgcn_raw_tbuffer_store: 4445 case Intrinsic::amdgcn_struct_tbuffer_store: 4446 return legalizeBufferStore(MI, MRI, B, true, true); 4447 case Intrinsic::amdgcn_raw_buffer_load: 4448 case Intrinsic::amdgcn_struct_buffer_load: 4449 return legalizeBufferLoad(MI, MRI, B, false, false); 4450 case Intrinsic::amdgcn_raw_buffer_load_format: 4451 case Intrinsic::amdgcn_struct_buffer_load_format: 4452 return legalizeBufferLoad(MI, MRI, B, true, false); 4453 case Intrinsic::amdgcn_raw_tbuffer_load: 4454 case Intrinsic::amdgcn_struct_tbuffer_load: 4455 return legalizeBufferLoad(MI, MRI, B, true, true); 4456 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4457 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4458 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4459 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4460 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4461 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4462 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4463 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4464 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4465 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4466 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4467 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4468 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4469 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4470 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4471 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4472 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4473 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4474 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4475 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4476 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4477 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4478 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4479 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4480 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4481 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4482 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4483 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4484 return legalizeBufferAtomic(MI, B, IntrID); 4485 case Intrinsic::amdgcn_atomic_inc: 4486 return legalizeAtomicIncDec(MI, B, true); 4487 case Intrinsic::amdgcn_atomic_dec: 4488 return legalizeAtomicIncDec(MI, B, false); 4489 case Intrinsic::trap: 4490 return legalizeTrapIntrinsic(MI, MRI, B); 4491 case Intrinsic::debugtrap: 4492 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4493 case Intrinsic::amdgcn_rsq_clamp: 4494 return legalizeRsqClampIntrinsic(MI, MRI, B); 4495 case Intrinsic::amdgcn_ds_fadd: 4496 case Intrinsic::amdgcn_ds_fmin: 4497 case Intrinsic::amdgcn_ds_fmax: 4498 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4499 default: { 4500 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4501 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4502 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4503 return true; 4504 } 4505 } 4506 4507 return true; 4508 } 4509