//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
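/// For example, <3 x s16> (48 bits) satisfies this and is widened to
/// <4 x s16> by the paired oneMoreElement mutation, while <3 x s32> is
/// skipped because its size is already a multiple of 32 bits.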
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
258 return 128; 259 } 260 } 261 262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 263 const LegalityQuery &Query, 264 unsigned Opcode) { 265 const LLT Ty = Query.Types[0]; 266 267 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 268 const bool IsLoad = Opcode != AMDGPU::G_STORE; 269 270 unsigned RegSize = Ty.getSizeInBits(); 271 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 272 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 273 unsigned AS = Query.Types[1].getAddressSpace(); 274 275 // All of these need to be custom lowered to cast the pointer operand. 276 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 277 return false; 278 279 // TODO: We should be able to widen loads if the alignment is high enough, but 280 // we also need to modify the memory access size. 281 #if 0 282 // Accept widening loads based on alignment. 283 if (IsLoad && MemSize < Size) 284 MemSize = std::max(MemSize, Align); 285 #endif 286 287 // Only 1-byte and 2-byte to 32-bit extloads are valid. 288 if (MemSize != RegSize && RegSize != 32) 289 return false; 290 291 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 292 return false; 293 294 switch (MemSize) { 295 case 8: 296 case 16: 297 case 32: 298 case 64: 299 case 128: 300 break; 301 case 96: 302 if (!ST.hasDwordx3LoadStores()) 303 return false; 304 break; 305 case 256: 306 case 512: 307 // These may contextually need to be broken down. 308 break; 309 default: 310 return false; 311 } 312 313 assert(RegSize >= MemSize); 314 315 if (AlignBits < MemSize) { 316 const SITargetLowering *TLI = ST.getTargetLowering(); 317 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 318 Align(AlignBits / 8))) 319 return false; 320 } 321 322 return true; 323 } 324 325 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 326 // workaround this. Eventually it should ignore the type for loads and only care 327 // about the size. Return true in cases where we will workaround this for now by 328 // bitcasting. 329 static bool loadStoreBitcastWorkaround(const LLT Ty) { 330 if (EnableNewLegality) 331 return false; 332 333 const unsigned Size = Ty.getSizeInBits(); 334 if (Size <= 64) 335 return false; 336 if (!Ty.isVector()) 337 return true; 338 unsigned EltSize = Ty.getElementType().getSizeInBits(); 339 return EltSize != 32 && EltSize != 64; 340 } 341 342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 343 unsigned Opcode) { 344 const LLT Ty = Query.Types[0]; 345 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 346 !loadStoreBitcastWorkaround(Ty); 347 } 348 349 /// Return true if a load or store of the type should be lowered with a bitcast 350 /// to a different type. 
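/// For example, a <6 x s16> load whose memory size matches (96 bits) is
/// handled by bitcasting the result to <3 x s32>, since the current selector
/// cannot handle wide 16-bit element vectors (see loadStoreBitcastWorkaround
/// and getBitcastRegisterType).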
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 352 const unsigned MemSizeInBits) { 353 const unsigned Size = Ty.getSizeInBits(); 354 if (Size != MemSizeInBits) 355 return Size <= 32 && Ty.isVector(); 356 357 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 358 return true; 359 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 360 !isRegisterVectorElementType(Ty.getElementType()); 361 } 362 363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 364 const GCNTargetMachine &TM) 365 : ST(ST_) { 366 using namespace TargetOpcode; 367 368 auto GetAddrSpacePtr = [&TM](unsigned AS) { 369 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 370 }; 371 372 const LLT S1 = LLT::scalar(1); 373 const LLT S16 = LLT::scalar(16); 374 const LLT S32 = LLT::scalar(32); 375 const LLT S64 = LLT::scalar(64); 376 const LLT S128 = LLT::scalar(128); 377 const LLT S256 = LLT::scalar(256); 378 const LLT S512 = LLT::scalar(512); 379 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 380 381 const LLT V2S16 = LLT::vector(2, 16); 382 const LLT V4S16 = LLT::vector(4, 16); 383 384 const LLT V2S32 = LLT::vector(2, 32); 385 const LLT V3S32 = LLT::vector(3, 32); 386 const LLT V4S32 = LLT::vector(4, 32); 387 const LLT V5S32 = LLT::vector(5, 32); 388 const LLT V6S32 = LLT::vector(6, 32); 389 const LLT V7S32 = LLT::vector(7, 32); 390 const LLT V8S32 = LLT::vector(8, 32); 391 const LLT V9S32 = LLT::vector(9, 32); 392 const LLT V10S32 = LLT::vector(10, 32); 393 const LLT V11S32 = LLT::vector(11, 32); 394 const LLT V12S32 = LLT::vector(12, 32); 395 const LLT V13S32 = LLT::vector(13, 32); 396 const LLT V14S32 = LLT::vector(14, 32); 397 const LLT V15S32 = LLT::vector(15, 32); 398 const LLT V16S32 = LLT::vector(16, 32); 399 const LLT V32S32 = LLT::vector(32, 32); 400 401 const LLT V2S64 = LLT::vector(2, 64); 402 const LLT V3S64 = LLT::vector(3, 64); 403 const LLT V4S64 = LLT::vector(4, 64); 404 const LLT V5S64 = LLT::vector(5, 64); 405 const LLT V6S64 = LLT::vector(6, 64); 406 const LLT V7S64 = LLT::vector(7, 64); 407 const LLT V8S64 = LLT::vector(8, 64); 408 const LLT V16S64 = LLT::vector(16, 64); 409 410 std::initializer_list<LLT> AllS32Vectors = 411 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 412 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 413 std::initializer_list<LLT> AllS64Vectors = 414 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 415 416 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 417 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 418 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 419 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 420 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 421 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 422 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 423 424 const LLT CodePtr = FlatPtr; 425 426 const std::initializer_list<LLT> AddrSpaces64 = { 427 GlobalPtr, ConstantPtr, FlatPtr 428 }; 429 430 const std::initializer_list<LLT> AddrSpaces32 = { 431 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 432 }; 433 434 const std::initializer_list<LLT> FPTypesBase = { 435 S32, S64 436 }; 437 438 const std::initializer_list<LLT> FPTypes16 = { 439 S32, S64, S16 440 }; 441 442 const std::initializer_list<LLT> FPTypesPK16 = { 443 S32, S64, S16, V2S16 444 }; 445 446 const LLT MinScalarFPTy = ST.has16BitInsts() ? 
S16 : S32; 447 448 setAction({G_BRCOND, S1}, Legal); // VCC branches 449 setAction({G_BRCOND, S32}, Legal); // SCC branches 450 451 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 452 // elements for v3s16 453 getActionDefinitionsBuilder(G_PHI) 454 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 455 .legalFor(AllS32Vectors) 456 .legalFor(AllS64Vectors) 457 .legalFor(AddrSpaces64) 458 .legalFor(AddrSpaces32) 459 .legalIf(isPointer(0)) 460 .clampScalar(0, S16, S256) 461 .widenScalarToNextPow2(0, 32) 462 .clampMaxNumElements(0, S32, 16) 463 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 464 .scalarize(0); 465 466 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 467 // Full set of gfx9 features. 468 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 469 .legalFor({S32, S16, V2S16}) 470 .clampScalar(0, S16, S32) 471 .clampMaxNumElements(0, S16, 2) 472 .scalarize(0) 473 .widenScalarToNextPow2(0, 32); 474 475 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 476 .legalFor({S32, S16, V2S16}) // Clamp modifier 477 .minScalarOrElt(0, S16) 478 .clampMaxNumElements(0, S16, 2) 479 .scalarize(0) 480 .widenScalarToNextPow2(0, 32) 481 .lower(); 482 } else if (ST.has16BitInsts()) { 483 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 484 .legalFor({S32, S16}) 485 .clampScalar(0, S16, S32) 486 .scalarize(0) 487 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 488 489 // Technically the saturating operations require clamp bit support, but this 490 // was introduced at the same time as 16-bit operations. 491 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 492 .legalFor({S32, S16}) // Clamp modifier 493 .minScalar(0, S16) 494 .scalarize(0) 495 .widenScalarToNextPow2(0, 16) 496 .lower(); 497 498 // We're just lowering this, but it helps get a better result to try to 499 // coerce to the desired type first. 500 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 501 .minScalar(0, S16) 502 .scalarize(0) 503 .lower(); 504 } else { 505 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 506 .legalFor({S32}) 507 .clampScalar(0, S32, S32) 508 .scalarize(0); 509 510 if (ST.hasIntClamp()) { 511 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 512 .legalFor({S32}) // Clamp modifier. 513 .scalarize(0) 514 .minScalarOrElt(0, S32) 515 .lower(); 516 } else { 517 // Clamp bit support was added in VI, along with 16-bit operations. 518 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 519 .minScalar(0, S32) 520 .scalarize(0) 521 .lower(); 522 } 523 524 // FIXME: DAG expansion gets better results. The widening uses the smaller 525 // range values and goes for the min/max lowering directly. 526 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 527 .minScalar(0, S32) 528 .scalarize(0) 529 .lower(); 530 } 531 532 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 533 .customFor({S32, S64}) 534 .clampScalar(0, S32, S64) 535 .widenScalarToNextPow2(0, 32) 536 .scalarize(0); 537 538 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 539 .legalFor({S32}) 540 .clampScalar(0, S32, S32) 541 .scalarize(0); 542 543 // Report legal for any types we can handle anywhere. For the cases only legal 544 // on the SALU, RegBankSelect will be able to re-legalize. 
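  // For instance, an s64 AND only has a scalar (SALU) encoding; if the value
  // ends up assigned to the VALU, RegBankSelect is expected to split it into
  // two 32-bit halves there.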
545 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 546 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 547 .clampScalar(0, S32, S64) 548 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 549 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 550 .widenScalarToNextPow2(0) 551 .scalarize(0); 552 553 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 554 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 555 .legalFor({{S32, S1}, {S32, S32}}) 556 .minScalar(0, S32) 557 // TODO: .scalarize(0) 558 .lower(); 559 560 getActionDefinitionsBuilder(G_BITCAST) 561 // Don't worry about the size constraint. 562 .legalIf(all(isRegisterType(0), isRegisterType(1))) 563 .lower(); 564 565 566 getActionDefinitionsBuilder(G_CONSTANT) 567 .legalFor({S1, S32, S64, S16, GlobalPtr, 568 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 569 .legalIf(isPointer(0)) 570 .clampScalar(0, S32, S64) 571 .widenScalarToNextPow2(0); 572 573 getActionDefinitionsBuilder(G_FCONSTANT) 574 .legalFor({S32, S64, S16}) 575 .clampScalar(0, S16, S64); 576 577 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 578 .legalIf(isRegisterType(0)) 579 // s1 and s16 are special cases because they have legal operations on 580 // them, but don't really occupy registers in the normal way. 581 .legalFor({S1, S16}) 582 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 583 .clampScalarOrElt(0, S32, MaxScalar) 584 .widenScalarToNextPow2(0, 32) 585 .clampMaxNumElements(0, S32, 16); 586 587 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 588 589 // If the amount is divergent, we have to do a wave reduction to get the 590 // maximum value, so this is expanded during RegBankSelect. 591 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 592 .legalFor({{PrivatePtr, S32}}); 593 594 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 595 .customIf(typeIsNot(0, PrivatePtr)); 596 597 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 598 599 auto &FPOpActions = getActionDefinitionsBuilder( 600 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 601 .legalFor({S32, S64}); 602 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 603 .customFor({S32, S64}); 604 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 605 .customFor({S32, S64}); 606 607 if (ST.has16BitInsts()) { 608 if (ST.hasVOP3PInsts()) 609 FPOpActions.legalFor({S16, V2S16}); 610 else 611 FPOpActions.legalFor({S16}); 612 613 TrigActions.customFor({S16}); 614 FDIVActions.customFor({S16}); 615 } 616 617 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 618 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 619 620 if (ST.hasVOP3PInsts()) { 621 MinNumMaxNum.customFor(FPTypesPK16) 622 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 623 .clampMaxNumElements(0, S16, 2) 624 .clampScalar(0, S16, S64) 625 .scalarize(0); 626 } else if (ST.has16BitInsts()) { 627 MinNumMaxNum.customFor(FPTypes16) 628 .clampScalar(0, S16, S64) 629 .scalarize(0); 630 } else { 631 MinNumMaxNum.customFor(FPTypesBase) 632 .clampScalar(0, S32, S64) 633 .scalarize(0); 634 } 635 636 if (ST.hasVOP3PInsts()) 637 FPOpActions.clampMaxNumElements(0, S16, 2); 638 639 FPOpActions 640 .scalarize(0) 641 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 642 643 TrigActions 644 .scalarize(0) 645 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 646 647 FDIVActions 648 .scalarize(0) 649 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 650 651 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 652 .legalFor(FPTypesPK16) 653 .clampMaxNumElements(0, S16, 2) 654 .scalarize(0) 655 .clampScalar(0, S16, S64); 656 657 if (ST.has16BitInsts()) { 658 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 659 .legalFor({S32, S64, S16}) 660 .scalarize(0) 661 .clampScalar(0, S16, S64); 662 } else { 663 getActionDefinitionsBuilder(G_FSQRT) 664 .legalFor({S32, S64}) 665 .scalarize(0) 666 .clampScalar(0, S32, S64); 667 668 if (ST.hasFractBug()) { 669 getActionDefinitionsBuilder(G_FFLOOR) 670 .customFor({S64}) 671 .legalFor({S32, S64}) 672 .scalarize(0) 673 .clampScalar(0, S32, S64); 674 } else { 675 getActionDefinitionsBuilder(G_FFLOOR) 676 .legalFor({S32, S64}) 677 .scalarize(0) 678 .clampScalar(0, S32, S64); 679 } 680 } 681 682 getActionDefinitionsBuilder(G_FPTRUNC) 683 .legalFor({{S32, S64}, {S16, S32}}) 684 .scalarize(0) 685 .lower(); 686 687 getActionDefinitionsBuilder(G_FPEXT) 688 .legalFor({{S64, S32}, {S32, S16}}) 689 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 690 .scalarize(0); 691 692 getActionDefinitionsBuilder(G_FSUB) 693 // Use actual fsub instruction 694 .legalFor({S32}) 695 // Must use fadd + fneg 696 .lowerFor({S64, S16, V2S16}) 697 .scalarize(0) 698 .clampScalar(0, S32, S64); 699 700 // Whether this is legal depends on the floating point mode for the function. 701 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 702 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 703 FMad.customFor({S32, S16}); 704 else if (ST.hasMadMacF32Insts()) 705 FMad.customFor({S32}); 706 else if (ST.hasMadF16()) 707 FMad.customFor({S16}); 708 FMad.scalarize(0) 709 .lower(); 710 711 auto &FRem = getActionDefinitionsBuilder(G_FREM); 712 if (ST.has16BitInsts()) { 713 FRem.customFor({S16, S32, S64}); 714 } else { 715 FRem.minScalar(0, S32) 716 .customFor({S32, S64}); 717 } 718 FRem.scalarize(0); 719 720 // TODO: Do we need to clamp maximum bitwidth? 721 getActionDefinitionsBuilder(G_TRUNC) 722 .legalIf(isScalar(0)) 723 .legalFor({{V2S16, V2S32}}) 724 .clampMaxNumElements(0, S16, 2) 725 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 726 // situations (like an invalid implicit use), we don't want to infinite loop 727 // in the legalizer. 728 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 729 .alwaysLegal(); 730 731 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 732 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 733 {S32, S1}, {S64, S1}, {S16, S1}}) 734 .scalarize(0) 735 .clampScalar(0, S32, S64) 736 .widenScalarToNextPow2(1, 32); 737 738 // TODO: Split s1->s64 during regbankselect for VALU. 
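  // 64-bit integer sources have no direct conversion instruction and are
  // handled by the custom/lowered cases below; 32-bit sources map onto the
  // native conversions.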
739 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 740 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 741 .lowerFor({{S32, S64}}) 742 .lowerIf(typeIs(1, S1)) 743 .customFor({{S64, S64}}); 744 if (ST.has16BitInsts()) 745 IToFP.legalFor({{S16, S16}}); 746 IToFP.clampScalar(1, S32, S64) 747 .minScalar(0, S32) 748 .scalarize(0) 749 .widenScalarToNextPow2(1); 750 751 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 752 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 753 .customFor({{S64, S64}}) 754 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 755 if (ST.has16BitInsts()) 756 FPToI.legalFor({{S16, S16}}); 757 else 758 FPToI.minScalar(1, S32); 759 760 FPToI.minScalar(0, S32) 761 .scalarize(0) 762 .lower(); 763 764 // Lower roundeven into G_FRINT 765 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 766 .scalarize(0) 767 .lower(); 768 769 if (ST.has16BitInsts()) { 770 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 771 .legalFor({S16, S32, S64}) 772 .clampScalar(0, S16, S64) 773 .scalarize(0); 774 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 775 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 776 .legalFor({S32, S64}) 777 .clampScalar(0, S32, S64) 778 .scalarize(0); 779 } else { 780 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 781 .legalFor({S32}) 782 .customFor({S64}) 783 .clampScalar(0, S32, S64) 784 .scalarize(0); 785 } 786 787 getActionDefinitionsBuilder(G_PTR_ADD) 788 .legalIf(all(isPointer(0), sameSize(0, 1))) 789 .scalarize(0) 790 .scalarSameSizeAs(1, 0); 791 792 getActionDefinitionsBuilder(G_PTRMASK) 793 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 794 .scalarSameSizeAs(1, 0) 795 .scalarize(0); 796 797 auto &CmpBuilder = 798 getActionDefinitionsBuilder(G_ICMP) 799 // The compare output type differs based on the register bank of the output, 800 // so make both s1 and s32 legal. 801 // 802 // Scalar compares producing output in scc will be promoted to s32, as that 803 // is the allocatable register type that will be needed for the copy from 804 // scc. This will be promoted during RegBankSelect, and we assume something 805 // before that won't try to use s32 result types. 806 // 807 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 808 // bank. 809 .legalForCartesianProduct( 810 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 811 .legalForCartesianProduct( 812 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 813 if (ST.has16BitInsts()) { 814 CmpBuilder.legalFor({{S1, S16}}); 815 } 816 817 CmpBuilder 818 .widenScalarToNextPow2(1) 819 .clampScalar(1, S32, S64) 820 .scalarize(0) 821 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 822 823 getActionDefinitionsBuilder(G_FCMP) 824 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 825 .widenScalarToNextPow2(1) 826 .clampScalar(1, S32, S64) 827 .scalarize(0); 828 829 // FIXME: fpow has a selection pattern that should move to custom lowering. 
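  // G_FEXP2 and G_FLOG2 map onto hardware instructions; the base-e/base-10
  // variants below are custom lowered in terms of them with a constant scale
  // factor (see legalizeFlog and legalizeFExp).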
830 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 831 if (ST.has16BitInsts()) 832 Exp2Ops.legalFor({S32, S16}); 833 else 834 Exp2Ops.legalFor({S32}); 835 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 836 Exp2Ops.scalarize(0); 837 838 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 839 if (ST.has16BitInsts()) 840 ExpOps.customFor({{S32}, {S16}}); 841 else 842 ExpOps.customFor({S32}); 843 ExpOps.clampScalar(0, MinScalarFPTy, S32) 844 .scalarize(0); 845 846 getActionDefinitionsBuilder(G_FPOWI) 847 .clampScalar(0, MinScalarFPTy, S32) 848 .lower(); 849 850 // The 64-bit versions produce 32-bit results, but only on the SALU. 851 getActionDefinitionsBuilder(G_CTPOP) 852 .legalFor({{S32, S32}, {S32, S64}}) 853 .clampScalar(0, S32, S32) 854 .clampScalar(1, S32, S64) 855 .scalarize(0) 856 .widenScalarToNextPow2(0, 32) 857 .widenScalarToNextPow2(1, 32); 858 859 // The hardware instructions return a different result on 0 than the generic 860 // instructions expect. The hardware produces -1, but these produce the 861 // bitwidth. 862 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 863 .scalarize(0) 864 .clampScalar(0, S32, S32) 865 .clampScalar(1, S32, S64) 866 .widenScalarToNextPow2(0, 32) 867 .widenScalarToNextPow2(1, 32) 868 .lower(); 869 870 // The 64-bit versions produce 32-bit results, but only on the SALU. 871 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 872 .legalFor({{S32, S32}, {S32, S64}}) 873 .clampScalar(0, S32, S32) 874 .clampScalar(1, S32, S64) 875 .scalarize(0) 876 .widenScalarToNextPow2(0, 32) 877 .widenScalarToNextPow2(1, 32); 878 879 getActionDefinitionsBuilder(G_BITREVERSE) 880 .legalFor({S32}) 881 .clampScalar(0, S32, S32) 882 .scalarize(0); 883 884 if (ST.has16BitInsts()) { 885 getActionDefinitionsBuilder(G_BSWAP) 886 .legalFor({S16, S32, V2S16}) 887 .clampMaxNumElements(0, S16, 2) 888 // FIXME: Fixing non-power-of-2 before clamp is workaround for 889 // narrowScalar limitation. 890 .widenScalarToNextPow2(0) 891 .clampScalar(0, S16, S32) 892 .scalarize(0); 893 894 if (ST.hasVOP3PInsts()) { 895 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 896 .legalFor({S32, S16, V2S16}) 897 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 898 .clampMaxNumElements(0, S16, 2) 899 .minScalar(0, S16) 900 .widenScalarToNextPow2(0) 901 .scalarize(0) 902 .lower(); 903 } else { 904 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 905 .legalFor({S32, S16}) 906 .widenScalarToNextPow2(0) 907 .minScalar(0, S16) 908 .scalarize(0) 909 .lower(); 910 } 911 } else { 912 // TODO: Should have same legality without v_perm_b32 913 getActionDefinitionsBuilder(G_BSWAP) 914 .legalFor({S32}) 915 .lowerIf(scalarNarrowerThan(0, 32)) 916 // FIXME: Fixing non-power-of-2 before clamp is workaround for 917 // narrowScalar limitation. 
918 .widenScalarToNextPow2(0) 919 .maxScalar(0, S32) 920 .scalarize(0) 921 .lower(); 922 923 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 924 .legalFor({S32}) 925 .minScalar(0, S32) 926 .widenScalarToNextPow2(0) 927 .scalarize(0) 928 .lower(); 929 } 930 931 getActionDefinitionsBuilder(G_INTTOPTR) 932 // List the common cases 933 .legalForCartesianProduct(AddrSpaces64, {S64}) 934 .legalForCartesianProduct(AddrSpaces32, {S32}) 935 .scalarize(0) 936 // Accept any address space as long as the size matches 937 .legalIf(sameSize(0, 1)) 938 .widenScalarIf(smallerThan(1, 0), 939 [](const LegalityQuery &Query) { 940 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 941 }) 942 .narrowScalarIf(largerThan(1, 0), 943 [](const LegalityQuery &Query) { 944 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 945 }); 946 947 getActionDefinitionsBuilder(G_PTRTOINT) 948 // List the common cases 949 .legalForCartesianProduct(AddrSpaces64, {S64}) 950 .legalForCartesianProduct(AddrSpaces32, {S32}) 951 .scalarize(0) 952 // Accept any address space as long as the size matches 953 .legalIf(sameSize(0, 1)) 954 .widenScalarIf(smallerThan(0, 1), 955 [](const LegalityQuery &Query) { 956 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 957 }) 958 .narrowScalarIf( 959 largerThan(0, 1), 960 [](const LegalityQuery &Query) { 961 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 962 }); 963 964 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 965 .scalarize(0) 966 .custom(); 967 968 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 969 bool IsLoad) -> bool { 970 const LLT DstTy = Query.Types[0]; 971 972 // Split vector extloads. 973 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 974 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 975 976 if (MemSize < DstTy.getSizeInBits()) 977 MemSize = std::max(MemSize, AlignBits); 978 979 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 980 return true; 981 982 const LLT PtrTy = Query.Types[1]; 983 unsigned AS = PtrTy.getAddressSpace(); 984 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 985 return true; 986 987 // Catch weird sized loads that don't evenly divide into the access sizes 988 // TODO: May be able to widen depending on alignment etc. 989 unsigned NumRegs = (MemSize + 31) / 32; 990 if (NumRegs == 3) { 991 if (!ST.hasDwordx3LoadStores()) 992 return true; 993 } else { 994 // If the alignment allows, these should have been widened. 995 if (!isPowerOf2_32(NumRegs)) 996 return true; 997 } 998 999 if (AlignBits < MemSize) { 1000 const SITargetLowering *TLI = ST.getTargetLowering(); 1001 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1002 Align(AlignBits / 8)); 1003 } 1004 1005 return false; 1006 }; 1007 1008 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 1009 unsigned Opc) -> bool { 1010 unsigned Size = Query.Types[0].getSizeInBits(); 1011 if (isPowerOf2_32(Size)) 1012 return false; 1013 1014 if (Size == 96 && ST.hasDwordx3LoadStores()) 1015 return false; 1016 1017 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 1018 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 1019 return false; 1020 1021 unsigned Align = Query.MMODescrs[0].AlignInBits; 1022 unsigned RoundedSize = NextPowerOf2(Size); 1023 return (Align >= RoundedSize); 1024 }; 1025 1026 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 1027 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 
0 : 16; 1028 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 1029 1030 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1031 // LDS 1032 // TODO: Unsupported flat for SI. 1033 1034 for (unsigned Op : {G_LOAD, G_STORE}) { 1035 const bool IsStore = Op == G_STORE; 1036 1037 auto &Actions = getActionDefinitionsBuilder(Op); 1038 // Explicitly list some common cases. 1039 // TODO: Does this help compile time at all? 1040 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 1041 {V2S32, GlobalPtr, 64, GlobalAlign32}, 1042 {V4S32, GlobalPtr, 128, GlobalAlign32}, 1043 {S64, GlobalPtr, 64, GlobalAlign32}, 1044 {V2S64, GlobalPtr, 128, GlobalAlign32}, 1045 {V2S16, GlobalPtr, 32, GlobalAlign32}, 1046 {S32, GlobalPtr, 8, GlobalAlign8}, 1047 {S32, GlobalPtr, 16, GlobalAlign16}, 1048 1049 {S32, LocalPtr, 32, 32}, 1050 {S64, LocalPtr, 64, 32}, 1051 {V2S32, LocalPtr, 64, 32}, 1052 {S32, LocalPtr, 8, 8}, 1053 {S32, LocalPtr, 16, 16}, 1054 {V2S16, LocalPtr, 32, 32}, 1055 1056 {S32, PrivatePtr, 32, 32}, 1057 {S32, PrivatePtr, 8, 8}, 1058 {S32, PrivatePtr, 16, 16}, 1059 {V2S16, PrivatePtr, 32, 32}, 1060 1061 {S32, ConstantPtr, 32, GlobalAlign32}, 1062 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1063 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1064 {S64, ConstantPtr, 64, GlobalAlign32}, 1065 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1066 Actions.legalIf( 1067 [=](const LegalityQuery &Query) -> bool { 1068 return isLoadStoreLegal(ST, Query, Op); 1069 }); 1070 1071 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1072 // 64-bits. 1073 // 1074 // TODO: Should generalize bitcast action into coerce, which will also cover 1075 // inserting addrspacecasts. 1076 Actions.customIf(typeIs(1, Constant32Ptr)); 1077 1078 // Turn any illegal element vectors into something easier to deal 1079 // with. These will ultimately produce 32-bit scalar shifts to extract the 1080 // parts anyway. 1081 // 1082 // For odd 16-bit element vectors, prefer to split those into pieces with 1083 // 16-bit vector parts. 1084 Actions.bitcastIf( 1085 [=](const LegalityQuery &Query) -> bool { 1086 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1087 Query.MMODescrs[0].SizeInBits); 1088 }, bitcastToRegisterType(0)); 1089 1090 Actions 1091 .customIf(typeIs(1, Constant32Ptr)) 1092 // Widen suitably aligned loads by loading extra elements. 1093 .moreElementsIf([=](const LegalityQuery &Query) { 1094 const LLT Ty = Query.Types[0]; 1095 return Op == G_LOAD && Ty.isVector() && 1096 shouldWidenLoadResult(Query, Op); 1097 }, moreElementsToNextPow2(0)) 1098 .widenScalarIf([=](const LegalityQuery &Query) { 1099 const LLT Ty = Query.Types[0]; 1100 return Op == G_LOAD && !Ty.isVector() && 1101 shouldWidenLoadResult(Query, Op); 1102 }, widenScalarOrEltToNextPow2(0)) 1103 .narrowScalarIf( 1104 [=](const LegalityQuery &Query) -> bool { 1105 return !Query.Types[0].isVector() && 1106 needToSplitMemOp(Query, Op == G_LOAD); 1107 }, 1108 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1109 const LLT DstTy = Query.Types[0]; 1110 const LLT PtrTy = Query.Types[1]; 1111 1112 const unsigned DstSize = DstTy.getSizeInBits(); 1113 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1114 1115 // Split extloads. 1116 if (DstSize > MemSize) 1117 return std::make_pair(0, LLT::scalar(MemSize)); 1118 1119 if (!isPowerOf2_32(DstSize)) { 1120 // We're probably decomposing an odd sized store. Try to split 1121 // to the widest type. TODO: Account for alignment. 
As-is it 1122 // should be OK, since the new parts will be further legalized. 1123 unsigned FloorSize = PowerOf2Floor(DstSize); 1124 return std::make_pair(0, LLT::scalar(FloorSize)); 1125 } 1126 1127 if (DstSize > 32 && (DstSize % 32 != 0)) { 1128 // FIXME: Need a way to specify non-extload of larger size if 1129 // suitably aligned. 1130 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1131 } 1132 1133 unsigned MaxSize = maxSizeForAddrSpace(ST, 1134 PtrTy.getAddressSpace(), 1135 Op == G_LOAD); 1136 if (MemSize > MaxSize) 1137 return std::make_pair(0, LLT::scalar(MaxSize)); 1138 1139 unsigned Align = Query.MMODescrs[0].AlignInBits; 1140 return std::make_pair(0, LLT::scalar(Align)); 1141 }) 1142 .fewerElementsIf( 1143 [=](const LegalityQuery &Query) -> bool { 1144 return Query.Types[0].isVector() && 1145 needToSplitMemOp(Query, Op == G_LOAD); 1146 }, 1147 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1148 const LLT DstTy = Query.Types[0]; 1149 const LLT PtrTy = Query.Types[1]; 1150 1151 LLT EltTy = DstTy.getElementType(); 1152 unsigned MaxSize = maxSizeForAddrSpace(ST, 1153 PtrTy.getAddressSpace(), 1154 Op == G_LOAD); 1155 1156 // FIXME: Handle widened to power of 2 results better. This ends 1157 // up scalarizing. 1158 // FIXME: 3 element stores scalarized on SI 1159 1160 // Split if it's too large for the address space. 1161 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1162 unsigned NumElts = DstTy.getNumElements(); 1163 unsigned EltSize = EltTy.getSizeInBits(); 1164 1165 if (MaxSize % EltSize == 0) { 1166 return std::make_pair( 1167 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1168 } 1169 1170 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1171 1172 // FIXME: Refine when odd breakdowns handled 1173 // The scalars will need to be re-legalized. 1174 if (NumPieces == 1 || NumPieces >= NumElts || 1175 NumElts % NumPieces != 0) 1176 return std::make_pair(0, EltTy); 1177 1178 return std::make_pair(0, 1179 LLT::vector(NumElts / NumPieces, EltTy)); 1180 } 1181 1182 // FIXME: We could probably handle weird extending loads better. 1183 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1184 if (DstTy.getSizeInBits() > MemSize) 1185 return std::make_pair(0, EltTy); 1186 1187 unsigned EltSize = EltTy.getSizeInBits(); 1188 unsigned DstSize = DstTy.getSizeInBits(); 1189 if (!isPowerOf2_32(DstSize)) { 1190 // We're probably decomposing an odd sized store. Try to split 1191 // to the widest type. TODO: Account for alignment. As-is it 1192 // should be OK, since the new parts will be further legalized. 1193 unsigned FloorSize = PowerOf2Floor(DstSize); 1194 return std::make_pair( 1195 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1196 } 1197 1198 // Need to split because of alignment. 1199 unsigned Align = Query.MMODescrs[0].AlignInBits; 1200 if (EltSize > Align && 1201 (EltSize / Align < DstTy.getNumElements())) { 1202 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1203 } 1204 1205 // May need relegalization for the scalars. 1206 return std::make_pair(0, EltTy); 1207 }) 1208 .minScalar(0, S32); 1209 1210 if (IsStore) 1211 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1212 1213 // TODO: Need a bitcast lower option? 
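    // Whatever survives the rules above is padded out: scalars are widened to
    // the next power of two, and vectors narrower than 32 bits gain elements
    // until they fill a 32-bit register.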
1214 Actions 1215 .widenScalarToNextPow2(0) 1216 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1217 } 1218 1219 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1220 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1221 {S32, GlobalPtr, 16, 2 * 8}, 1222 {S32, LocalPtr, 8, 8}, 1223 {S32, LocalPtr, 16, 16}, 1224 {S32, PrivatePtr, 8, 8}, 1225 {S32, PrivatePtr, 16, 16}, 1226 {S32, ConstantPtr, 8, 8}, 1227 {S32, ConstantPtr, 16, 2 * 8}}); 1228 if (ST.hasFlatAddressSpace()) { 1229 ExtLoads.legalForTypesWithMemDesc( 1230 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1231 } 1232 1233 ExtLoads.clampScalar(0, S32, S32) 1234 .widenScalarToNextPow2(0) 1235 .unsupportedIfMemSizeNotPow2() 1236 .lower(); 1237 1238 auto &Atomics = getActionDefinitionsBuilder( 1239 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1240 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1241 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1242 G_ATOMICRMW_UMIN}) 1243 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1244 {S64, GlobalPtr}, {S64, LocalPtr}, 1245 {S32, RegionPtr}, {S64, RegionPtr}}); 1246 if (ST.hasFlatAddressSpace()) { 1247 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1248 } 1249 1250 if (ST.hasLDSFPAtomics()) { 1251 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1252 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1253 } 1254 1255 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1256 // demarshalling 1257 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1258 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1259 {S32, FlatPtr}, {S64, FlatPtr}}) 1260 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1261 {S32, RegionPtr}, {S64, RegionPtr}}); 1262 // TODO: Pointer types, any 32-bit or 64-bit vector 1263 1264 // Condition should be s32 for scalar, s1 for vector. 1265 getActionDefinitionsBuilder(G_SELECT) 1266 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1267 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1268 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1269 .clampScalar(0, S16, S64) 1270 .scalarize(1) 1271 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1272 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1273 .clampMaxNumElements(0, S32, 2) 1274 .clampMaxNumElements(0, LocalPtr, 2) 1275 .clampMaxNumElements(0, PrivatePtr, 2) 1276 .scalarize(0) 1277 .widenScalarToNextPow2(0) 1278 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1279 1280 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1281 // be more flexible with the shift amount type. 1282 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1283 .legalFor({{S32, S32}, {S64, S32}}); 1284 if (ST.has16BitInsts()) { 1285 if (ST.hasVOP3PInsts()) { 1286 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1287 .clampMaxNumElements(0, S16, 2); 1288 } else 1289 Shifts.legalFor({{S16, S16}}); 1290 1291 // TODO: Support 16-bit shift amounts for all types 1292 Shifts.widenScalarIf( 1293 [=](const LegalityQuery &Query) { 1294 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1295 // 32-bit amount. 
1296 const LLT ValTy = Query.Types[0]; 1297 const LLT AmountTy = Query.Types[1]; 1298 return ValTy.getSizeInBits() <= 16 && 1299 AmountTy.getSizeInBits() < 16; 1300 }, changeTo(1, S16)); 1301 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1302 Shifts.clampScalar(1, S32, S32); 1303 Shifts.clampScalar(0, S16, S64); 1304 Shifts.widenScalarToNextPow2(0, 16); 1305 1306 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1307 .minScalar(0, S16) 1308 .scalarize(0) 1309 .lower(); 1310 } else { 1311 // Make sure we legalize the shift amount type first, as the general 1312 // expansion for the shifted type will produce much worse code if it hasn't 1313 // been truncated already. 1314 Shifts.clampScalar(1, S32, S32); 1315 Shifts.clampScalar(0, S32, S64); 1316 Shifts.widenScalarToNextPow2(0, 32); 1317 1318 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1319 .minScalar(0, S32) 1320 .scalarize(0) 1321 .lower(); 1322 } 1323 Shifts.scalarize(0); 1324 1325 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1326 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1327 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1328 unsigned IdxTypeIdx = 2; 1329 1330 getActionDefinitionsBuilder(Op) 1331 .customIf([=](const LegalityQuery &Query) { 1332 const LLT EltTy = Query.Types[EltTypeIdx]; 1333 const LLT VecTy = Query.Types[VecTypeIdx]; 1334 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1335 const unsigned EltSize = EltTy.getSizeInBits(); 1336 return (EltSize == 32 || EltSize == 64) && 1337 VecTy.getSizeInBits() % 32 == 0 && 1338 VecTy.getSizeInBits() <= MaxRegisterSize && 1339 IdxTy.getSizeInBits() == 32; 1340 }) 1341 .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)), 1342 bitcastToVectorElement32(1)) 1343 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1344 .bitcastIf( 1345 all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)), 1346 [=](const LegalityQuery &Query) { 1347 // For > 64-bit element types, try to turn this into a 64-bit 1348 // element vector since we may be able to do better indexing 1349 // if this is scalar. If not, fall back to 32. 1350 const LLT EltTy = Query.Types[EltTypeIdx]; 1351 const LLT VecTy = Query.Types[VecTypeIdx]; 1352 const unsigned DstEltSize = EltTy.getSizeInBits(); 1353 const unsigned VecSize = VecTy.getSizeInBits(); 1354 1355 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1356 return std::make_pair( 1357 VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1358 }) 1359 .clampScalar(EltTypeIdx, S32, S64) 1360 .clampScalar(VecTypeIdx, S32, S64) 1361 .clampScalar(IdxTypeIdx, S32, S32) 1362 .clampMaxNumElements(1, S32, 32) 1363 // TODO: Clamp elements for 64-bit vectors? 1364 // It should only be necessary with variable indexes. 1365 // As a last resort, lower to the stack 1366 .lower(); 1367 } 1368 1369 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1370 .unsupportedIf([=](const LegalityQuery &Query) { 1371 const LLT &EltTy = Query.Types[1].getElementType(); 1372 return Query.Types[0] != EltTy; 1373 }); 1374 1375 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1376 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1377 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1378 1379 // FIXME: Doesn't handle extract of illegal sizes. 1380 getActionDefinitionsBuilder(Op) 1381 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1382 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
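    // (The generic G_SEXT_INREG lowering is a shift-left / arithmetic
    // shift-right pair, so the width available for those shifts is what
    // matters here.)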
1555 SextInReg.lowerFor({{S32}, {S64}}); 1556 } 1557 1558 SextInReg 1559 .scalarize(0) 1560 .clampScalar(0, S32, S64) 1561 .lower(); 1562 1563 getActionDefinitionsBuilder(G_FSHR) 1564 .legalFor({{S32, S32}}) 1565 .scalarize(0) 1566 .lower(); 1567 1568 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1569 .legalFor({S64}); 1570 1571 getActionDefinitionsBuilder(G_FENCE) 1572 .alwaysLegal(); 1573 1574 getActionDefinitionsBuilder({ 1575 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1576 G_FCOPYSIGN, 1577 1578 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1579 G_ATOMICRMW_NAND, 1580 G_ATOMICRMW_FSUB, 1581 G_READ_REGISTER, 1582 G_WRITE_REGISTER, 1583 1584 G_SADDO, G_SSUBO, 1585 1586 // TODO: Implement 1587 G_FMINIMUM, G_FMAXIMUM, 1588 G_FSHL 1589 }).lower(); 1590 1591 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1592 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1593 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1594 .unsupported(); 1595 1596 computeTables(); 1597 verify(*ST.getInstrInfo()); 1598 } 1599 1600 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1601 MachineInstr &MI) const { 1602 MachineIRBuilder &B = Helper.MIRBuilder; 1603 MachineRegisterInfo &MRI = *B.getMRI(); 1604 GISelChangeObserver &Observer = Helper.Observer; 1605 1606 switch (MI.getOpcode()) { 1607 case TargetOpcode::G_ADDRSPACE_CAST: 1608 return legalizeAddrSpaceCast(MI, MRI, B); 1609 case TargetOpcode::G_FRINT: 1610 return legalizeFrint(MI, MRI, B); 1611 case TargetOpcode::G_FCEIL: 1612 return legalizeFceil(MI, MRI, B); 1613 case TargetOpcode::G_FREM: 1614 return legalizeFrem(MI, MRI, B); 1615 case TargetOpcode::G_INTRINSIC_TRUNC: 1616 return legalizeIntrinsicTrunc(MI, MRI, B); 1617 case TargetOpcode::G_SITOFP: 1618 return legalizeITOFP(MI, MRI, B, true); 1619 case TargetOpcode::G_UITOFP: 1620 return legalizeITOFP(MI, MRI, B, false); 1621 case TargetOpcode::G_FPTOSI: 1622 return legalizeFPTOI(MI, MRI, B, true); 1623 case TargetOpcode::G_FPTOUI: 1624 return legalizeFPTOI(MI, MRI, B, false); 1625 case TargetOpcode::G_FMINNUM: 1626 case TargetOpcode::G_FMAXNUM: 1627 case TargetOpcode::G_FMINNUM_IEEE: 1628 case TargetOpcode::G_FMAXNUM_IEEE: 1629 return legalizeMinNumMaxNum(Helper, MI); 1630 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1631 return legalizeExtractVectorElt(MI, MRI, B); 1632 case TargetOpcode::G_INSERT_VECTOR_ELT: 1633 return legalizeInsertVectorElt(MI, MRI, B); 1634 case TargetOpcode::G_SHUFFLE_VECTOR: 1635 return legalizeShuffleVector(MI, MRI, B); 1636 case TargetOpcode::G_FSIN: 1637 case TargetOpcode::G_FCOS: 1638 return legalizeSinCos(MI, MRI, B); 1639 case TargetOpcode::G_GLOBAL_VALUE: 1640 return legalizeGlobalValue(MI, MRI, B); 1641 case TargetOpcode::G_LOAD: 1642 return legalizeLoad(MI, MRI, B, Observer); 1643 case TargetOpcode::G_FMAD: 1644 return legalizeFMad(MI, MRI, B); 1645 case TargetOpcode::G_FDIV: 1646 return legalizeFDIV(MI, MRI, B); 1647 case TargetOpcode::G_UDIV: 1648 case TargetOpcode::G_UREM: 1649 return legalizeUDIV_UREM(MI, MRI, B); 1650 case TargetOpcode::G_SDIV: 1651 case TargetOpcode::G_SREM: 1652 return legalizeSDIV_SREM(MI, MRI, B); 1653 case TargetOpcode::G_ATOMIC_CMPXCHG: 1654 return legalizeAtomicCmpXChg(MI, MRI, B); 1655 case TargetOpcode::G_FLOG: 1656 return legalizeFlog(MI, B, numbers::ln2f); 1657 case TargetOpcode::G_FLOG10: 1658 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1659 case TargetOpcode::G_FEXP: 1660 return legalizeFExp(MI, B); 1661 case TargetOpcode::G_FPOW: 1662 return legalizeFPow(MI, B); 1663 case TargetOpcode::G_FFLOOR: 1664 
return legalizeFFloor(MI, MRI, B); 1665 case TargetOpcode::G_BUILD_VECTOR: 1666 return legalizeBuildVector(MI, MRI, B); 1667 default: 1668 return false; 1669 } 1670 1671 llvm_unreachable("expected switch to return"); 1672 } 1673 1674 Register AMDGPULegalizerInfo::getSegmentAperture( 1675 unsigned AS, 1676 MachineRegisterInfo &MRI, 1677 MachineIRBuilder &B) const { 1678 MachineFunction &MF = B.getMF(); 1679 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1680 const LLT S32 = LLT::scalar(32); 1681 1682 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1683 1684 if (ST.hasApertureRegs()) { 1685 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1686 // getreg. 1687 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1688 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1689 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1690 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1691 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1692 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1693 unsigned Encoding = 1694 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1695 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1696 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1697 1698 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1699 1700 B.buildInstr(AMDGPU::S_GETREG_B32) 1701 .addDef(GetReg) 1702 .addImm(Encoding); 1703 MRI.setType(GetReg, S32); 1704 1705 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1706 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1707 } 1708 1709 Register QueuePtr = MRI.createGenericVirtualRegister( 1710 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1711 1712 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1713 return Register(); 1714 1715 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1716 // private_segment_aperture_base_hi. 1717 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1718 1719 // TODO: can we be smarter about machine pointer info? 1720 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1721 MachineMemOperand *MMO = MF.getMachineMemOperand( 1722 PtrInfo, 1723 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1724 MachineMemOperand::MOInvariant, 1725 4, commonAlignment(Align(64), StructOffset)); 1726 1727 Register LoadAddr; 1728 1729 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1730 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1731 } 1732 1733 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1734 MachineInstr &MI, MachineRegisterInfo &MRI, 1735 MachineIRBuilder &B) const { 1736 MachineFunction &MF = B.getMF(); 1737 1738 const LLT S32 = LLT::scalar(32); 1739 Register Dst = MI.getOperand(0).getReg(); 1740 Register Src = MI.getOperand(1).getReg(); 1741 1742 LLT DstTy = MRI.getType(Dst); 1743 LLT SrcTy = MRI.getType(Src); 1744 unsigned DestAS = DstTy.getAddressSpace(); 1745 unsigned SrcAS = SrcTy.getAddressSpace(); 1746 1747 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1748 // vector element. 1749 assert(!DstTy.isVector()); 1750 1751 const AMDGPUTargetMachine &TM 1752 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1753 1754 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1755 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1756 return true; 1757 } 1758 1759 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1760 // Truncate. 
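    // The 32-bit constant address space covers the low 32 bits of the full
    // 64-bit constant aperture, so the cast just keeps bits [31:0] of the
    // source pointer; the reverse cast below rebuilds the high half from the
    // function's 32-bit-address high bits.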
1761 B.buildExtract(Dst, Src, 0); 1762 MI.eraseFromParent(); 1763 return true; 1764 } 1765 1766 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1767 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1768 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1769 1770 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1771 // another. Merge operands are required to be the same type, but creating an 1772 // extra ptrtoint would be kind of pointless. 1773 auto HighAddr = B.buildConstant( 1774 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1775 B.buildMerge(Dst, {Src, HighAddr}); 1776 MI.eraseFromParent(); 1777 return true; 1778 } 1779 1780 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1781 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1782 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1783 unsigned NullVal = TM.getNullPointerValue(DestAS); 1784 1785 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1786 auto FlatNull = B.buildConstant(SrcTy, 0); 1787 1788 // Extract low 32-bits of the pointer. 1789 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1790 1791 auto CmpRes = 1792 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1793 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1794 1795 MI.eraseFromParent(); 1796 return true; 1797 } 1798 1799 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1800 return false; 1801 1802 if (!ST.hasFlatAddressSpace()) 1803 return false; 1804 1805 auto SegmentNull = 1806 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1807 auto FlatNull = 1808 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1809 1810 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1811 if (!ApertureReg.isValid()) 1812 return false; 1813 1814 auto CmpRes = 1815 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1816 1817 // Coerce the type of the low half of the result so we can use merge_values. 1818 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1819 1820 // TODO: Should we allow mismatched types but matching sizes in merges to 1821 // avoid the ptrtoint? 1822 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1823 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1824 1825 MI.eraseFromParent(); 1826 return true; 1827 } 1828 1829 bool AMDGPULegalizerInfo::legalizeFrint( 1830 MachineInstr &MI, MachineRegisterInfo &MRI, 1831 MachineIRBuilder &B) const { 1832 Register Src = MI.getOperand(1).getReg(); 1833 LLT Ty = MRI.getType(Src); 1834 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1835 1836 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1837 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1838 1839 auto C1 = B.buildFConstant(Ty, C1Val); 1840 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1841 1842 // TODO: Should this propagate fast-math-flags? 
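  // Adding 2^52 (with the input's sign copied onto it) and then subtracting it
  // again discards the fraction bits under the default round-to-nearest-even
  // mode, leaving the nearest integer. Values with |src| greater than
  // 0x1.fffffffffffffp+51 are already integral and are selected through
  // unchanged below.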
1843 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1844 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1845 1846 auto C2 = B.buildFConstant(Ty, C2Val); 1847 auto Fabs = B.buildFAbs(Ty, Src); 1848 1849 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1850 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1851 MI.eraseFromParent(); 1852 return true; 1853 } 1854 1855 bool AMDGPULegalizerInfo::legalizeFceil( 1856 MachineInstr &MI, MachineRegisterInfo &MRI, 1857 MachineIRBuilder &B) const { 1858 1859 const LLT S1 = LLT::scalar(1); 1860 const LLT S64 = LLT::scalar(64); 1861 1862 Register Src = MI.getOperand(1).getReg(); 1863 assert(MRI.getType(Src) == S64); 1864 1865 // result = trunc(src) 1866 // if (src > 0.0 && src != result) 1867 // result += 1.0 1868 1869 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1870 1871 const auto Zero = B.buildFConstant(S64, 0.0); 1872 const auto One = B.buildFConstant(S64, 1.0); 1873 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1874 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1875 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1876 auto Add = B.buildSelect(S64, And, One, Zero); 1877 1878 // TODO: Should this propagate fast-math-flags? 1879 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1880 return true; 1881 } 1882 1883 bool AMDGPULegalizerInfo::legalizeFrem( 1884 MachineInstr &MI, MachineRegisterInfo &MRI, 1885 MachineIRBuilder &B) const { 1886 Register DstReg = MI.getOperand(0).getReg(); 1887 Register Src0Reg = MI.getOperand(1).getReg(); 1888 Register Src1Reg = MI.getOperand(2).getReg(); 1889 auto Flags = MI.getFlags(); 1890 LLT Ty = MRI.getType(DstReg); 1891 1892 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1893 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1894 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1895 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1896 MI.eraseFromParent(); 1897 return true; 1898 } 1899 1900 static MachineInstrBuilder extractF64Exponent(Register Hi, 1901 MachineIRBuilder &B) { 1902 const unsigned FractBits = 52; 1903 const unsigned ExpBits = 11; 1904 LLT S32 = LLT::scalar(32); 1905 1906 auto Const0 = B.buildConstant(S32, FractBits - 32); 1907 auto Const1 = B.buildConstant(S32, ExpBits); 1908 1909 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1910 .addUse(Hi) 1911 .addUse(Const0.getReg(0)) 1912 .addUse(Const1.getReg(0)); 1913 1914 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1915 } 1916 1917 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1918 MachineInstr &MI, MachineRegisterInfo &MRI, 1919 MachineIRBuilder &B) const { 1920 const LLT S1 = LLT::scalar(1); 1921 const LLT S32 = LLT::scalar(32); 1922 const LLT S64 = LLT::scalar(64); 1923 1924 Register Src = MI.getOperand(1).getReg(); 1925 assert(MRI.getType(Src) == S64); 1926 1927 // TODO: Should this use extract since the low half is unused? 1928 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1929 Register Hi = Unmerge.getReg(1); 1930 1931 // Extract the upper half, since this is where we will find the sign and 1932 // exponent. 1933 auto Exp = extractF64Exponent(Hi, B); 1934 1935 const unsigned FractBits = 52; 1936 1937 // Extract the sign bit. 1938 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1939 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1940 1941 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1942 1943 const auto Zero32 = B.buildConstant(S32, 0); 1944 1945 // Extend back to 64-bits. 
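  // The exponent determines how many of the 52 fraction bits are significant;
  // the ashr/not/and sequence below clears the rest. A negative exponent means
  // |src| < 1.0, so only the sign survives (a signed zero); an exponent above
  // 51 means the value is already an integer and the source is used as-is.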
1946 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1947 1948 auto Shr = B.buildAShr(S64, FractMask, Exp); 1949 auto Not = B.buildNot(S64, Shr); 1950 auto Tmp0 = B.buildAnd(S64, Src, Not); 1951 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1952 1953 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1954 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1955 1956 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1957 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1958 MI.eraseFromParent(); 1959 return true; 1960 } 1961 1962 bool AMDGPULegalizerInfo::legalizeITOFP( 1963 MachineInstr &MI, MachineRegisterInfo &MRI, 1964 MachineIRBuilder &B, bool Signed) const { 1965 1966 Register Dst = MI.getOperand(0).getReg(); 1967 Register Src = MI.getOperand(1).getReg(); 1968 1969 const LLT S64 = LLT::scalar(64); 1970 const LLT S32 = LLT::scalar(32); 1971 1972 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1973 1974 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1975 1976 auto CvtHi = Signed ? 1977 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1978 B.buildUITOFP(S64, Unmerge.getReg(1)); 1979 1980 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1981 1982 auto ThirtyTwo = B.buildConstant(S32, 32); 1983 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1984 .addUse(CvtHi.getReg(0)) 1985 .addUse(ThirtyTwo.getReg(0)); 1986 1987 // TODO: Should this propagate fast-math-flags? 1988 B.buildFAdd(Dst, LdExp, CvtLo); 1989 MI.eraseFromParent(); 1990 return true; 1991 } 1992 1993 // TODO: Copied from DAG implementation. Verify logic and document how this 1994 // actually works. 1995 bool AMDGPULegalizerInfo::legalizeFPTOI( 1996 MachineInstr &MI, MachineRegisterInfo &MRI, 1997 MachineIRBuilder &B, bool Signed) const { 1998 1999 Register Dst = MI.getOperand(0).getReg(); 2000 Register Src = MI.getOperand(1).getReg(); 2001 2002 const LLT S64 = LLT::scalar(64); 2003 const LLT S32 = LLT::scalar(32); 2004 2005 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2006 2007 unsigned Flags = MI.getFlags(); 2008 2009 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2010 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2011 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2012 2013 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2014 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2015 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2016 2017 auto Hi = Signed ? 
2018 B.buildFPTOSI(S32, FloorMul) : 2019 B.buildFPTOUI(S32, FloorMul); 2020 auto Lo = B.buildFPTOUI(S32, Fma); 2021 2022 B.buildMerge(Dst, { Lo, Hi }); 2023 MI.eraseFromParent(); 2024 2025 return true; 2026 } 2027 2028 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2029 MachineInstr &MI) const { 2030 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2031 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2032 2033 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2034 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2035 2036 // With ieee_mode disabled, the instructions have the correct behavior 2037 // already for G_FMINNUM/G_FMAXNUM 2038 if (!MFI->getMode().IEEE) 2039 return !IsIEEEOp; 2040 2041 if (IsIEEEOp) 2042 return true; 2043 2044 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2045 } 2046 2047 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, 2049 MachineIRBuilder &B) const { 2050 // TODO: Should move some of this into LegalizerHelper. 2051 2052 // TODO: Promote dynamic indexing of s16 to s32 2053 2054 // FIXME: Artifact combiner probably should have replaced the truncated 2055 // constant before this, so we shouldn't need 2056 // getConstantVRegValWithLookThrough. 2057 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2058 MI.getOperand(2).getReg(), MRI); 2059 if (!IdxVal) // Dynamic case will be selected to register indexing. 2060 return true; 2061 2062 Register Dst = MI.getOperand(0).getReg(); 2063 Register Vec = MI.getOperand(1).getReg(); 2064 2065 LLT VecTy = MRI.getType(Vec); 2066 LLT EltTy = VecTy.getElementType(); 2067 assert(EltTy == MRI.getType(Dst)); 2068 2069 if (IdxVal->Value < VecTy.getNumElements()) 2070 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2071 else 2072 B.buildUndef(Dst); 2073 2074 MI.eraseFromParent(); 2075 return true; 2076 } 2077 2078 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2079 MachineInstr &MI, MachineRegisterInfo &MRI, 2080 MachineIRBuilder &B) const { 2081 // TODO: Should move some of this into LegalizerHelper. 2082 2083 // TODO: Promote dynamic indexing of s16 to s32 2084 2085 // FIXME: Artifact combiner probably should have replaced the truncated 2086 // constant before this, so we shouldn't need 2087 // getConstantVRegValWithLookThrough. 2088 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2089 MI.getOperand(3).getReg(), MRI); 2090 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2091 return true; 2092 2093 Register Dst = MI.getOperand(0).getReg(); 2094 Register Vec = MI.getOperand(1).getReg(); 2095 Register Ins = MI.getOperand(2).getReg(); 2096 2097 LLT VecTy = MRI.getType(Vec); 2098 LLT EltTy = VecTy.getElementType(); 2099 assert(EltTy == MRI.getType(Ins)); 2100 2101 if (IdxVal->Value < VecTy.getNumElements()) 2102 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2103 else 2104 B.buildUndef(Dst); 2105 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2111 MachineInstr &MI, MachineRegisterInfo &MRI, 2112 MachineIRBuilder &B) const { 2113 const LLT V2S16 = LLT::vector(2, 16); 2114 2115 Register Dst = MI.getOperand(0).getReg(); 2116 Register Src0 = MI.getOperand(1).getReg(); 2117 LLT DstTy = MRI.getType(Dst); 2118 LLT SrcTy = MRI.getType(Src0); 2119 2120 if (SrcTy == V2S16 && DstTy == V2S16 && 2121 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2122 return true; 2123 2124 MachineIRBuilder HelperBuilder(MI); 2125 GISelObserverWrapper DummyObserver; 2126 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2127 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2128 } 2129 2130 bool AMDGPULegalizerInfo::legalizeSinCos( 2131 MachineInstr &MI, MachineRegisterInfo &MRI, 2132 MachineIRBuilder &B) const { 2133 2134 Register DstReg = MI.getOperand(0).getReg(); 2135 Register SrcReg = MI.getOperand(1).getReg(); 2136 LLT Ty = MRI.getType(DstReg); 2137 unsigned Flags = MI.getFlags(); 2138 2139 Register TrigVal; 2140 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2141 if (ST.hasTrigReducedRange()) { 2142 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2143 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2144 .addUse(MulVal.getReg(0)) 2145 .setMIFlags(Flags).getReg(0); 2146 } else 2147 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2148 2149 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2150 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2151 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2152 .addUse(TrigVal) 2153 .setMIFlags(Flags); 2154 MI.eraseFromParent(); 2155 return true; 2156 } 2157 2158 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2159 MachineIRBuilder &B, 2160 const GlobalValue *GV, 2161 int64_t Offset, 2162 unsigned GAFlags) const { 2163 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2164 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2165 // to the following code sequence: 2166 // 2167 // For constant address space: 2168 // s_getpc_b64 s[0:1] 2169 // s_add_u32 s0, s0, $symbol 2170 // s_addc_u32 s1, s1, 0 2171 // 2172 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2173 // a fixup or relocation is emitted to replace $symbol with a literal 2174 // constant, which is a pc-relative offset from the encoding of the $symbol 2175 // operand to the global variable. 
2176 // 2177 // For global address space: 2178 // s_getpc_b64 s[0:1] 2179 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2180 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2181 // 2182 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2183 // fixups or relocations are emitted to replace $symbol@*@lo and 2184 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2185 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2186 // operand to the global variable. 2187 // 2188 // What we want here is an offset from the value returned by s_getpc 2189 // (which is the address of the s_add_u32 instruction) to the global 2190 // variable, but since the encoding of $symbol starts 4 bytes after the start 2191 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2192 // small. This requires us to add 4 to the global variable offset in order to 2193 // compute the correct address. 2194 2195 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2196 2197 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2198 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2199 2200 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2201 .addDef(PCReg); 2202 2203 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2204 if (GAFlags == SIInstrInfo::MO_NONE) 2205 MIB.addImm(0); 2206 else 2207 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2208 2209 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2210 2211 if (PtrTy.getSizeInBits() == 32) 2212 B.buildExtract(DstReg, PCReg, 0); 2213 return true; 2214 } 2215 2216 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2217 MachineInstr &MI, MachineRegisterInfo &MRI, 2218 MachineIRBuilder &B) const { 2219 Register DstReg = MI.getOperand(0).getReg(); 2220 LLT Ty = MRI.getType(DstReg); 2221 unsigned AS = Ty.getAddressSpace(); 2222 2223 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2224 MachineFunction &MF = B.getMF(); 2225 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2226 2227 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2228 if (!MFI->isEntryFunction()) { 2229 const Function &Fn = MF.getFunction(); 2230 DiagnosticInfoUnsupported BadLDSDecl( 2231 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2232 DS_Warning); 2233 Fn.getContext().diagnose(BadLDSDecl); 2234 2235 // We currently don't have a way to correctly allocate LDS objects that 2236 // aren't directly associated with a kernel. We do force inlining of 2237 // functions that use local objects. However, if these dead functions are 2238 // not eliminated, we don't want a compile time error. Just emit a warning 2239 // and a trap, since there should be no callable path here. 2240 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2241 B.buildUndef(DstReg); 2242 MI.eraseFromParent(); 2243 return true; 2244 } 2245 2246 // TODO: We could emit code to handle the initialization somewhere. 
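    // An LDS global without an initializer is lowered to a constant byte
    // offset into the kernel's LDS allocation, assigned by allocateLDSGlobal
    // below, unless the target prefers an absolute 32-bit relocation for it.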
2247 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2248 const SITargetLowering *TLI = ST.getTargetLowering(); 2249 if (!TLI->shouldUseLDSConstAddress(GV)) { 2250 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2251 return true; // Leave in place; 2252 } 2253 2254 B.buildConstant( 2255 DstReg, 2256 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2257 MI.eraseFromParent(); 2258 return true; 2259 } 2260 2261 const Function &Fn = MF.getFunction(); 2262 DiagnosticInfoUnsupported BadInit( 2263 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2264 Fn.getContext().diagnose(BadInit); 2265 return true; 2266 } 2267 2268 const SITargetLowering *TLI = ST.getTargetLowering(); 2269 2270 if (TLI->shouldEmitFixup(GV)) { 2271 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2272 MI.eraseFromParent(); 2273 return true; 2274 } 2275 2276 if (TLI->shouldEmitPCReloc(GV)) { 2277 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2278 MI.eraseFromParent(); 2279 return true; 2280 } 2281 2282 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2283 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2284 2285 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2286 MachinePointerInfo::getGOT(MF), 2287 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2288 MachineMemOperand::MOInvariant, 2289 8 /*Size*/, Align(8)); 2290 2291 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2292 2293 if (Ty.getSizeInBits() == 32) { 2294 // Truncate if this is a 32-bit constant address. 2295 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2296 B.buildExtract(DstReg, Load, 0); 2297 } else 2298 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2299 2300 MI.eraseFromParent(); 2301 return true; 2302 } 2303 2304 bool AMDGPULegalizerInfo::legalizeLoad( 2305 MachineInstr &MI, MachineRegisterInfo &MRI, 2306 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2307 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2308 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2309 Observer.changingInstr(MI); 2310 MI.getOperand(1).setReg(Cast.getReg(0)); 2311 Observer.changedInstr(MI); 2312 return true; 2313 } 2314 2315 bool AMDGPULegalizerInfo::legalizeFMad( 2316 MachineInstr &MI, MachineRegisterInfo &MRI, 2317 MachineIRBuilder &B) const { 2318 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2319 assert(Ty.isScalar()); 2320 2321 MachineFunction &MF = B.getMF(); 2322 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2323 2324 // TODO: Always legal with future ftz flag. 2325 // FIXME: Do we need just output?
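  // G_FMAD is only kept legal when the corresponding denormals are flushed,
  // since the hardware mad/mac instructions flush denormals; otherwise it is
  // expanded into separate fmul and fadd by lowerFMad below.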
2326 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2327 return true; 2328 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2329 return true; 2330 2331 MachineIRBuilder HelperBuilder(MI); 2332 GISelObserverWrapper DummyObserver; 2333 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2334 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2335 } 2336 2337 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2338 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2339 Register DstReg = MI.getOperand(0).getReg(); 2340 Register PtrReg = MI.getOperand(1).getReg(); 2341 Register CmpVal = MI.getOperand(2).getReg(); 2342 Register NewVal = MI.getOperand(3).getReg(); 2343 2344 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2345 "this should not have been custom lowered"); 2346 2347 LLT ValTy = MRI.getType(CmpVal); 2348 LLT VecTy = LLT::vector(2, ValTy); 2349 2350 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2351 2352 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2353 .addDef(DstReg) 2354 .addUse(PtrReg) 2355 .addUse(PackedVal) 2356 .setMemRefs(MI.memoperands()); 2357 2358 MI.eraseFromParent(); 2359 return true; 2360 } 2361 2362 bool AMDGPULegalizerInfo::legalizeFlog( 2363 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2364 Register Dst = MI.getOperand(0).getReg(); 2365 Register Src = MI.getOperand(1).getReg(); 2366 LLT Ty = B.getMRI()->getType(Dst); 2367 unsigned Flags = MI.getFlags(); 2368 2369 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2370 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2371 2372 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2373 MI.eraseFromParent(); 2374 return true; 2375 } 2376 2377 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2378 MachineIRBuilder &B) const { 2379 Register Dst = MI.getOperand(0).getReg(); 2380 Register Src = MI.getOperand(1).getReg(); 2381 unsigned Flags = MI.getFlags(); 2382 LLT Ty = B.getMRI()->getType(Dst); 2383 2384 auto K = B.buildFConstant(Ty, numbers::log2e); 2385 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2386 B.buildFExp2(Dst, Mul, Flags); 2387 MI.eraseFromParent(); 2388 return true; 2389 } 2390 2391 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2392 MachineIRBuilder &B) const { 2393 Register Dst = MI.getOperand(0).getReg(); 2394 Register Src0 = MI.getOperand(1).getReg(); 2395 Register Src1 = MI.getOperand(2).getReg(); 2396 unsigned Flags = MI.getFlags(); 2397 LLT Ty = B.getMRI()->getType(Dst); 2398 const LLT S16 = LLT::scalar(16); 2399 const LLT S32 = LLT::scalar(32); 2400 2401 if (Ty == S32) { 2402 auto Log = B.buildFLog2(S32, Src0, Flags); 2403 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2404 .addUse(Log.getReg(0)) 2405 .addUse(Src1) 2406 .setMIFlags(Flags); 2407 B.buildFExp2(Dst, Mul, Flags); 2408 } else if (Ty == S16) { 2409 // There's no f16 fmul_legacy, so we need to convert for it. 
2410 auto Log = B.buildFLog2(S16, Src0, Flags); 2411 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2412 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2413 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2414 .addUse(Ext0.getReg(0)) 2415 .addUse(Ext1.getReg(0)) 2416 .setMIFlags(Flags); 2417 2418 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2419 } else 2420 return false; 2421 2422 MI.eraseFromParent(); 2423 return true; 2424 } 2425 2426 // Find a source register, ignoring any possible source modifiers. 2427 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2428 Register ModSrc = OrigSrc; 2429 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2430 ModSrc = SrcFNeg->getOperand(1).getReg(); 2431 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2432 ModSrc = SrcFAbs->getOperand(1).getReg(); 2433 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2434 ModSrc = SrcFAbs->getOperand(1).getReg(); 2435 return ModSrc; 2436 } 2437 2438 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2439 MachineRegisterInfo &MRI, 2440 MachineIRBuilder &B) const { 2441 2442 const LLT S1 = LLT::scalar(1); 2443 const LLT S64 = LLT::scalar(64); 2444 Register Dst = MI.getOperand(0).getReg(); 2445 Register OrigSrc = MI.getOperand(1).getReg(); 2446 unsigned Flags = MI.getFlags(); 2447 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2448 "this should not have been custom lowered"); 2449 2450 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2451 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2452 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2453 // V_FRACT bug is: 2454 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2455 // 2456 // Convert floor(x) to (x - fract(x)) 2457 2458 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2459 .addUse(OrigSrc) 2460 .setMIFlags(Flags); 2461 2462 // Give source modifier matching some assistance before obscuring a foldable 2463 // pattern. 2464 2465 // TODO: We can avoid the neg on the fract? The input sign to fract 2466 // shouldn't matter? 2467 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2468 2469 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2470 2471 Register Min = MRI.createGenericVirtualRegister(S64); 2472 2473 // We don't need to concern ourselves with the snan handling difference, so 2474 // use the one which will directly select. 2475 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2476 if (MFI->getMode().IEEE) 2477 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2478 else 2479 B.buildFMinNum(Min, Fract, Const, Flags); 2480 2481 Register CorrectedFract = Min; 2482 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2483 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2484 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2485 } 2486 2487 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2488 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2489 2490 MI.eraseFromParent(); 2491 return true; 2492 } 2493 2494 // Turn an illegal packed v2s16 build vector into bit operations. 2495 // TODO: This should probably be a bitcast action in LegalizerHelper. 
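// The two s16 sources are merged into an s32 (first element in the low 16
// bits) and the result is bitcast back to v2s16.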
2496 bool AMDGPULegalizerInfo::legalizeBuildVector( 2497 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2498 Register Dst = MI.getOperand(0).getReg(); 2499 const LLT S32 = LLT::scalar(32); 2500 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2501 2502 Register Src0 = MI.getOperand(1).getReg(); 2503 Register Src1 = MI.getOperand(2).getReg(); 2504 assert(MRI.getType(Src0) == LLT::scalar(16)); 2505 2506 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2507 B.buildBitcast(Dst, Merge); 2508 2509 MI.eraseFromParent(); 2510 return true; 2511 } 2512 2513 // Return the use branch instruction, otherwise null if the usage is invalid. 2514 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2515 MachineRegisterInfo &MRI, 2516 MachineInstr *&Br, 2517 MachineBasicBlock *&UncondBrTarget) { 2518 Register CondDef = MI.getOperand(0).getReg(); 2519 if (!MRI.hasOneNonDBGUse(CondDef)) 2520 return nullptr; 2521 2522 MachineBasicBlock *Parent = MI.getParent(); 2523 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2524 if (UseMI.getParent() != Parent || 2525 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2526 return nullptr; 2527 2528 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2529 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2530 if (Next == Parent->end()) { 2531 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2532 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2533 return nullptr; 2534 UncondBrTarget = &*NextMBB; 2535 } else { 2536 if (Next->getOpcode() != AMDGPU::G_BR) 2537 return nullptr; 2538 Br = &*Next; 2539 UncondBrTarget = Br->getOperand(0).getMBB(); 2540 } 2541 2542 return &UseMI; 2543 } 2544 2545 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2546 const ArgDescriptor *Arg, 2547 const TargetRegisterClass *ArgRC, 2548 LLT ArgTy) const { 2549 MCRegister SrcReg = Arg->getRegister(); 2550 assert(SrcReg.isPhysical() && "Physical register expected"); 2551 assert(DstReg.isVirtual() && "Virtual register expected"); 2552 2553 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2554 ArgTy); 2555 if (Arg->isMasked()) { 2556 // TODO: Should we try to emit this once in the entry block? 
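    // Masked arguments (such as the packed work-item ID components) share a
    // single input register. The field is recovered by shifting the live-in
    // value right by the mask's trailing zero count and masking with the
    // shifted-down mask:
    //   Dst = (LiveIn >> countTrailingZeros(Mask)) & (Mask >> Shift)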
2557 const LLT S32 = LLT::scalar(32); 2558 const unsigned Mask = Arg->getMask(); 2559 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2560 2561 Register AndMaskSrc = LiveIn; 2562 2563 if (Shift != 0) { 2564 auto ShiftAmt = B.buildConstant(S32, Shift); 2565 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2566 } 2567 2568 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2569 } else { 2570 B.buildCopy(DstReg, LiveIn); 2571 } 2572 2573 return true; 2574 } 2575 2576 bool AMDGPULegalizerInfo::loadInputValue( 2577 Register DstReg, MachineIRBuilder &B, 2578 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2579 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2580 const ArgDescriptor *Arg; 2581 const TargetRegisterClass *ArgRC; 2582 LLT ArgTy; 2583 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2584 2585 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2586 return false; // TODO: Handle these 2587 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2588 } 2589 2590 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2591 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2592 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2593 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2594 return false; 2595 2596 MI.eraseFromParent(); 2597 return true; 2598 } 2599 2600 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2601 MachineRegisterInfo &MRI, 2602 MachineIRBuilder &B) const { 2603 Register Dst = MI.getOperand(0).getReg(); 2604 LLT DstTy = MRI.getType(Dst); 2605 LLT S16 = LLT::scalar(16); 2606 LLT S32 = LLT::scalar(32); 2607 LLT S64 = LLT::scalar(64); 2608 2609 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2610 return true; 2611 2612 if (DstTy == S16) 2613 return legalizeFDIV16(MI, MRI, B); 2614 if (DstTy == S32) 2615 return legalizeFDIV32(MI, MRI, B); 2616 if (DstTy == S64) 2617 return legalizeFDIV64(MI, MRI, B); 2618 2619 return false; 2620 } 2621 2622 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2623 Register DstReg, 2624 Register X, 2625 Register Y, 2626 bool IsDiv) const { 2627 const LLT S1 = LLT::scalar(1); 2628 const LLT S32 = LLT::scalar(32); 2629 2630 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2631 // algorithm used here. 2632 2633 // Initial estimate of inv(y). 2634 auto FloatY = B.buildUITOFP(S32, Y); 2635 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2636 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2637 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2638 auto Z = B.buildFPTOUI(S32, ScaledY); 2639 2640 // One round of UNR. 2641 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2642 auto NegYZ = B.buildMul(S32, NegY, Z); 2643 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2644 2645 // Quotient/remainder estimate. 2646 auto Q = B.buildUMulH(S32, X, Z); 2647 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2648 2649 // First quotient/remainder refinement. 2650 auto One = B.buildConstant(S32, 1); 2651 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2652 if (IsDiv) 2653 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2654 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2655 2656 // Second quotient/remainder refinement. 
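  // The quotient estimate may still be slightly too small, so each refinement
  // conditionally bumps the quotient and subtracts the denominator from the
  // remainder; two rounds are sufficient for this estimate.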
2657 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2658 if (IsDiv) 2659 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2660 else 2661 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2662 } 2663 2664 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2665 MachineRegisterInfo &MRI, 2666 MachineIRBuilder &B) const { 2667 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2668 Register DstReg = MI.getOperand(0).getReg(); 2669 Register Num = MI.getOperand(1).getReg(); 2670 Register Den = MI.getOperand(2).getReg(); 2671 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2672 MI.eraseFromParent(); 2673 return true; 2674 } 2675 2676 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2677 // 2678 // Return lo, hi of result 2679 // 2680 // %cvt.lo = G_UITOFP Val.lo 2681 // %cvt.hi = G_UITOFP Val.hi 2682 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2683 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2684 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2685 // %mul2 = G_FMUL %mul1, 2**(-32) 2686 // %trunc = G_INTRINSIC_TRUNC %mul2 2687 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2688 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2689 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2690 Register Val) { 2691 const LLT S32 = LLT::scalar(32); 2692 auto Unmerge = B.buildUnmerge(S32, Val); 2693 2694 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2695 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2696 2697 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2698 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2699 2700 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2701 auto Mul1 = 2702 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2703 2704 // 2**(-32) 2705 auto Mul2 = 2706 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2707 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2708 2709 // -(2**32) 2710 auto Mad2 = B.buildFMAD(S32, Trunc, 2711 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2712 2713 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2714 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2715 2716 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2717 } 2718 2719 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2720 Register DstReg, 2721 Register Numer, 2722 Register Denom, 2723 bool IsDiv) const { 2724 const LLT S32 = LLT::scalar(32); 2725 const LLT S64 = LLT::scalar(64); 2726 const LLT S1 = LLT::scalar(1); 2727 Register RcpLo, RcpHi; 2728 2729 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2730 2731 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2732 2733 auto Zero64 = B.buildConstant(S64, 0); 2734 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2735 2736 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2737 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2738 2739 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2740 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2741 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2742 2743 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2744 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2745 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2746 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2747 2748 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2749 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2750 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2751 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2752 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2753 2754 auto Zero32 =
B.buildConstant(S32, 0); 2755 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2756 auto Add2_HiC = 2757 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2758 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2759 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2760 2761 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2762 Register NumerLo = UnmergeNumer.getReg(0); 2763 Register NumerHi = UnmergeNumer.getReg(1); 2764 2765 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2766 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2767 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2768 Register Mul3_Lo = UnmergeMul3.getReg(0); 2769 Register Mul3_Hi = UnmergeMul3.getReg(1); 2770 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2771 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2772 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2773 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2774 2775 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2776 Register DenomLo = UnmergeDenom.getReg(0); 2777 Register DenomHi = UnmergeDenom.getReg(1); 2778 2779 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2780 auto C1 = B.buildSExt(S32, CmpHi); 2781 2782 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2783 auto C2 = B.buildSExt(S32, CmpLo); 2784 2785 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2786 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2787 2788 // TODO: Here and below portions of the code can be enclosed into if/endif. 2789 // Currently control flow is unconditional and we have 4 selects after 2790 // potential endif to substitute PHIs. 2791 2792 // if C3 != 0 ... 2793 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2794 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2795 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2796 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2797 2798 auto One64 = B.buildConstant(S64, 1); 2799 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2800 2801 auto C4 = 2802 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2803 auto C5 = 2804 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2805 auto C6 = B.buildSelect( 2806 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2807 2808 // if (C6 != 0) 2809 auto Add4 = B.buildAdd(S64, Add3, One64); 2810 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2811 2812 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2813 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2814 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2815 2816 // endif C6 2817 // endif C3 2818 2819 if (IsDiv) { 2820 auto Sel1 = B.buildSelect( 2821 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2822 B.buildSelect(DstReg, 2823 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2824 } else { 2825 auto Sel2 = B.buildSelect( 2826 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2827 B.buildSelect(DstReg, 2828 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2829 } 2830 } 2831 2832 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2833 MachineRegisterInfo &MRI, 2834 MachineIRBuilder &B) const { 2835 const LLT S64 = LLT::scalar(64); 2836 const LLT S32 = LLT::scalar(32); 2837 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2838 Register DstReg = MI.getOperand(0).getReg(); 2839 Register Num 
= MI.getOperand(1).getReg(); 2840 Register Den = MI.getOperand(2).getReg(); 2841 LLT Ty = MRI.getType(DstReg); 2842 2843 if (Ty == S32) 2844 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2845 else if (Ty == S64) 2846 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2847 else 2848 return false; 2849 2850 MI.eraseFromParent(); 2851 return true; 2852 2853 } 2854 2855 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2856 MachineRegisterInfo &MRI, 2857 MachineIRBuilder &B) const { 2858 const LLT S64 = LLT::scalar(64); 2859 const LLT S32 = LLT::scalar(32); 2860 2861 Register DstReg = MI.getOperand(0).getReg(); 2862 const LLT Ty = MRI.getType(DstReg); 2863 if (Ty != S32 && Ty != S64) 2864 return false; 2865 2866 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2867 2868 Register LHS = MI.getOperand(1).getReg(); 2869 Register RHS = MI.getOperand(2).getReg(); 2870 2871 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2872 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2873 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2874 2875 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2876 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2877 2878 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2879 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2880 2881 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2882 if (Ty == S32) 2883 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2884 else 2885 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2886 2887 Register Sign; 2888 if (IsDiv) 2889 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2890 else 2891 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2892 2893 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2894 B.buildSub(DstReg, UDivRem, Sign); 2895 2896 MI.eraseFromParent(); 2897 return true; 2898 } 2899 2900 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2901 MachineRegisterInfo &MRI, 2902 MachineIRBuilder &B) const { 2903 Register Res = MI.getOperand(0).getReg(); 2904 Register LHS = MI.getOperand(1).getReg(); 2905 Register RHS = MI.getOperand(2).getReg(); 2906 2907 uint16_t Flags = MI.getFlags(); 2908 2909 LLT ResTy = MRI.getType(Res); 2910 LLT S32 = LLT::scalar(32); 2911 LLT S64 = LLT::scalar(64); 2912 2913 const MachineFunction &MF = B.getMF(); 2914 bool Unsafe = 2915 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2916 2917 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2918 return false; 2919 2920 if (!Unsafe && ResTy == S32 && 2921 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2922 return false; 2923 2924 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2925 // 1 / x -> RCP(x) 2926 if (CLHS->isExactlyValue(1.0)) { 2927 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2928 .addUse(RHS) 2929 .setMIFlags(Flags); 2930 2931 MI.eraseFromParent(); 2932 return true; 2933 } 2934 2935 // -1 / x -> RCP( FNEG(x) ) 2936 if (CLHS->isExactlyValue(-1.0)) { 2937 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2938 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2939 .addUse(FNeg.getReg(0)) 2940 .setMIFlags(Flags); 2941 2942 MI.eraseFromParent(); 2943 return true; 2944 } 2945 } 2946 2947 // x / y -> x * (1.0 / y) 2948 if (Unsafe) { 2949 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2950 .addUse(RHS) 2951 .setMIFlags(Flags); 2952 B.buildFMul(Res, LHS, RCP, Flags); 2953 2954 MI.eraseFromParent(); 2955 return true; 2956 } 2957 2958 return false; 2959 } 2960 2961 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2962 MachineRegisterInfo &MRI, 2963 MachineIRBuilder &B) const { 2964 Register Res = MI.getOperand(0).getReg(); 2965 Register LHS = MI.getOperand(1).getReg(); 2966 Register RHS = MI.getOperand(2).getReg(); 2967 2968 uint16_t Flags = MI.getFlags(); 2969 2970 LLT S16 = LLT::scalar(16); 2971 LLT S32 = LLT::scalar(32); 2972 2973 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2974 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2975 2976 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2977 .addUse(RHSExt.getReg(0)) 2978 .setMIFlags(Flags); 2979 2980 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2981 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2982 2983 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2984 .addUse(RDst.getReg(0)) 2985 .addUse(RHS) 2986 .addUse(LHS) 2987 .setMIFlags(Flags); 2988 2989 MI.eraseFromParent(); 2990 return true; 2991 } 2992 2993 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2994 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2995 static void toggleSPDenormMode(bool Enable, 2996 MachineIRBuilder &B, 2997 const GCNSubtarget &ST, 2998 AMDGPU::SIModeRegisterDefaults Mode) { 2999 // Set SP denorm mode to this value. 3000 unsigned SPDenormMode = 3001 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3002 3003 if (ST.hasDenormModeInst()) { 3004 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3005 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3006 3007 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3008 B.buildInstr(AMDGPU::S_DENORM_MODE) 3009 .addImm(NewDenormModeValue); 3010 3011 } else { 3012 // Select FP32 bit field in mode register. 3013 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3014 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3015 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3016 3017 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3018 .addImm(SPDenormMode) 3019 .addImm(SPDenormModeBitField); 3020 } 3021 } 3022 3023 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3024 MachineRegisterInfo &MRI, 3025 MachineIRBuilder &B) const { 3026 Register Res = MI.getOperand(0).getReg(); 3027 Register LHS = MI.getOperand(1).getReg(); 3028 Register RHS = MI.getOperand(2).getReg(); 3029 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3030 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3031 3032 uint16_t Flags = MI.getFlags(); 3033 3034 LLT S32 = LLT::scalar(32); 3035 LLT S1 = LLT::scalar(1); 3036 3037 auto One = B.buildFConstant(S32, 1.0f); 3038 3039 auto DenominatorScaled = 3040 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3041 .addUse(LHS) 3042 .addUse(RHS) 3043 .addImm(0) 3044 .setMIFlags(Flags); 3045 auto NumeratorScaled = 3046 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3047 .addUse(LHS) 3048 .addUse(RHS) 3049 .addImm(1) 3050 .setMIFlags(Flags); 3051 3052 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3053 .addUse(DenominatorScaled.getReg(0)) 3054 .setMIFlags(Flags); 3055 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3056 3057 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3058 // aren't modeled as reading it. 
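  // The scaled Newton-Raphson iteration below relies on FP32 denormal values
  // being preserved, so if the function's mode flushes them, temporarily
  // enable FP32 denormals around the FMA chain and restore the mode after.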
3059 if (!Mode.allFP32Denormals()) 3060 toggleSPDenormMode(true, B, ST, Mode); 3061 3062 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3063 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3064 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3065 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3066 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3067 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3068 3069 if (!Mode.allFP32Denormals()) 3070 toggleSPDenormMode(false, B, ST, Mode); 3071 3072 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3073 .addUse(Fma4.getReg(0)) 3074 .addUse(Fma1.getReg(0)) 3075 .addUse(Fma3.getReg(0)) 3076 .addUse(NumeratorScaled.getReg(1)) 3077 .setMIFlags(Flags); 3078 3079 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3080 .addUse(Fmas.getReg(0)) 3081 .addUse(RHS) 3082 .addUse(LHS) 3083 .setMIFlags(Flags); 3084 3085 MI.eraseFromParent(); 3086 return true; 3087 } 3088 3089 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3090 MachineRegisterInfo &MRI, 3091 MachineIRBuilder &B) const { 3092 Register Res = MI.getOperand(0).getReg(); 3093 Register LHS = MI.getOperand(1).getReg(); 3094 Register RHS = MI.getOperand(2).getReg(); 3095 3096 uint16_t Flags = MI.getFlags(); 3097 3098 LLT S64 = LLT::scalar(64); 3099 LLT S1 = LLT::scalar(1); 3100 3101 auto One = B.buildFConstant(S64, 1.0); 3102 3103 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3104 .addUse(LHS) 3105 .addUse(RHS) 3106 .addImm(0) 3107 .setMIFlags(Flags); 3108 3109 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3110 3111 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3112 .addUse(DivScale0.getReg(0)) 3113 .setMIFlags(Flags); 3114 3115 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3116 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3117 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3118 3119 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3120 .addUse(LHS) 3121 .addUse(RHS) 3122 .addImm(1) 3123 .setMIFlags(Flags); 3124 3125 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3126 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3127 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3128 3129 Register Scale; 3130 if (!ST.hasUsableDivScaleConditionOutput()) { 3131 // Workaround a hardware bug on SI where the condition output from div_scale 3132 // is not usable. 
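    // Reconstruct the condition by comparing the high halves of the inputs
    // against the high halves of the div_scale results and xor-ing the two
    // compares; this replaces the unusable condition output as the operand
    // given to div_fmas.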
3133 3134 LLT S32 = LLT::scalar(32); 3135 3136 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3137 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3138 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3139 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3140 3141 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3142 Scale1Unmerge.getReg(1)); 3143 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3144 Scale0Unmerge.getReg(1)); 3145 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3146 } else { 3147 Scale = DivScale1.getReg(1); 3148 } 3149 3150 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3151 .addUse(Fma4.getReg(0)) 3152 .addUse(Fma3.getReg(0)) 3153 .addUse(Mul.getReg(0)) 3154 .addUse(Scale) 3155 .setMIFlags(Flags); 3156 3157 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3158 .addUse(Fmas.getReg(0)) 3159 .addUse(RHS) 3160 .addUse(LHS) 3161 .setMIFlags(Flags); 3162 3163 MI.eraseFromParent(); 3164 return true; 3165 } 3166 3167 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3168 MachineRegisterInfo &MRI, 3169 MachineIRBuilder &B) const { 3170 Register Res = MI.getOperand(0).getReg(); 3171 Register LHS = MI.getOperand(2).getReg(); 3172 Register RHS = MI.getOperand(3).getReg(); 3173 uint16_t Flags = MI.getFlags(); 3174 3175 LLT S32 = LLT::scalar(32); 3176 LLT S1 = LLT::scalar(1); 3177 3178 auto Abs = B.buildFAbs(S32, RHS, Flags); 3179 const APFloat C0Val(1.0f); 3180 3181 auto C0 = B.buildConstant(S32, 0x6f800000); 3182 auto C1 = B.buildConstant(S32, 0x2f800000); 3183 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3184 3185 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3186 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3187 3188 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3189 3190 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3191 .addUse(Mul0.getReg(0)) 3192 .setMIFlags(Flags); 3193 3194 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3195 3196 B.buildFMul(Res, Sel, Mul1, Flags); 3197 3198 MI.eraseFromParent(); 3199 return true; 3200 } 3201 3202 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3203 // FIXME: Why do we handle this one but not other removed instructions? 3204 // 3205 // Reciprocal square root. The clamp prevents infinite results, clamping 3206 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3207 // +-max_float. 3208 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3209 MachineRegisterInfo &MRI, 3210 MachineIRBuilder &B) const { 3211 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3212 return true; 3213 3214 Register Dst = MI.getOperand(0).getReg(); 3215 Register Src = MI.getOperand(2).getReg(); 3216 auto Flags = MI.getFlags(); 3217 3218 LLT Ty = MRI.getType(Dst); 3219 3220 const fltSemantics *FltSemantics; 3221 if (Ty == LLT::scalar(32)) 3222 FltSemantics = &APFloat::IEEEsingle(); 3223 else if (Ty == LLT::scalar(64)) 3224 FltSemantics = &APFloat::IEEEdouble(); 3225 else 3226 return false; 3227 3228 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3229 .addUse(Src) 3230 .setMIFlags(Flags); 3231 3232 // We don't need to concern ourselves with the snan handling difference, since 3233 // the rsq quieted (or not) so use the one which will directly select. 
3234 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3235 const bool UseIEEE = MFI->getMode().IEEE; 3236 3237 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3238 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3239 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3240 3241 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3242 3243 if (UseIEEE) 3244 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3245 else 3246 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3247 MI.eraseFromParent(); 3248 return true; 3249 } 3250 3251 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3252 switch (IID) { 3253 case Intrinsic::amdgcn_ds_fadd: 3254 return AMDGPU::G_ATOMICRMW_FADD; 3255 case Intrinsic::amdgcn_ds_fmin: 3256 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3257 case Intrinsic::amdgcn_ds_fmax: 3258 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3259 default: 3260 llvm_unreachable("not a DS FP intrinsic"); 3261 } 3262 } 3263 3264 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3265 MachineInstr &MI, 3266 Intrinsic::ID IID) const { 3267 GISelChangeObserver &Observer = Helper.Observer; 3268 Observer.changingInstr(MI); 3269 3270 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3271 3272 // The remaining operands were used to set fields in the MemOperand on 3273 // construction. 3274 for (int I = 6; I > 3; --I) 3275 MI.RemoveOperand(I); 3276 3277 MI.RemoveOperand(1); // Remove the intrinsic ID. 3278 Observer.changedInstr(MI); 3279 return true; 3280 } 3281 3282 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3283 MachineRegisterInfo &MRI, 3284 MachineIRBuilder &B) const { 3285 uint64_t Offset = 3286 ST.getTargetLowering()->getImplicitParameterOffset( 3287 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3288 LLT DstTy = MRI.getType(DstReg); 3289 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3290 3291 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3292 if (!loadInputValue(KernargPtrReg, B, 3293 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3294 return false; 3295 3296 // FIXME: This should be nuw 3297 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3298 return true; 3299 } 3300 3301 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3302 MachineRegisterInfo &MRI, 3303 MachineIRBuilder &B) const { 3304 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3305 if (!MFI->isEntryFunction()) { 3306 return legalizePreloadedArgIntrin(MI, MRI, B, 3307 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3308 } 3309 3310 Register DstReg = MI.getOperand(0).getReg(); 3311 if (!getImplicitArgPtr(DstReg, MRI, B)) 3312 return false; 3313 3314 MI.eraseFromParent(); 3315 return true; 3316 } 3317 3318 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3319 MachineRegisterInfo &MRI, 3320 MachineIRBuilder &B, 3321 unsigned AddrSpace) const { 3322 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3323 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3324 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3325 MI.eraseFromParent(); 3326 return true; 3327 } 3328 3329 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3330 // offset (the offset that is included in bounds checking and swizzling, to be 3331 // split between the instruction's voffset and immoffset fields) and soffset 3332 // (the offset that is 
excluded from bounds checking and swizzling, to go in 3333 // the instruction's soffset field). This function takes the first kind of 3334 // offset and figures out how to split it between voffset and immoffset. 3335 std::tuple<Register, unsigned, unsigned> 3336 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3337 Register OrigOffset) const { 3338 const unsigned MaxImm = 4095; 3339 Register BaseReg; 3340 unsigned TotalConstOffset; 3341 MachineInstr *OffsetDef; 3342 const LLT S32 = LLT::scalar(32); 3343 3344 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3345 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3346 3347 unsigned ImmOffset = TotalConstOffset; 3348 3349 // If the immediate value is too big for the immoffset field, put the value 3350 // and -4096 into the immoffset field so that the value that is copied/added 3351 // for the voffset field is a multiple of 4096, and it stands more chance 3352 // of being CSEd with the copy/add for another similar load/store. 3353 // However, do not do that rounding down to a multiple of 4096 if that is a 3354 // negative number, as it appears to be illegal to have a negative offset 3355 // in the vgpr, even if adding the immediate offset makes it positive. 3356 unsigned Overflow = ImmOffset & ~MaxImm; 3357 ImmOffset -= Overflow; 3358 if ((int32_t)Overflow < 0) { 3359 Overflow += ImmOffset; 3360 ImmOffset = 0; 3361 } 3362 3363 if (Overflow != 0) { 3364 if (!BaseReg) { 3365 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3366 } else { 3367 auto OverflowVal = B.buildConstant(S32, Overflow); 3368 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3369 } 3370 } 3371 3372 if (!BaseReg) 3373 BaseReg = B.buildConstant(S32, 0).getReg(0); 3374 3375 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3376 } 3377 3378 /// Handle register layout difference for f16 images for some subtargets. 3379 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3380 MachineRegisterInfo &MRI, 3381 Register Reg) const { 3382 if (!ST.hasUnpackedD16VMem()) 3383 return Reg; 3384 3385 const LLT S16 = LLT::scalar(16); 3386 const LLT S32 = LLT::scalar(32); 3387 LLT StoreVT = MRI.getType(Reg); 3388 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3389 3390 auto Unmerge = B.buildUnmerge(S16, Reg); 3391 3392 SmallVector<Register, 4> WideRegs; 3393 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3394 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3395 3396 int NumElts = StoreVT.getNumElements(); 3397 3398 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3399 } 3400 3401 Register AMDGPULegalizerInfo::fixStoreSourceType( 3402 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3403 MachineRegisterInfo *MRI = B.getMRI(); 3404 LLT Ty = MRI->getType(VData); 3405 3406 const LLT S16 = LLT::scalar(16); 3407 3408 // Fixup illegal register types for i8 stores. 
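  // For example (illustrative MIR; register names made up), an s8 or s16
  // store source becomes a 32-bit scalar:
  //   %wide:_(s32) = G_ANYEXT %vdata:_(s8)
  // so the store pseudo never sees an illegal 8- or 16-bit scalar source.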
3409 if (Ty == LLT::scalar(8) || Ty == S16) { 3410 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3411 return AnyExt; 3412 } 3413 3414 if (Ty.isVector()) { 3415 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3416 if (IsFormat) 3417 return handleD16VData(B, *MRI, VData); 3418 } 3419 } 3420 3421 return VData; 3422 } 3423 3424 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3425 MachineRegisterInfo &MRI, 3426 MachineIRBuilder &B, 3427 bool IsTyped, 3428 bool IsFormat) const { 3429 Register VData = MI.getOperand(1).getReg(); 3430 LLT Ty = MRI.getType(VData); 3431 LLT EltTy = Ty.getScalarType(); 3432 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3433 const LLT S32 = LLT::scalar(32); 3434 3435 VData = fixStoreSourceType(B, VData, IsFormat); 3436 Register RSrc = MI.getOperand(2).getReg(); 3437 3438 MachineMemOperand *MMO = *MI.memoperands_begin(); 3439 const int MemSize = MMO->getSize(); 3440 3441 unsigned ImmOffset; 3442 unsigned TotalOffset; 3443 3444 // The typed intrinsics add an immediate after the registers. 3445 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3446 3447 // The struct intrinsic variants add one additional operand over raw. 3448 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3449 Register VIndex; 3450 int OpOffset = 0; 3451 if (HasVIndex) { 3452 VIndex = MI.getOperand(3).getReg(); 3453 OpOffset = 1; 3454 } 3455 3456 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3457 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3458 3459 unsigned Format = 0; 3460 if (IsTyped) { 3461 Format = MI.getOperand(5 + OpOffset).getImm(); 3462 ++OpOffset; 3463 } 3464 3465 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3466 3467 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3468 if (TotalOffset != 0) 3469 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3470 3471 unsigned Opc; 3472 if (IsTyped) { 3473 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3474 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3475 } else if (IsFormat) { 3476 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3477 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3478 } else { 3479 switch (MemSize) { 3480 case 1: 3481 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3482 break; 3483 case 2: 3484 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3485 break; 3486 default: 3487 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3488 break; 3489 } 3490 } 3491 3492 if (!VIndex) 3493 VIndex = B.buildConstant(S32, 0).getReg(0); 3494 3495 auto MIB = B.buildInstr(Opc) 3496 .addUse(VData) // vdata 3497 .addUse(RSrc) // rsrc 3498 .addUse(VIndex) // vindex 3499 .addUse(VOffset) // voffset 3500 .addUse(SOffset) // soffset 3501 .addImm(ImmOffset); // offset(imm) 3502 3503 if (IsTyped) 3504 MIB.addImm(Format); 3505 3506 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3507 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3508 .addMemOperand(MMO); 3509 3510 MI.eraseFromParent(); 3511 return true; 3512 } 3513 3514 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3515 MachineRegisterInfo &MRI, 3516 MachineIRBuilder &B, 3517 bool IsFormat, 3518 bool IsTyped) const { 3519 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
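  // Operand layout assumed by the index arithmetic below (illustrative,
  // written as pseudo-MIR): after the intrinsic ID,
  //   raw:    %rsrc, %voffset, %soffset, aux(imm)
  //   struct: %rsrc, %vindex, %voffset, %soffset, aux(imm)
  // with the typed variants adding a format immediate before aux; that is
  // what the NumVIndexOps / OpOffset bookkeeping accounts for.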
3520 MachineMemOperand *MMO = *MI.memoperands_begin(); 3521 const int MemSize = MMO->getSize(); 3522 const LLT S32 = LLT::scalar(32); 3523 3524 Register Dst = MI.getOperand(0).getReg(); 3525 Register RSrc = MI.getOperand(2).getReg(); 3526 3527 // The typed intrinsics add an immediate after the registers. 3528 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3529 3530 // The struct intrinsic variants add one additional operand over raw. 3531 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3532 Register VIndex; 3533 int OpOffset = 0; 3534 if (HasVIndex) { 3535 VIndex = MI.getOperand(3).getReg(); 3536 OpOffset = 1; 3537 } 3538 3539 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3540 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3541 3542 unsigned Format = 0; 3543 if (IsTyped) { 3544 Format = MI.getOperand(5 + OpOffset).getImm(); 3545 ++OpOffset; 3546 } 3547 3548 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3549 unsigned ImmOffset; 3550 unsigned TotalOffset; 3551 3552 LLT Ty = MRI.getType(Dst); 3553 LLT EltTy = Ty.getScalarType(); 3554 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3555 const bool Unpacked = ST.hasUnpackedD16VMem(); 3556 3557 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3558 if (TotalOffset != 0) 3559 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3560 3561 unsigned Opc; 3562 3563 if (IsTyped) { 3564 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3565 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3566 } else if (IsFormat) { 3567 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3568 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3569 } else { 3570 switch (MemSize) { 3571 case 1: 3572 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3573 break; 3574 case 2: 3575 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3576 break; 3577 default: 3578 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3579 break; 3580 } 3581 } 3582 3583 Register LoadDstReg; 3584 3585 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3586 LLT UnpackedTy = Ty.changeElementSize(32); 3587 3588 if (IsExtLoad) 3589 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3590 else if (Unpacked && IsD16 && Ty.isVector()) 3591 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3592 else 3593 LoadDstReg = Dst; 3594 3595 if (!VIndex) 3596 VIndex = B.buildConstant(S32, 0).getReg(0); 3597 3598 auto MIB = B.buildInstr(Opc) 3599 .addDef(LoadDstReg) // vdata 3600 .addUse(RSrc) // rsrc 3601 .addUse(VIndex) // vindex 3602 .addUse(VOffset) // voffset 3603 .addUse(SOffset) // soffset 3604 .addImm(ImmOffset); // offset(imm) 3605 3606 if (IsTyped) 3607 MIB.addImm(Format); 3608 3609 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3610 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3611 .addMemOperand(MMO); 3612 3613 if (LoadDstReg != Dst) { 3614 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3615 3616 // Widen result for extending loads was widened. 
3617 if (IsExtLoad) 3618 B.buildTrunc(Dst, LoadDstReg); 3619 else { 3620 // Repack to original 16-bit vector result 3621 // FIXME: G_TRUNC should work, but legalization currently fails 3622 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3623 SmallVector<Register, 4> Repack; 3624 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3625 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3626 B.buildMerge(Dst, Repack); 3627 } 3628 } 3629 3630 MI.eraseFromParent(); 3631 return true; 3632 } 3633 3634 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3635 MachineIRBuilder &B, 3636 bool IsInc) const { 3637 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3638 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3639 B.buildInstr(Opc) 3640 .addDef(MI.getOperand(0).getReg()) 3641 .addUse(MI.getOperand(2).getReg()) 3642 .addUse(MI.getOperand(3).getReg()) 3643 .cloneMemRefs(MI); 3644 MI.eraseFromParent(); 3645 return true; 3646 } 3647 3648 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3649 switch (IntrID) { 3650 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3651 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3652 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3653 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3654 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3655 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3656 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3657 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3658 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3659 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3660 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3661 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3662 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3663 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3664 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3665 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3666 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3667 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3668 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3669 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3670 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3671 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3672 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3673 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3674 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3675 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3676 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3677 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3678 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3679 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3680 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3681 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3682 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3683 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3684 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3685 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3686 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3687 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3688 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3689 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3690 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3691 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3692 default: 3693 llvm_unreachable("unhandled atomic opcode"); 3694 } 3695 } 3696 3697 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3698 MachineIRBuilder &B, 3699 Intrinsic::ID IID) const { 3700 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3701 IID == 
Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3702 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3703 3704 Register Dst; 3705 3706 int OpOffset = 0; 3707 if (HasReturn) { 3708 // A few FP atomics do not support return values. 3709 Dst = MI.getOperand(0).getReg(); 3710 } else { 3711 OpOffset = -1; 3712 } 3713 3714 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3715 Register CmpVal; 3716 3717 if (IsCmpSwap) { 3718 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3719 ++OpOffset; 3720 } 3721 3722 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3723 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 3724 3725 // The struct intrinsic variants add one additional operand over raw. 3726 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3727 Register VIndex; 3728 if (HasVIndex) { 3729 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3730 ++OpOffset; 3731 } 3732 3733 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3734 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3735 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3736 3737 MachineMemOperand *MMO = *MI.memoperands_begin(); 3738 3739 unsigned ImmOffset; 3740 unsigned TotalOffset; 3741 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3742 if (TotalOffset != 0) 3743 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3744 3745 if (!VIndex) 3746 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3747 3748 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3749 3750 if (HasReturn) 3751 MIB.addDef(Dst); 3752 3753 MIB.addUse(VData); // vdata 3754 3755 if (IsCmpSwap) 3756 MIB.addReg(CmpVal); 3757 3758 MIB.addUse(RSrc) // rsrc 3759 .addUse(VIndex) // vindex 3760 .addUse(VOffset) // voffset 3761 .addUse(SOffset) // soffset 3762 .addImm(ImmOffset) // offset(imm) 3763 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3764 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3765 .addMemOperand(MMO); 3766 3767 MI.eraseFromParent(); 3768 return true; 3769 } 3770 3771 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3772 /// vector with s16 typed elements. 3773 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3774 SmallVectorImpl<Register> &PackedAddrs, 3775 int AddrIdx, int DimIdx, int EndIdx, 3776 int NumGradients) { 3777 const LLT S16 = LLT::scalar(16); 3778 const LLT V2S16 = LLT::vector(2, 16); 3779 3780 for (int I = AddrIdx; I < EndIdx; ++I) { 3781 MachineOperand &SrcOp = MI.getOperand(I); 3782 if (!SrcOp.isReg()) 3783 continue; // _L to _LZ may have eliminated this. 3784 3785 Register AddrReg = SrcOp.getReg(); 3786 3787 if (I < DimIdx) { 3788 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3789 PackedAddrs.push_back(AddrReg); 3790 } else { 3791 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3792 // derivatives dx/dh and dx/dv are packed with undef. 
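      // Illustrative packing (made-up operand names) for a 2-D sample with an
      // explicit lod: a16 coordinates {u, v, lod} become
      //   %p0:_(<2 x s16>) = G_BUILD_VECTOR %u, %v
      //   %p1:_(<2 x s16>) = G_BUILD_VECTOR %lod, %undef
      // i.e. pairs of s16 operands share a dword and the trailing odd operand
      // is padded with undef.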
3793 if (((I + 1) >= EndIdx) || 3794 ((NumGradients / 2) % 2 == 1 && 3795 (I == DimIdx + (NumGradients / 2) - 1 || 3796 I == DimIdx + NumGradients - 1)) || 3797 // Check for _L to _LZ optimization 3798 !MI.getOperand(I + 1).isReg()) { 3799 PackedAddrs.push_back( 3800 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3801 .getReg(0)); 3802 } else { 3803 PackedAddrs.push_back( 3804 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3805 .getReg(0)); 3806 ++I; 3807 } 3808 } 3809 } 3810 } 3811 3812 /// Convert from separate vaddr components to a single vector address register, 3813 /// and replace the remaining operands with $noreg. 3814 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3815 int DimIdx, int NumVAddrs) { 3816 const LLT S32 = LLT::scalar(32); 3817 3818 SmallVector<Register, 8> AddrRegs; 3819 for (int I = 0; I != NumVAddrs; ++I) { 3820 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3821 if (SrcOp.isReg()) { 3822 AddrRegs.push_back(SrcOp.getReg()); 3823 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3824 } 3825 } 3826 3827 int NumAddrRegs = AddrRegs.size(); 3828 if (NumAddrRegs != 1) { 3829 // Round up to 8 elements for v5-v7 3830 // FIXME: Missing intermediate sized register classes and instructions. 3831 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3832 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3833 auto Undef = B.buildUndef(S32); 3834 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3835 NumAddrRegs = RoundedNumRegs; 3836 } 3837 3838 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3839 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3840 } 3841 3842 for (int I = 1; I != NumVAddrs; ++I) { 3843 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3844 if (SrcOp.isReg()) 3845 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3846 } 3847 } 3848 3849 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3850 /// 3851 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 3852 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3853 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3854 /// registers. 3855 /// 3856 /// We don't want to directly select image instructions just yet, but also want 3857 /// to expose all register repacking to the legalizer/combiners. We also don't 3858 /// want a selected instruction entering RegBankSelect. In order to avoid 3859 /// defining a multitude of intermediate image instructions, directly hack on 3860 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 3861 /// the now-unnecessary arguments with $noreg. 3862 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3863 MachineInstr &MI, MachineIRBuilder &B, 3864 GISelChangeObserver &Observer, 3865 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3866 3867 const int NumDefs = MI.getNumExplicitDefs(); 3868 bool IsTFE = NumDefs == 2; 3869 // We are only processing the operands of d16 image operations on subtargets 3870 // that use the unpacked register layout, or need to repack the TFE result. 3871 3872 // TODO: Do we need to guard against already legalized intrinsics?
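  // Overall effect of this function (illustrative): an image intrinsic such
  // as llvm.amdgcn.image.load.2d is re-described as a
  // G_AMDGPU_INTRIN_IMAGE_LOAD (or G_AMDGPU_INTRIN_IMAGE_STORE) whose
  // address and data operands have been repacked into the register layout
  // the subtarget expects, leaving actual instruction selection for later.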
3873 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3874 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3875 3876 MachineRegisterInfo *MRI = B.getMRI(); 3877 const LLT S32 = LLT::scalar(32); 3878 const LLT S16 = LLT::scalar(16); 3879 const LLT V2S16 = LLT::vector(2, 16); 3880 3881 // Index of first address argument 3882 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3883 3884 int NumVAddrs, NumGradients; 3885 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3886 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3887 getDMaskIdx(BaseOpcode, NumDefs); 3888 unsigned DMask = 0; 3889 3890 // Check for 16 bit addresses and pack if true. 3891 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3892 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3893 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3894 const bool IsG16 = GradTy == S16; 3895 const bool IsA16 = AddrTy == S16; 3896 3897 int DMaskLanes = 0; 3898 if (!BaseOpcode->Atomic) { 3899 DMask = MI.getOperand(DMaskIdx).getImm(); 3900 if (BaseOpcode->Gather4) { 3901 DMaskLanes = 4; 3902 } else if (DMask != 0) { 3903 DMaskLanes = countPopulation(DMask); 3904 } else if (!IsTFE && !BaseOpcode->Store) { 3905 // If dmask is 0, this is a no-op load. This can be eliminated. 3906 B.buildUndef(MI.getOperand(0)); 3907 MI.eraseFromParent(); 3908 return true; 3909 } 3910 } 3911 3912 Observer.changingInstr(MI); 3913 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3914 3915 unsigned NewOpcode = NumDefs == 0 ? 3916 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3917 3918 // Track that we legalized this 3919 MI.setDesc(B.getTII().get(NewOpcode)); 3920 3921 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3922 // dmask to be at least 1 otherwise the instruction will fail 3923 if (IsTFE && DMask == 0) { 3924 DMask = 0x1; 3925 DMaskLanes = 1; 3926 MI.getOperand(DMaskIdx).setImm(DMask); 3927 } 3928 3929 if (BaseOpcode->Atomic) { 3930 Register VData0 = MI.getOperand(2).getReg(); 3931 LLT Ty = MRI->getType(VData0); 3932 3933 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3934 if (Ty.isVector()) 3935 return false; 3936 3937 if (BaseOpcode->AtomicX2) { 3938 Register VData1 = MI.getOperand(3).getReg(); 3939 // The two values are packed in one register. 3940 LLT PackedTy = LLT::vector(2, Ty); 3941 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3942 MI.getOperand(2).setReg(Concat.getReg(0)); 3943 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3944 } 3945 } 3946 3947 int CorrectedNumVAddrs = NumVAddrs; 3948 3949 // Optimize _L to _LZ when _L is zero 3950 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3951 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3952 const ConstantFP *ConstantLod; 3953 const int LodIdx = AddrIdx + NumVAddrs - 1; 3954 3955 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3956 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3957 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3958 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3959 LZMappingInfo->LZ, ImageDimIntr->Dim); 3960 3961 // The starting indexes should remain in the same place. 
3962 --NumVAddrs; 3963 --CorrectedNumVAddrs; 3964 3965 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3966 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3967 MI.RemoveOperand(LodIdx); 3968 } 3969 } 3970 } 3971 3972 // Optimize _mip away, when 'lod' is zero 3973 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3974 int64_t ConstantLod; 3975 const int LodIdx = AddrIdx + NumVAddrs - 1; 3976 3977 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3978 if (ConstantLod == 0) { 3979 // TODO: Change intrinsic opcode and remove operand instead or replacing 3980 // it with 0, as the _L to _LZ handling is done above. 3981 MI.getOperand(LodIdx).ChangeToImmediate(0); 3982 --CorrectedNumVAddrs; 3983 } 3984 } 3985 } 3986 3987 // Rewrite the addressing register layout before doing anything else. 3988 if (IsA16 || IsG16) { 3989 if (IsA16) { 3990 // Target must support the feature and gradients need to be 16 bit too 3991 if (!ST.hasA16() || !IsG16) 3992 return false; 3993 } else if (!ST.hasG16()) 3994 return false; 3995 3996 if (NumVAddrs > 1) { 3997 SmallVector<Register, 4> PackedRegs; 3998 // Don't compress addresses for G16 3999 const int PackEndIdx = 4000 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 4001 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 4002 PackEndIdx, NumGradients); 4003 4004 if (!IsA16) { 4005 // Add uncompressed address 4006 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 4007 int AddrReg = MI.getOperand(I).getReg(); 4008 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4009 PackedRegs.push_back(AddrReg); 4010 } 4011 } 4012 4013 // See also below in the non-a16 branch 4014 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4015 4016 if (!UseNSA && PackedRegs.size() > 1) { 4017 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4018 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4019 PackedRegs[0] = Concat.getReg(0); 4020 PackedRegs.resize(1); 4021 } 4022 4023 const int NumPacked = PackedRegs.size(); 4024 for (int I = 0; I != NumVAddrs; ++I) { 4025 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 4026 if (!SrcOp.isReg()) { 4027 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4028 continue; 4029 } 4030 4031 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4032 4033 if (I < NumPacked) 4034 SrcOp.setReg(PackedRegs[I]); 4035 else 4036 SrcOp.setReg(AMDGPU::NoRegister); 4037 } 4038 } 4039 } else { 4040 // If the register allocator cannot place the address registers contiguously 4041 // without introducing moves, then using the non-sequential address encoding 4042 // is always preferable, since it saves VALU instructions and is usually a 4043 // wash in terms of code size or even better. 4044 // 4045 // However, we currently have no way of hinting to the register allocator 4046 // that MIMG addresses should be placed contiguously when it is possible to 4047 // do so, so force non-NSA for the common 2-address case as a heuristic. 4048 // 4049 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4050 // allocation when possible. 4051 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4052 4053 if (!UseNSA && NumVAddrs > 1) 4054 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 4055 } 4056 4057 int Flags = 0; 4058 if (IsA16) 4059 Flags |= 1; 4060 if (IsG16) 4061 Flags |= 2; 4062 MI.addOperand(MachineOperand::CreateImm(Flags)); 4063 4064 if (BaseOpcode->Store) { // No TFE for stores? 
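    // Illustrative d16 store fixup (types only; see handleD16VData): on a
    // subtarget with unpacked d16 memory instructions, <4 x s16> vdata is
    // unmerged, each element any-extended to s32, and rebuilt as <4 x s32>
    // before being fed to the store.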
4065 // TODO: Handle dmask trim 4066 Register VData = MI.getOperand(1).getReg(); 4067 LLT Ty = MRI->getType(VData); 4068 if (!Ty.isVector() || Ty.getElementType() != S16) 4069 return true; 4070 4071 Register RepackedReg = handleD16VData(B, *MRI, VData); 4072 if (RepackedReg != VData) { 4073 MI.getOperand(1).setReg(RepackedReg); 4074 } 4075 4076 return true; 4077 } 4078 4079 Register DstReg = MI.getOperand(0).getReg(); 4080 LLT Ty = MRI->getType(DstReg); 4081 const LLT EltTy = Ty.getScalarType(); 4082 const bool IsD16 = Ty.getScalarType() == S16; 4083 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4084 4085 // Confirm that the return type is large enough for the dmask specified 4086 if (NumElts < DMaskLanes) 4087 return false; 4088 4089 if (NumElts > 4 || DMaskLanes > 4) 4090 return false; 4091 4092 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4093 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4094 4095 // The raw dword aligned data component of the load. The only legal cases 4096 // where this matters should be when using the packed D16 format, for 4097 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4098 LLT RoundedTy; 4099 4100 // S32 vector to cover all data, plus TFE result element. 4101 LLT TFETy; 4102 4103 // Register type to use for each loaded component. Will be S32 or V2S16. 4104 LLT RegTy; 4105 4106 if (IsD16 && ST.hasUnpackedD16VMem()) { 4107 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4108 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4109 RegTy = S32; 4110 } else { 4111 unsigned EltSize = EltTy.getSizeInBits(); 4112 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4113 unsigned RoundedSize = 32 * RoundedElts; 4114 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4115 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4116 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4117 } 4118 4119 // The return type does not need adjustment. 4120 // TODO: Should we change s16 case to s32 or <2 x s16>? 4121 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4122 return true; 4123 4124 Register Dst1Reg; 4125 4126 // Insert after the instruction. 4127 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4128 4129 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4130 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4131 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4132 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4133 4134 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4135 4136 MI.getOperand(0).setReg(NewResultReg); 4137 4138 // In the IR, TFE is supposed to be used with a 2 element struct return 4139 // type. The instruction really returns these two values in one contiguous 4140 // register, with one additional dword beyond the loaded data. Rewrite the 4141 // return type to use a single register result. 4142 4143 if (IsTFE) { 4144 Dst1Reg = MI.getOperand(1).getReg(); 4145 if (MRI->getType(Dst1Reg) != S32) 4146 return false; 4147 4148 // TODO: Make sure the TFE operand bit is set. 4149 MI.RemoveOperand(1); 4150 4151 // Handle the easy case that requires no repack instructions. 4152 if (Ty == S32) { 4153 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4154 return true; 4155 } 4156 } 4157 4158 // Now figure out how to copy the new result register back into the old 4159 // result. 4160 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4161 4162 const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 4163 4164 if (ResultNumRegs == 1) { 4165 assert(!IsTFE); 4166 ResultRegs[0] = NewResultReg; 4167 } else { 4168 // We have to repack into a new vector of some kind. 4169 for (int I = 0; I != NumDataRegs; ++I) 4170 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4171 B.buildUnmerge(ResultRegs, NewResultReg); 4172 4173 // Drop the final TFE element to get the data part. The TFE result is 4174 // directly written to the right place already. 4175 if (IsTFE) 4176 ResultRegs.resize(NumDataRegs); 4177 } 4178 4179 // For an s16 scalar result, we form an s32 result with a truncate regardless 4180 // of packed vs. unpacked. 4181 if (IsD16 && !Ty.isVector()) { 4182 B.buildTrunc(DstReg, ResultRegs[0]); 4183 return true; 4184 } 4185 4186 // Avoid a build/concat_vector of 1 entry. 4187 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4188 B.buildBitcast(DstReg, ResultRegs[0]); 4189 return true; 4190 } 4191 4192 assert(Ty.isVector()); 4193 4194 if (IsD16) { 4195 // For packed D16 results with TFE enabled, all the data components are 4196 // S32. Cast back to the expected type. 4197 // 4198 // TODO: We don't really need to use load s32 elements. We would only need one 4199 // cast for the TFE result if a multiple of v2s16 was used. 4200 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4201 for (Register &Reg : ResultRegs) 4202 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4203 } else if (ST.hasUnpackedD16VMem()) { 4204 for (Register &Reg : ResultRegs) 4205 Reg = B.buildTrunc(S16, Reg).getReg(0); 4206 } 4207 } 4208 4209 auto padWithUndef = [&](LLT Ty, int NumElts) { 4210 if (NumElts == 0) 4211 return; 4212 Register Undef = B.buildUndef(Ty).getReg(0); 4213 for (int I = 0; I != NumElts; ++I) 4214 ResultRegs.push_back(Undef); 4215 }; 4216 4217 // Pad out any elements eliminated due to the dmask. 4218 LLT ResTy = MRI->getType(ResultRegs[0]); 4219 if (!ResTy.isVector()) { 4220 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4221 B.buildBuildVector(DstReg, ResultRegs); 4222 return true; 4223 } 4224 4225 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4226 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4227 4228 // Deal with the one annoying legal case. 4229 const LLT V3S16 = LLT::vector(3, 16); 4230 if (Ty == V3S16) { 4231 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4232 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4233 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4234 return true; 4235 } 4236 4237 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4238 B.buildConcatVectors(DstReg, ResultRegs); 4239 return true; 4240 } 4241 4242 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4243 LegalizerHelper &Helper, MachineInstr &MI) const { 4244 MachineIRBuilder &B = Helper.MIRBuilder; 4245 GISelChangeObserver &Observer = Helper.Observer; 4246 4247 Register Dst = MI.getOperand(0).getReg(); 4248 LLT Ty = B.getMRI()->getType(Dst); 4249 unsigned Size = Ty.getSizeInBits(); 4250 MachineFunction &MF = B.getMF(); 4251 4252 Observer.changingInstr(MI); 4253 4254 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4255 Ty = getBitcastRegisterType(Ty); 4256 Helper.bitcastDst(MI, Ty, 0); 4257 Dst = MI.getOperand(0).getReg(); 4258 B.setInsertPt(B.getMBB(), MI); 4259 } 4260 4261 // FIXME: We don't really need this intermediate instruction. The intrinsic 4262 // should be fixed to have a memory operand. Since it's readnone, we're not 4263 // allowed to add one. 
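  // Sketch of the rewrite below (illustrative types): a 96-bit result such as
  //   %dst:_(<3 x s32>) = llvm.amdgcn.s.buffer.load ...
  // becomes a G_AMDGPU_S_BUFFER_LOAD carrying a synthesized invariant MMO,
  // and its destination is widened to <4 x s32>, since only power-of-2
  // result sizes are kept.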
4264 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4265 MI.RemoveOperand(1); // Remove intrinsic ID 4266 4267 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4268 // TODO: Should this use datalayout alignment? 4269 const unsigned MemSize = (Size + 7) / 8; 4270 const Align MemAlign(4); 4271 MachineMemOperand *MMO = MF.getMachineMemOperand( 4272 MachinePointerInfo(), 4273 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4274 MachineMemOperand::MOInvariant, 4275 MemSize, MemAlign); 4276 MI.addMemOperand(MF, MMO); 4277 4278 // There are no 96-bit result scalar loads, but widening to 128-bit should 4279 // always be legal. We may need to restore this to a 96-bit result if it turns 4280 // out this needs to be converted to a vector load during RegBankSelect. 4281 if (!isPowerOf2_32(Size)) { 4282 if (Ty.isVector()) 4283 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4284 else 4285 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4286 } 4287 4288 Observer.changedInstr(MI); 4289 return true; 4290 } 4291 4292 // TODO: Move to selection 4293 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4294 MachineRegisterInfo &MRI, 4295 MachineIRBuilder &B) const { 4296 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4297 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4298 !ST.isTrapHandlerEnabled()) { 4299 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4300 } else { 4301 // Pass queue pointer to trap handler as input, and insert trap instruction 4302 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4303 MachineRegisterInfo &MRI = *B.getMRI(); 4304 4305 Register LiveIn = 4306 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4307 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4308 return false; 4309 4310 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4311 B.buildCopy(SGPR01, LiveIn); 4312 B.buildInstr(AMDGPU::S_TRAP) 4313 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4314 .addReg(SGPR01, RegState::Implicit); 4315 } 4316 4317 MI.eraseFromParent(); 4318 return true; 4319 } 4320 4321 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4322 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4323 // Is non-HSA path or trap-handler disabled? then, report a warning 4324 // accordingly 4325 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4326 !ST.isTrapHandlerEnabled()) { 4327 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4328 "debugtrap handler not supported", 4329 MI.getDebugLoc(), DS_Warning); 4330 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4331 Ctx.diagnose(NoTrap); 4332 } else { 4333 // Insert debug-trap instruction 4334 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4335 } 4336 4337 MI.eraseFromParent(); 4338 return true; 4339 } 4340 4341 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4342 MachineInstr &MI) const { 4343 MachineIRBuilder &B = Helper.MIRBuilder; 4344 MachineRegisterInfo &MRI = *B.getMRI(); 4345 4346 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
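  // For example (an illustrative sketch of the IRTranslator output being
  // matched, with made-up names):
  //   %cond:_(s1), %mask = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %src
  //   G_BRCOND %cond, %bb.inner
  // The amdgcn_if/else/loop cases below replace that pair with SI_IF /
  // SI_ELSE / SI_LOOP plus an explicit branch, swapping the branch targets
  // as noted in the comments inside each case.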
4347 auto IntrID = MI.getIntrinsicID(); 4348 switch (IntrID) { 4349 case Intrinsic::amdgcn_if: 4350 case Intrinsic::amdgcn_else: { 4351 MachineInstr *Br = nullptr; 4352 MachineBasicBlock *UncondBrTarget = nullptr; 4353 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4354 const SIRegisterInfo *TRI 4355 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4356 4357 Register Def = MI.getOperand(1).getReg(); 4358 Register Use = MI.getOperand(3).getReg(); 4359 4360 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4361 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4362 if (IntrID == Intrinsic::amdgcn_if) { 4363 B.buildInstr(AMDGPU::SI_IF) 4364 .addDef(Def) 4365 .addUse(Use) 4366 .addMBB(UncondBrTarget); 4367 } else { 4368 B.buildInstr(AMDGPU::SI_ELSE) 4369 .addDef(Def) 4370 .addUse(Use) 4371 .addMBB(UncondBrTarget) 4372 .addImm(0); 4373 } 4374 4375 if (Br) { 4376 Br->getOperand(0).setMBB(CondBrTarget); 4377 } else { 4378 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4379 // since we're swapping branch targets it needs to be reinserted. 4380 // FIXME: IRTranslator should probably not do this 4381 B.buildBr(*CondBrTarget); 4382 } 4383 4384 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4385 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4386 MI.eraseFromParent(); 4387 BrCond->eraseFromParent(); 4388 return true; 4389 } 4390 4391 return false; 4392 } 4393 case Intrinsic::amdgcn_loop: { 4394 MachineInstr *Br = nullptr; 4395 MachineBasicBlock *UncondBrTarget = nullptr; 4396 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4397 const SIRegisterInfo *TRI 4398 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4399 4400 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4401 Register Reg = MI.getOperand(2).getReg(); 4402 4403 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4404 B.buildInstr(AMDGPU::SI_LOOP) 4405 .addUse(Reg) 4406 .addMBB(UncondBrTarget); 4407 4408 if (Br) 4409 Br->getOperand(0).setMBB(CondBrTarget); 4410 else 4411 B.buildBr(*CondBrTarget); 4412 4413 MI.eraseFromParent(); 4414 BrCond->eraseFromParent(); 4415 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4416 return true; 4417 } 4418 4419 return false; 4420 } 4421 case Intrinsic::amdgcn_kernarg_segment_ptr: 4422 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4423 // This only makes sense to call in a kernel, so just lower to null. 
4424 B.buildConstant(MI.getOperand(0).getReg(), 0); 4425 MI.eraseFromParent(); 4426 return true; 4427 } 4428 4429 return legalizePreloadedArgIntrin( 4430 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4431 case Intrinsic::amdgcn_implicitarg_ptr: 4432 return legalizeImplicitArgPtr(MI, MRI, B); 4433 case Intrinsic::amdgcn_workitem_id_x: 4434 return legalizePreloadedArgIntrin(MI, MRI, B, 4435 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4436 case Intrinsic::amdgcn_workitem_id_y: 4437 return legalizePreloadedArgIntrin(MI, MRI, B, 4438 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4439 case Intrinsic::amdgcn_workitem_id_z: 4440 return legalizePreloadedArgIntrin(MI, MRI, B, 4441 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4442 case Intrinsic::amdgcn_workgroup_id_x: 4443 return legalizePreloadedArgIntrin(MI, MRI, B, 4444 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4445 case Intrinsic::amdgcn_workgroup_id_y: 4446 return legalizePreloadedArgIntrin(MI, MRI, B, 4447 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4448 case Intrinsic::amdgcn_workgroup_id_z: 4449 return legalizePreloadedArgIntrin(MI, MRI, B, 4450 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4451 case Intrinsic::amdgcn_dispatch_ptr: 4452 return legalizePreloadedArgIntrin(MI, MRI, B, 4453 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4454 case Intrinsic::amdgcn_queue_ptr: 4455 return legalizePreloadedArgIntrin(MI, MRI, B, 4456 AMDGPUFunctionArgInfo::QUEUE_PTR); 4457 case Intrinsic::amdgcn_implicit_buffer_ptr: 4458 return legalizePreloadedArgIntrin( 4459 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4460 case Intrinsic::amdgcn_dispatch_id: 4461 return legalizePreloadedArgIntrin(MI, MRI, B, 4462 AMDGPUFunctionArgInfo::DISPATCH_ID); 4463 case Intrinsic::amdgcn_fdiv_fast: 4464 return legalizeFDIVFastIntrin(MI, MRI, B); 4465 case Intrinsic::amdgcn_is_shared: 4466 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4467 case Intrinsic::amdgcn_is_private: 4468 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4469 case Intrinsic::amdgcn_wavefrontsize: { 4470 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4471 MI.eraseFromParent(); 4472 return true; 4473 } 4474 case Intrinsic::amdgcn_s_buffer_load: 4475 return legalizeSBufferLoad(Helper, MI); 4476 case Intrinsic::amdgcn_raw_buffer_store: 4477 case Intrinsic::amdgcn_struct_buffer_store: 4478 return legalizeBufferStore(MI, MRI, B, false, false); 4479 case Intrinsic::amdgcn_raw_buffer_store_format: 4480 case Intrinsic::amdgcn_struct_buffer_store_format: 4481 return legalizeBufferStore(MI, MRI, B, false, true); 4482 case Intrinsic::amdgcn_raw_tbuffer_store: 4483 case Intrinsic::amdgcn_struct_tbuffer_store: 4484 return legalizeBufferStore(MI, MRI, B, true, true); 4485 case Intrinsic::amdgcn_raw_buffer_load: 4486 case Intrinsic::amdgcn_struct_buffer_load: 4487 return legalizeBufferLoad(MI, MRI, B, false, false); 4488 case Intrinsic::amdgcn_raw_buffer_load_format: 4489 case Intrinsic::amdgcn_struct_buffer_load_format: 4490 return legalizeBufferLoad(MI, MRI, B, true, false); 4491 case Intrinsic::amdgcn_raw_tbuffer_load: 4492 case Intrinsic::amdgcn_struct_tbuffer_load: 4493 return legalizeBufferLoad(MI, MRI, B, true, true); 4494 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4495 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4496 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4497 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4498 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4499 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4500 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4501 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4502 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4503 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4504 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4505 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4506 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4507 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4508 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4509 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4510 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4511 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4512 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4513 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4514 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4515 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4516 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4517 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4518 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4519 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4520 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4521 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4522 return legalizeBufferAtomic(MI, B, IntrID); 4523 case Intrinsic::amdgcn_atomic_inc: 4524 return legalizeAtomicIncDec(MI, B, true); 4525 case Intrinsic::amdgcn_atomic_dec: 4526 return legalizeAtomicIncDec(MI, B, false); 4527 case Intrinsic::trap: 4528 return legalizeTrapIntrinsic(MI, MRI, B); 4529 case Intrinsic::debugtrap: 4530 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4531 case Intrinsic::amdgcn_rsq_clamp: 4532 return legalizeRsqClampIntrinsic(MI, MRI, B); 4533 case Intrinsic::amdgcn_ds_fadd: 4534 case Intrinsic::amdgcn_ds_fmin: 4535 case Intrinsic::amdgcn_ds_fmax: 4536 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4537 default: { 4538 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4539 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4540 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4541 return true; 4542 } 4543 } 4544 4545 return true; 4546 } 4547