//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
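    // Shrink the vector so that a single piece spans at most 64 bits, e.g.
    // <5 x s16> (80 bits) covers 2 pieces, so the new type is <3 x s16>.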
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
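// For example s32, s96, v2s16, v4s16, v2s32 and v4s64 qualify, while v3s16
// (48 bits) and s40 do not.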
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
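      // Any remaining non-power-of-2 register count (e.g. 5 registers for a
      // 160-bit access) cannot be selected as a single load/store and must be
      // split.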
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    Actions
      .customIf(typeIs(1, Constant32Ptr))
      // Widen suitably aligned loads by loading extra elements.
      .moreElementsIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, moreElementsToNextPow2(0))
      .widenScalarIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && !Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, widenScalarOrEltToNextPow2(0))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
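  // (16-bit shifts read 4 amount bits, 32-bit shifts read 5, and 64-bit shifts
  // read 6.)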
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
                 bitcastToVectorElement32(1))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      // TODO: Clamp the number of elements before resorting to stack lowering.
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
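    // The high half of a constant-32-bit address is a known per-function
    // constant (see the CONSTANT_ADDRESS_32BIT source case below), so only the
    // low 32 bits of the source pointer need to be kept.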
1730 B.buildExtract(Dst, Src, 0); 1731 MI.eraseFromParent(); 1732 return true; 1733 } 1734 1735 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1736 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1737 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1738 1739 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1740 // another. Merge operands are required to be the same type, but creating an 1741 // extra ptrtoint would be kind of pointless. 1742 auto HighAddr = B.buildConstant( 1743 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1744 B.buildMerge(Dst, {Src, HighAddr}); 1745 MI.eraseFromParent(); 1746 return true; 1747 } 1748 1749 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1750 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1751 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1752 unsigned NullVal = TM.getNullPointerValue(DestAS); 1753 1754 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1755 auto FlatNull = B.buildConstant(SrcTy, 0); 1756 1757 // Extract low 32-bits of the pointer. 1758 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1759 1760 auto CmpRes = 1761 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1762 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1763 1764 MI.eraseFromParent(); 1765 return true; 1766 } 1767 1768 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1769 return false; 1770 1771 if (!ST.hasFlatAddressSpace()) 1772 return false; 1773 1774 auto SegmentNull = 1775 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1776 auto FlatNull = 1777 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1778 1779 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1780 if (!ApertureReg.isValid()) 1781 return false; 1782 1783 auto CmpRes = 1784 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1785 1786 // Coerce the type of the low half of the result so we can use merge_values. 1787 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1788 1789 // TODO: Should we allow mismatched types but matching sizes in merges to 1790 // avoid the ptrtoint? 1791 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1792 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1793 1794 MI.eraseFromParent(); 1795 return true; 1796 } 1797 1798 bool AMDGPULegalizerInfo::legalizeFrint( 1799 MachineInstr &MI, MachineRegisterInfo &MRI, 1800 MachineIRBuilder &B) const { 1801 Register Src = MI.getOperand(1).getReg(); 1802 LLT Ty = MRI.getType(Src); 1803 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1804 1805 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1806 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1807 1808 auto C1 = B.buildFConstant(Ty, C1Val); 1809 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1810 1811 // TODO: Should this propagate fast-math-flags? 
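  // Adding and then subtracting 2^52 with the sign of the input pushes the
  // fractional bits out of the f64 mantissa, leaving the input rounded to an
  // integer. Values whose magnitude is already at least 2^52 have no
  // fractional bits and are returned unchanged by the final select.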
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
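  // The sign bit came from the high 32 bits of the source, so place it in the
  // high half of the 64-bit value.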
1898 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1899 1900 auto Shr = B.buildAShr(S64, FractMask, Exp); 1901 auto Not = B.buildNot(S64, Shr); 1902 auto Tmp0 = B.buildAnd(S64, Src, Not); 1903 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1904 1905 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1906 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1907 1908 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1909 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1910 MI.eraseFromParent(); 1911 return true; 1912 } 1913 1914 bool AMDGPULegalizerInfo::legalizeITOFP( 1915 MachineInstr &MI, MachineRegisterInfo &MRI, 1916 MachineIRBuilder &B, bool Signed) const { 1917 1918 Register Dst = MI.getOperand(0).getReg(); 1919 Register Src = MI.getOperand(1).getReg(); 1920 1921 const LLT S64 = LLT::scalar(64); 1922 const LLT S32 = LLT::scalar(32); 1923 1924 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1925 1926 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1927 1928 auto CvtHi = Signed ? 1929 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1930 B.buildUITOFP(S64, Unmerge.getReg(1)); 1931 1932 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1933 1934 auto ThirtyTwo = B.buildConstant(S32, 32); 1935 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1936 .addUse(CvtHi.getReg(0)) 1937 .addUse(ThirtyTwo.getReg(0)); 1938 1939 // TODO: Should this propagate fast-math-flags? 1940 B.buildFAdd(Dst, LdExp, CvtLo); 1941 MI.eraseFromParent(); 1942 return true; 1943 } 1944 1945 // TODO: Copied from DAG implementation. Verify logic and document how this 1946 // actually works. 1947 bool AMDGPULegalizerInfo::legalizeFPTOI( 1948 MachineInstr &MI, MachineRegisterInfo &MRI, 1949 MachineIRBuilder &B, bool Signed) const { 1950 1951 Register Dst = MI.getOperand(0).getReg(); 1952 Register Src = MI.getOperand(1).getReg(); 1953 1954 const LLT S64 = LLT::scalar(64); 1955 const LLT S32 = LLT::scalar(32); 1956 1957 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1958 1959 unsigned Flags = MI.getFlags(); 1960 1961 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1962 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1963 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1964 1965 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1966 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1967 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1968 1969 auto Hi = Signed ? 
1970 B.buildFPTOSI(S32, FloorMul) : 1971 B.buildFPTOUI(S32, FloorMul); 1972 auto Lo = B.buildFPTOUI(S32, Fma); 1973 1974 B.buildMerge(Dst, { Lo, Hi }); 1975 MI.eraseFromParent(); 1976 1977 return true; 1978 } 1979 1980 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1981 MachineInstr &MI) const { 1982 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1983 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1984 1985 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1986 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1987 1988 // With ieee_mode disabled, the instructions have the correct behavior 1989 // already for G_FMINNUM/G_FMAXNUM 1990 if (!MFI->getMode().IEEE) 1991 return !IsIEEEOp; 1992 1993 if (IsIEEEOp) 1994 return true; 1995 1996 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1997 } 1998 1999 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2000 MachineInstr &MI, MachineRegisterInfo &MRI, 2001 MachineIRBuilder &B) const { 2002 // TODO: Should move some of this into LegalizerHelper. 2003 2004 // TODO: Promote dynamic indexing of s16 to s32 2005 2006 // FIXME: Artifact combiner probably should have replaced the truncated 2007 // constant before this, so we shouldn't need 2008 // getConstantVRegValWithLookThrough. 2009 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2010 MI.getOperand(2).getReg(), MRI); 2011 if (!IdxVal) // Dynamic case will be selected to register indexing. 2012 return true; 2013 2014 Register Dst = MI.getOperand(0).getReg(); 2015 Register Vec = MI.getOperand(1).getReg(); 2016 2017 LLT VecTy = MRI.getType(Vec); 2018 LLT EltTy = VecTy.getElementType(); 2019 assert(EltTy == MRI.getType(Dst)); 2020 2021 if (IdxVal->Value < VecTy.getNumElements()) 2022 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2023 else 2024 B.buildUndef(Dst); 2025 2026 MI.eraseFromParent(); 2027 return true; 2028 } 2029 2030 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2031 MachineInstr &MI, MachineRegisterInfo &MRI, 2032 MachineIRBuilder &B) const { 2033 // TODO: Should move some of this into LegalizerHelper. 2034 2035 // TODO: Promote dynamic indexing of s16 to s32 2036 2037 // FIXME: Artifact combiner probably should have replaced the truncated 2038 // constant before this, so we shouldn't need 2039 // getConstantVRegValWithLookThrough. 2040 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2041 MI.getOperand(3).getReg(), MRI); 2042 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2043 return true; 2044 2045 Register Dst = MI.getOperand(0).getReg(); 2046 Register Vec = MI.getOperand(1).getReg(); 2047 Register Ins = MI.getOperand(2).getReg(); 2048 2049 LLT VecTy = MRI.getType(Vec); 2050 LLT EltTy = VecTy.getElementType(); 2051 assert(EltTy == MRI.getType(Ins)); 2052 2053 if (IdxVal->Value < VecTy.getNumElements()) 2054 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2055 else 2056 B.buildUndef(Dst); 2057 2058 MI.eraseFromParent(); 2059 return true; 2060 } 2061 2062 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2063 MachineInstr &MI, MachineRegisterInfo &MRI, 2064 MachineIRBuilder &B) const { 2065 const LLT V2S16 = LLT::vector(2, 16); 2066 2067 Register Dst = MI.getOperand(0).getReg(); 2068 Register Src0 = MI.getOperand(1).getReg(); 2069 LLT DstTy = MRI.getType(Dst); 2070 LLT SrcTy = MRI.getType(Src0); 2071 2072 if (SrcTy == V2S16 && DstTy == V2S16 && 2073 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2074 return true; 2075 2076 MachineIRBuilder HelperBuilder(MI); 2077 GISelObserverWrapper DummyObserver; 2078 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2079 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2080 } 2081 2082 bool AMDGPULegalizerInfo::legalizeSinCos( 2083 MachineInstr &MI, MachineRegisterInfo &MRI, 2084 MachineIRBuilder &B) const { 2085 2086 Register DstReg = MI.getOperand(0).getReg(); 2087 Register SrcReg = MI.getOperand(1).getReg(); 2088 LLT Ty = MRI.getType(DstReg); 2089 unsigned Flags = MI.getFlags(); 2090 2091 Register TrigVal; 2092 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2093 if (ST.hasTrigReducedRange()) { 2094 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2095 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2096 .addUse(MulVal.getReg(0)) 2097 .setMIFlags(Flags).getReg(0); 2098 } else 2099 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2100 2101 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2102 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2103 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2104 .addUse(TrigVal) 2105 .setMIFlags(Flags); 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2111 MachineIRBuilder &B, 2112 const GlobalValue *GV, 2113 int64_t Offset, 2114 unsigned GAFlags) const { 2115 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2116 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2117 // to the following code sequence: 2118 // 2119 // For constant address space: 2120 // s_getpc_b64 s[0:1] 2121 // s_add_u32 s0, s0, $symbol 2122 // s_addc_u32 s1, s1, 0 2123 // 2124 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2125 // a fixup or relocation is emitted to replace $symbol with a literal 2126 // constant, which is a pc-relative offset from the encoding of the $symbol 2127 // operand to the global variable. 
2128 // 2129 // For global address space: 2130 // s_getpc_b64 s[0:1] 2131 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2132 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2133 // 2134 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2135 // fixups or relocations are emitted to replace $symbol@*@lo and 2136 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2137 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2138 // operand to the global variable. 2139 // 2140 // What we want here is an offset from the value returned by s_getpc 2141 // (which is the address of the s_add_u32 instruction) to the global 2142 // variable, but since the encoding of $symbol starts 4 bytes after the start 2143 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2144 // small. This requires us to add 4 to the global variable offset in order to 2145 // compute the correct address. 2146 2147 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2148 2149 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2150 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2151 2152 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2153 .addDef(PCReg); 2154 2155 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2156 if (GAFlags == SIInstrInfo::MO_NONE) 2157 MIB.addImm(0); 2158 else 2159 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2160 2161 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2162 2163 if (PtrTy.getSizeInBits() == 32) 2164 B.buildExtract(DstReg, PCReg, 0); 2165 return true; 2166 } 2167 2168 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2169 MachineInstr &MI, MachineRegisterInfo &MRI, 2170 MachineIRBuilder &B) const { 2171 Register DstReg = MI.getOperand(0).getReg(); 2172 LLT Ty = MRI.getType(DstReg); 2173 unsigned AS = Ty.getAddressSpace(); 2174 2175 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2176 MachineFunction &MF = B.getMF(); 2177 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2178 2179 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2180 if (!MFI->isEntryFunction()) { 2181 const Function &Fn = MF.getFunction(); 2182 DiagnosticInfoUnsupported BadLDSDecl( 2183 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2184 DS_Warning); 2185 Fn.getContext().diagnose(BadLDSDecl); 2186 2187 // We currently don't have a way to correctly allocate LDS objects that 2188 // aren't directly associated with a kernel. We do force inlining of 2189 // functions that use local objects. However, if these dead functions are 2190 // not eliminated, we don't want a compile time error. Just emit a warning 2191 // and a trap, since there should be no callable path here. 2192 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2193 B.buildUndef(DstReg); 2194 MI.eraseFromParent(); 2195 return true; 2196 } 2197 2198 // TODO: We could emit code to handle the initialization somewhere. 
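    // Only globals without a defined (non-undef) initializer can be given an
    // LDS address here; anything else is diagnosed below.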
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
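  // Keep G_FMAD only when the matching denormal mode is flushing denormals;
  // otherwise expand it with the generic fmul + fadd lowering below.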
2278 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2279 return true; 2280 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2281 return true; 2282 2283 MachineIRBuilder HelperBuilder(MI); 2284 GISelObserverWrapper DummyObserver; 2285 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2286 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2287 } 2288 2289 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2290 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2291 Register DstReg = MI.getOperand(0).getReg(); 2292 Register PtrReg = MI.getOperand(1).getReg(); 2293 Register CmpVal = MI.getOperand(2).getReg(); 2294 Register NewVal = MI.getOperand(3).getReg(); 2295 2296 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2297 "this should not have been custom lowered"); 2298 2299 LLT ValTy = MRI.getType(CmpVal); 2300 LLT VecTy = LLT::vector(2, ValTy); 2301 2302 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2303 2304 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2305 .addDef(DstReg) 2306 .addUse(PtrReg) 2307 .addUse(PackedVal) 2308 .setMemRefs(MI.memoperands()); 2309 2310 MI.eraseFromParent(); 2311 return true; 2312 } 2313 2314 bool AMDGPULegalizerInfo::legalizeFlog( 2315 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2316 Register Dst = MI.getOperand(0).getReg(); 2317 Register Src = MI.getOperand(1).getReg(); 2318 LLT Ty = B.getMRI()->getType(Dst); 2319 unsigned Flags = MI.getFlags(); 2320 2321 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2322 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2323 2324 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2325 MI.eraseFromParent(); 2326 return true; 2327 } 2328 2329 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2330 MachineIRBuilder &B) const { 2331 Register Dst = MI.getOperand(0).getReg(); 2332 Register Src = MI.getOperand(1).getReg(); 2333 unsigned Flags = MI.getFlags(); 2334 LLT Ty = B.getMRI()->getType(Dst); 2335 2336 auto K = B.buildFConstant(Ty, numbers::log2e); 2337 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2338 B.buildFExp2(Dst, Mul, Flags); 2339 MI.eraseFromParent(); 2340 return true; 2341 } 2342 2343 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2344 MachineIRBuilder &B) const { 2345 Register Dst = MI.getOperand(0).getReg(); 2346 Register Src0 = MI.getOperand(1).getReg(); 2347 Register Src1 = MI.getOperand(2).getReg(); 2348 unsigned Flags = MI.getFlags(); 2349 LLT Ty = B.getMRI()->getType(Dst); 2350 const LLT S16 = LLT::scalar(16); 2351 const LLT S32 = LLT::scalar(32); 2352 2353 if (Ty == S32) { 2354 auto Log = B.buildFLog2(S32, Src0, Flags); 2355 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2356 .addUse(Log.getReg(0)) 2357 .addUse(Src1) 2358 .setMIFlags(Flags); 2359 B.buildFExp2(Dst, Mul, Flags); 2360 } else if (Ty == S16) { 2361 // There's no f16 fmul_legacy, so we need to convert for it. 
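    // pow(x, y) is lowered as exp2(y * log2(x)); do the multiply in f32 using
    // fmul_legacy and truncate the result back to f16.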
2362 auto Log = B.buildFLog2(S16, Src0, Flags); 2363 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2364 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2365 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2366 .addUse(Ext0.getReg(0)) 2367 .addUse(Ext1.getReg(0)) 2368 .setMIFlags(Flags); 2369 2370 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2371 } else 2372 return false; 2373 2374 MI.eraseFromParent(); 2375 return true; 2376 } 2377 2378 // Find a source register, ignoring any possible source modifiers. 2379 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2380 Register ModSrc = OrigSrc; 2381 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2382 ModSrc = SrcFNeg->getOperand(1).getReg(); 2383 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2384 ModSrc = SrcFAbs->getOperand(1).getReg(); 2385 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2386 ModSrc = SrcFAbs->getOperand(1).getReg(); 2387 return ModSrc; 2388 } 2389 2390 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2391 MachineRegisterInfo &MRI, 2392 MachineIRBuilder &B) const { 2393 2394 const LLT S1 = LLT::scalar(1); 2395 const LLT S64 = LLT::scalar(64); 2396 Register Dst = MI.getOperand(0).getReg(); 2397 Register OrigSrc = MI.getOperand(1).getReg(); 2398 unsigned Flags = MI.getFlags(); 2399 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2400 "this should not have been custom lowered"); 2401 2402 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2403 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2404 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2405 // V_FRACT bug is: 2406 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2407 // 2408 // Convert floor(x) to (x - fract(x)) 2409 2410 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2411 .addUse(OrigSrc) 2412 .setMIFlags(Flags); 2413 2414 // Give source modifier matching some assistance before obscuring a foldable 2415 // pattern. 2416 2417 // TODO: We can avoid the neg on the fract? The input sign to fract 2418 // shouldn't matter? 2419 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2420 2421 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2422 2423 Register Min = MRI.createGenericVirtualRegister(S64); 2424 2425 // We don't need to concern ourselves with the snan handling difference, so 2426 // use the one which will directly select. 2427 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2428 if (MFI->getMode().IEEE) 2429 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2430 else 2431 B.buildFMinNum(Min, Fract, Const, Flags); 2432 2433 Register CorrectedFract = Min; 2434 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2435 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2436 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2437 } 2438 2439 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2440 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2441 2442 MI.eraseFromParent(); 2443 return true; 2444 } 2445 2446 // Turn an illegal packed v2s16 build vector into bit operations. 2447 // TODO: This should probably be a bitcast action in LegalizerHelper. 
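// The two s16 sources are merged into a single s32 and the result is bitcast
// to v2s16, matching how packed 16-bit values are laid out in a 32-bit
// register.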
2448 bool AMDGPULegalizerInfo::legalizeBuildVector( 2449 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2450 Register Dst = MI.getOperand(0).getReg(); 2451 const LLT S32 = LLT::scalar(32); 2452 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2453 2454 Register Src0 = MI.getOperand(1).getReg(); 2455 Register Src1 = MI.getOperand(2).getReg(); 2456 assert(MRI.getType(Src0) == LLT::scalar(16)); 2457 2458 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2459 B.buildBitcast(Dst, Merge); 2460 2461 MI.eraseFromParent(); 2462 return true; 2463 } 2464 2465 // Return the use branch instruction, otherwise null if the usage is invalid. 2466 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2467 MachineRegisterInfo &MRI, 2468 MachineInstr *&Br, 2469 MachineBasicBlock *&UncondBrTarget) { 2470 Register CondDef = MI.getOperand(0).getReg(); 2471 if (!MRI.hasOneNonDBGUse(CondDef)) 2472 return nullptr; 2473 2474 MachineBasicBlock *Parent = MI.getParent(); 2475 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2476 if (UseMI.getParent() != Parent || 2477 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2478 return nullptr; 2479 2480 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2481 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2482 if (Next == Parent->end()) { 2483 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2484 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2485 return nullptr; 2486 UncondBrTarget = &*NextMBB; 2487 } else { 2488 if (Next->getOpcode() != AMDGPU::G_BR) 2489 return nullptr; 2490 Br = &*Next; 2491 UncondBrTarget = Br->getOperand(0).getMBB(); 2492 } 2493 2494 return &UseMI; 2495 } 2496 2497 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2498 MachineRegisterInfo &MRI, 2499 Register LiveIn, 2500 Register PhyReg) const { 2501 assert(PhyReg.isPhysical() && "Physical register expected"); 2502 2503 // Insert the live-in copy, if required, by defining destination virtual 2504 // register. 2505 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
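  // Only materialize the entry block copy the first time this live-in is
  // requested; later queries reuse the existing definition.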
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the copy that is actually required goes from a virtual register to a
  // physical register (to be inserted later), the live-in copy from the
  // physical register to a virtual register is not needed.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
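    // Arguments packed into a shared register are unpacked with a shift and
    // mask.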
2557 const LLT S32 = LLT::scalar(32); 2558 const unsigned Mask = Arg->getMask(); 2559 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2560 2561 Register AndMaskSrc = LiveIn; 2562 2563 if (Shift != 0) { 2564 auto ShiftAmt = B.buildConstant(S32, Shift); 2565 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2566 } 2567 2568 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2569 } else { 2570 B.buildCopy(DstReg, LiveIn); 2571 } 2572 2573 return true; 2574 } 2575 2576 bool AMDGPULegalizerInfo::loadInputValue( 2577 Register DstReg, MachineIRBuilder &B, 2578 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2579 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2580 const ArgDescriptor *Arg; 2581 const TargetRegisterClass *ArgRC; 2582 LLT ArgTy; 2583 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2584 2585 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2586 return false; // TODO: Handle these 2587 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2588 } 2589 2590 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2591 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2592 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2593 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2594 return false; 2595 2596 MI.eraseFromParent(); 2597 return true; 2598 } 2599 2600 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2601 MachineRegisterInfo &MRI, 2602 MachineIRBuilder &B) const { 2603 Register Dst = MI.getOperand(0).getReg(); 2604 LLT DstTy = MRI.getType(Dst); 2605 LLT S16 = LLT::scalar(16); 2606 LLT S32 = LLT::scalar(32); 2607 LLT S64 = LLT::scalar(64); 2608 2609 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2610 return true; 2611 2612 if (DstTy == S16) 2613 return legalizeFDIV16(MI, MRI, B); 2614 if (DstTy == S32) 2615 return legalizeFDIV32(MI, MRI, B); 2616 if (DstTy == S64) 2617 return legalizeFDIV64(MI, MRI, B); 2618 2619 return false; 2620 } 2621 2622 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2623 Register DstReg, 2624 Register X, 2625 Register Y, 2626 bool IsDiv) const { 2627 const LLT S1 = LLT::scalar(1); 2628 const LLT S32 = LLT::scalar(32); 2629 2630 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2631 // algorithm used here. 2632 2633 // Initial estimate of inv(y). 2634 auto FloatY = B.buildUITOFP(S32, Y); 2635 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2636 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2637 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2638 auto Z = B.buildFPTOUI(S32, ScaledY); 2639 2640 // One round of UNR. 2641 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2642 auto NegYZ = B.buildMul(S32, NegY, Z); 2643 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2644 2645 // Quotient/remainder estimate. 2646 auto Q = B.buildUMulH(S32, X, Z); 2647 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2648 2649 // First quotient/remainder refinement. 2650 auto One = B.buildConstant(S32, 1); 2651 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2652 if (IsDiv) 2653 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2654 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2655 2656 // Second quotient/remainder refinement. 
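  // The estimate may still be one short after the first refinement, so repeat
  // the conditional correction once more.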
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 =
B.buildConstant(S32, 0); 2755 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2756 auto Add2_HiC = 2757 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2758 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2759 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2760 2761 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2762 Register NumerLo = UnmergeNumer.getReg(0); 2763 Register NumerHi = UnmergeNumer.getReg(1); 2764 2765 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2766 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2767 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2768 Register Mul3_Lo = UnmergeMul3.getReg(0); 2769 Register Mul3_Hi = UnmergeMul3.getReg(1); 2770 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2771 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2772 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2773 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2774 2775 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2776 Register DenomLo = UnmergeDenom.getReg(0); 2777 Register DenomHi = UnmergeDenom.getReg(1); 2778 2779 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2780 auto C1 = B.buildSExt(S32, CmpHi); 2781 2782 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2783 auto C2 = B.buildSExt(S32, CmpLo); 2784 2785 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2786 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2787 2788 // TODO: Here and below portions of the code can be enclosed into if/endif. 2789 // Currently control flow is unconditional and we have 4 selects after 2790 // potential endif to substitute PHIs. 2791 2792 // if C3 != 0 ... 2793 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2794 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2795 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2796 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2797 2798 auto One64 = B.buildConstant(S64, 1); 2799 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2800 2801 auto C4 = 2802 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2803 auto C5 = 2804 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2805 auto C6 = B.buildSelect( 2806 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2807 2808 // if (C6 != 0) 2809 auto Add4 = B.buildAdd(S64, Add3, One64); 2810 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2811 2812 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2813 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2814 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2815 2816 // endif C6 2817 // endif C3 2818 2819 if (IsDiv) { 2820 auto Sel1 = B.buildSelect( 2821 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2822 B.buildSelect(DstReg, 2823 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2824 } else { 2825 auto Sel2 = B.buildSelect( 2826 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2827 B.buildSelect(DstReg, 2828 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2829 } 2830 } 2831 2832 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2833 MachineRegisterInfo &MRI, 2834 MachineIRBuilder &B) const { 2835 const LLT S64 = LLT::scalar(64); 2836 const LLT S32 = LLT::scalar(32); 2837 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2838 Register DstReg = MI.getOperand(0).getReg(); 2839 Register Num 
= MI.getOperand(1).getReg(); 2840 Register Den = MI.getOperand(2).getReg(); 2841 LLT Ty = MRI.getType(DstReg); 2842 2843 if (Ty == S32) 2844 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2845 else if (Ty == S64) 2846 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2847 else 2848 return false; 2849 2850 MI.eraseFromParent(); 2851 return true; 2852 2853 } 2854 2855 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2856 MachineRegisterInfo &MRI, 2857 MachineIRBuilder &B) const { 2858 const LLT S64 = LLT::scalar(64); 2859 const LLT S32 = LLT::scalar(32); 2860 2861 Register DstReg = MI.getOperand(0).getReg(); 2862 const LLT Ty = MRI.getType(DstReg); 2863 if (Ty != S32 && Ty != S64) 2864 return false; 2865 2866 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2867 2868 Register LHS = MI.getOperand(1).getReg(); 2869 Register RHS = MI.getOperand(2).getReg(); 2870 2871 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2872 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2873 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2874 2875 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2876 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2877 2878 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2879 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2880 2881 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2882 if (Ty == S32) 2883 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2884 else 2885 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2886 2887 Register Sign; 2888 if (IsDiv) 2889 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2890 else 2891 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2892 2893 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2894 B.buildSub(DstReg, UDivRem, Sign); 2895 2896 MI.eraseFromParent(); 2897 return true; 2898 } 2899 2900 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2901 MachineRegisterInfo &MRI, 2902 MachineIRBuilder &B) const { 2903 Register Res = MI.getOperand(0).getReg(); 2904 Register LHS = MI.getOperand(1).getReg(); 2905 Register RHS = MI.getOperand(2).getReg(); 2906 2907 uint16_t Flags = MI.getFlags(); 2908 2909 LLT ResTy = MRI.getType(Res); 2910 LLT S32 = LLT::scalar(32); 2911 LLT S64 = LLT::scalar(64); 2912 2913 const MachineFunction &MF = B.getMF(); 2914 bool Unsafe = 2915 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2916 2917 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2918 return false; 2919 2920 if (!Unsafe && ResTy == S32 && 2921 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2922 return false; 2923 2924 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2925 // 1 / x -> RCP(x) 2926 if (CLHS->isExactlyValue(1.0)) { 2927 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2928 .addUse(RHS) 2929 .setMIFlags(Flags); 2930 2931 MI.eraseFromParent(); 2932 return true; 2933 } 2934 2935 // -1 / x -> RCP( FNEG(x) ) 2936 if (CLHS->isExactlyValue(-1.0)) { 2937 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2938 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2939 .addUse(FNeg.getReg(0)) 2940 .setMIFlags(Flags); 2941 2942 MI.eraseFromParent(); 2943 return true; 2944 } 2945 } 2946 2947 // x / y -> x * (1.0 / y) 2948 if (Unsafe) { 2949 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2950 .addUse(RHS) 2951 .setMIFlags(Flags); 2952 B.buildFMul(Res, LHS, RCP, Flags); 2953 2954 MI.eraseFromParent(); 2955 return true; 2956 } 2957 2958 return false; 2959 } 2960 2961 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2962 MachineRegisterInfo &MRI, 2963 MachineIRBuilder &B) const { 2964 Register Res = MI.getOperand(0).getReg(); 2965 Register LHS = MI.getOperand(1).getReg(); 2966 Register RHS = MI.getOperand(2).getReg(); 2967 2968 uint16_t Flags = MI.getFlags(); 2969 2970 LLT S16 = LLT::scalar(16); 2971 LLT S32 = LLT::scalar(32); 2972 2973 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2974 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2975 2976 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2977 .addUse(RHSExt.getReg(0)) 2978 .setMIFlags(Flags); 2979 2980 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2981 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2982 2983 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2984 .addUse(RDst.getReg(0)) 2985 .addUse(RHS) 2986 .addUse(LHS) 2987 .setMIFlags(Flags); 2988 2989 MI.eraseFromParent(); 2990 return true; 2991 } 2992 2993 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2994 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2995 static void toggleSPDenormMode(bool Enable, 2996 MachineIRBuilder &B, 2997 const GCNSubtarget &ST, 2998 AMDGPU::SIModeRegisterDefaults Mode) { 2999 // Set SP denorm mode to this value. 3000 unsigned SPDenormMode = 3001 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3002 3003 if (ST.hasDenormModeInst()) { 3004 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3005 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3006 3007 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3008 B.buildInstr(AMDGPU::S_DENORM_MODE) 3009 .addImm(NewDenormModeValue); 3010 3011 } else { 3012 // Select FP32 bit field in mode register. 3013 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3014 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3015 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3016 3017 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3018 .addImm(SPDenormMode) 3019 .addImm(SPDenormModeBitField); 3020 } 3021 } 3022 3023 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3024 MachineRegisterInfo &MRI, 3025 MachineIRBuilder &B) const { 3026 Register Res = MI.getOperand(0).getReg(); 3027 Register LHS = MI.getOperand(1).getReg(); 3028 Register RHS = MI.getOperand(2).getReg(); 3029 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3030 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3031 3032 uint16_t Flags = MI.getFlags(); 3033 3034 LLT S32 = LLT::scalar(32); 3035 LLT S1 = LLT::scalar(1); 3036 3037 auto One = B.buildFConstant(S32, 1.0f); 3038 3039 auto DenominatorScaled = 3040 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3041 .addUse(LHS) 3042 .addUse(RHS) 3043 .addImm(0) 3044 .setMIFlags(Flags); 3045 auto NumeratorScaled = 3046 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3047 .addUse(LHS) 3048 .addUse(RHS) 3049 .addImm(1) 3050 .setMIFlags(Flags); 3051 3052 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3053 .addUse(DenominatorScaled.getReg(0)) 3054 .setMIFlags(Flags); 3055 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3056 3057 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3058 // aren't modeled as reading it. 
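  // The scaled intermediate values may be denormal, so temporarily enable FP32
  // denormals around the refinement sequence when the function's default mode
  // flushes them.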
3059 if (!Mode.allFP32Denormals()) 3060 toggleSPDenormMode(true, B, ST, Mode); 3061 3062 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3063 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3064 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3065 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3066 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3067 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3068 3069 if (!Mode.allFP32Denormals()) 3070 toggleSPDenormMode(false, B, ST, Mode); 3071 3072 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3073 .addUse(Fma4.getReg(0)) 3074 .addUse(Fma1.getReg(0)) 3075 .addUse(Fma3.getReg(0)) 3076 .addUse(NumeratorScaled.getReg(1)) 3077 .setMIFlags(Flags); 3078 3079 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3080 .addUse(Fmas.getReg(0)) 3081 .addUse(RHS) 3082 .addUse(LHS) 3083 .setMIFlags(Flags); 3084 3085 MI.eraseFromParent(); 3086 return true; 3087 } 3088 3089 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3090 MachineRegisterInfo &MRI, 3091 MachineIRBuilder &B) const { 3092 Register Res = MI.getOperand(0).getReg(); 3093 Register LHS = MI.getOperand(1).getReg(); 3094 Register RHS = MI.getOperand(2).getReg(); 3095 3096 uint16_t Flags = MI.getFlags(); 3097 3098 LLT S64 = LLT::scalar(64); 3099 LLT S1 = LLT::scalar(1); 3100 3101 auto One = B.buildFConstant(S64, 1.0); 3102 3103 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3104 .addUse(LHS) 3105 .addUse(RHS) 3106 .addImm(0) 3107 .setMIFlags(Flags); 3108 3109 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3110 3111 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3112 .addUse(DivScale0.getReg(0)) 3113 .setMIFlags(Flags); 3114 3115 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3116 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3117 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3118 3119 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3120 .addUse(LHS) 3121 .addUse(RHS) 3122 .addImm(1) 3123 .setMIFlags(Flags); 3124 3125 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3126 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3127 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3128 3129 Register Scale; 3130 if (!ST.hasUsableDivScaleConditionOutput()) { 3131 // Workaround a hardware bug on SI where the condition output from div_scale 3132 // is not usable. 
3133 3134 LLT S32 = LLT::scalar(32); 3135 3136 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3137 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3138 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3139 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3140 3141 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3142 Scale1Unmerge.getReg(1)); 3143 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3144 Scale0Unmerge.getReg(1)); 3145 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3146 } else { 3147 Scale = DivScale1.getReg(1); 3148 } 3149 3150 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3151 .addUse(Fma4.getReg(0)) 3152 .addUse(Fma3.getReg(0)) 3153 .addUse(Mul.getReg(0)) 3154 .addUse(Scale) 3155 .setMIFlags(Flags); 3156 3157 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3158 .addUse(Fmas.getReg(0)) 3159 .addUse(RHS) 3160 .addUse(LHS) 3161 .setMIFlags(Flags); 3162 3163 MI.eraseFromParent(); 3164 return true; 3165 } 3166 3167 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3168 MachineRegisterInfo &MRI, 3169 MachineIRBuilder &B) const { 3170 Register Res = MI.getOperand(0).getReg(); 3171 Register LHS = MI.getOperand(2).getReg(); 3172 Register RHS = MI.getOperand(3).getReg(); 3173 uint16_t Flags = MI.getFlags(); 3174 3175 LLT S32 = LLT::scalar(32); 3176 LLT S1 = LLT::scalar(1); 3177 3178 auto Abs = B.buildFAbs(S32, RHS, Flags); 3179 const APFloat C0Val(1.0f); 3180 3181 auto C0 = B.buildConstant(S32, 0x6f800000); 3182 auto C1 = B.buildConstant(S32, 0x2f800000); 3183 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3184 3185 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3186 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3187 3188 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3189 3190 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3191 .addUse(Mul0.getReg(0)) 3192 .setMIFlags(Flags); 3193 3194 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3195 3196 B.buildFMul(Res, Sel, Mul1, Flags); 3197 3198 MI.eraseFromParent(); 3199 return true; 3200 } 3201 3202 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3203 MachineRegisterInfo &MRI, 3204 MachineIRBuilder &B) const { 3205 uint64_t Offset = 3206 ST.getTargetLowering()->getImplicitParameterOffset( 3207 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3208 LLT DstTy = MRI.getType(DstReg); 3209 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3210 3211 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3212 if (!loadInputValue(KernargPtrReg, B, 3213 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3214 return false; 3215 3216 // FIXME: This should be nuw 3217 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3218 return true; 3219 } 3220 3221 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3222 MachineRegisterInfo &MRI, 3223 MachineIRBuilder &B) const { 3224 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3225 if (!MFI->isEntryFunction()) { 3226 return legalizePreloadedArgIntrin(MI, MRI, B, 3227 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3228 } 3229 3230 Register DstReg = MI.getOperand(0).getReg(); 3231 if (!getImplicitArgPtr(DstReg, MRI, B)) 3232 return false; 3233 3234 MI.eraseFromParent(); 3235 return true; 3236 } 3237 3238 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3239 MachineRegisterInfo &MRI, 3240 MachineIRBuilder &B, 3241 unsigned AddrSpace) const { 3242 
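  // A flat pointer is in the queried segment exactly when its high 32 bits
  // match that segment's aperture base.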
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3243 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3244 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3245 MI.eraseFromParent(); 3246 return true; 3247 } 3248 3249 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3250 // offset (the offset that is included in bounds checking and swizzling, to be 3251 // split between the instruction's voffset and immoffset fields) and soffset 3252 // (the offset that is excluded from bounds checking and swizzling, to go in 3253 // the instruction's soffset field). This function takes the first kind of 3254 // offset and figures out how to split it between voffset and immoffset. 3255 std::tuple<Register, unsigned, unsigned> 3256 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3257 Register OrigOffset) const { 3258 const unsigned MaxImm = 4095; 3259 Register BaseReg; 3260 unsigned TotalConstOffset; 3261 MachineInstr *OffsetDef; 3262 const LLT S32 = LLT::scalar(32); 3263 3264 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3265 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3266 3267 unsigned ImmOffset = TotalConstOffset; 3268 3269 // If the immediate value is too big for the immoffset field, put the value 3270 // and -4096 into the immoffset field so that the value that is copied/added 3271 // for the voffset field is a multiple of 4096, and it stands more chance 3272 // of being CSEd with the copy/add for another similar load/store. 3273 // However, do not do that rounding down to a multiple of 4096 if that is a 3274 // negative number, as it appears to be illegal to have a negative offset 3275 // in the vgpr, even if adding the immediate offset makes it positive. 3276 unsigned Overflow = ImmOffset & ~MaxImm; 3277 ImmOffset -= Overflow; 3278 if ((int32_t)Overflow < 0) { 3279 Overflow += ImmOffset; 3280 ImmOffset = 0; 3281 } 3282 3283 if (Overflow != 0) { 3284 if (!BaseReg) { 3285 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3286 } else { 3287 auto OverflowVal = B.buildConstant(S32, Overflow); 3288 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3289 } 3290 } 3291 3292 if (!BaseReg) 3293 BaseReg = B.buildConstant(S32, 0).getReg(0); 3294 3295 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3296 } 3297 3298 /// Handle register layout difference for f16 images for some subtargets. 3299 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3300 MachineRegisterInfo &MRI, 3301 Register Reg) const { 3302 if (!ST.hasUnpackedD16VMem()) 3303 return Reg; 3304 3305 const LLT S16 = LLT::scalar(16); 3306 const LLT S32 = LLT::scalar(32); 3307 LLT StoreVT = MRI.getType(Reg); 3308 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3309 3310 auto Unmerge = B.buildUnmerge(S16, Reg); 3311 3312 SmallVector<Register, 4> WideRegs; 3313 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3314 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3315 3316 int NumElts = StoreVT.getNumElements(); 3317 3318 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3319 } 3320 3321 Register AMDGPULegalizerInfo::fixStoreSourceType( 3322 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3323 MachineRegisterInfo *MRI = B.getMRI(); 3324 LLT Ty = MRI->getType(VData); 3325 3326 const LLT S16 = LLT::scalar(16); 3327 3328 // Fixup illegal register types for i8 stores. 
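  // Sub-dword scalars are widened with an any-extend; the number of bytes
  // actually stored still comes from the memory operand.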
3329 if (Ty == LLT::scalar(8) || Ty == S16) { 3330 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3331 return AnyExt; 3332 } 3333 3334 if (Ty.isVector()) { 3335 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3336 if (IsFormat) 3337 return handleD16VData(B, *MRI, VData); 3338 } 3339 } 3340 3341 return VData; 3342 } 3343 3344 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3345 MachineRegisterInfo &MRI, 3346 MachineIRBuilder &B, 3347 bool IsTyped, 3348 bool IsFormat) const { 3349 Register VData = MI.getOperand(1).getReg(); 3350 LLT Ty = MRI.getType(VData); 3351 LLT EltTy = Ty.getScalarType(); 3352 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3353 const LLT S32 = LLT::scalar(32); 3354 3355 VData = fixStoreSourceType(B, VData, IsFormat); 3356 Register RSrc = MI.getOperand(2).getReg(); 3357 3358 MachineMemOperand *MMO = *MI.memoperands_begin(); 3359 const int MemSize = MMO->getSize(); 3360 3361 unsigned ImmOffset; 3362 unsigned TotalOffset; 3363 3364 // The typed intrinsics add an immediate after the registers. 3365 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3366 3367 // The struct intrinsic variants add one additional operand over raw. 3368 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3369 Register VIndex; 3370 int OpOffset = 0; 3371 if (HasVIndex) { 3372 VIndex = MI.getOperand(3).getReg(); 3373 OpOffset = 1; 3374 } 3375 3376 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3377 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3378 3379 unsigned Format = 0; 3380 if (IsTyped) { 3381 Format = MI.getOperand(5 + OpOffset).getImm(); 3382 ++OpOffset; 3383 } 3384 3385 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3386 3387 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3388 if (TotalOffset != 0) 3389 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3390 3391 unsigned Opc; 3392 if (IsTyped) { 3393 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3394 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3395 } else if (IsFormat) { 3396 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3397 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3398 } else { 3399 switch (MemSize) { 3400 case 1: 3401 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3402 break; 3403 case 2: 3404 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3405 break; 3406 default: 3407 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3408 break; 3409 } 3410 } 3411 3412 if (!VIndex) 3413 VIndex = B.buildConstant(S32, 0).getReg(0); 3414 3415 auto MIB = B.buildInstr(Opc) 3416 .addUse(VData) // vdata 3417 .addUse(RSrc) // rsrc 3418 .addUse(VIndex) // vindex 3419 .addUse(VOffset) // voffset 3420 .addUse(SOffset) // soffset 3421 .addImm(ImmOffset); // offset(imm) 3422 3423 if (IsTyped) 3424 MIB.addImm(Format); 3425 3426 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3427 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3428 .addMemOperand(MMO); 3429 3430 MI.eraseFromParent(); 3431 return true; 3432 } 3433 3434 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3435 MachineRegisterInfo &MRI, 3436 MachineIRBuilder &B, 3437 bool IsFormat, 3438 bool IsTyped) const { 3439 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
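  // Operand layout after the intrinsic ID: rsrc, [vindex for the struct
  // variants], voffset, soffset, [format for the typed variants], aux. The
  // operand indexing below follows this layout.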
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)  // vdata
    .addUse(RSrc)        // rsrc
    .addUse(VIndex)      // vindex
    .addUse(VOffset)     // voffset
    .addUse(SOffset)     // soffset
    .addImm(ImmOffset);  // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0)  // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The load result was widened; narrow or repack it back to the original
    // result type.
3537 if (IsExtLoad) 3538 B.buildTrunc(Dst, LoadDstReg); 3539 else { 3540 // Repack to original 16-bit vector result 3541 // FIXME: G_TRUNC should work, but legalization currently fails 3542 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3543 SmallVector<Register, 4> Repack; 3544 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3545 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3546 B.buildMerge(Dst, Repack); 3547 } 3548 } 3549 3550 MI.eraseFromParent(); 3551 return true; 3552 } 3553 3554 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3555 MachineIRBuilder &B, 3556 bool IsInc) const { 3557 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3558 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3559 B.buildInstr(Opc) 3560 .addDef(MI.getOperand(0).getReg()) 3561 .addUse(MI.getOperand(2).getReg()) 3562 .addUse(MI.getOperand(3).getReg()) 3563 .cloneMemRefs(MI); 3564 MI.eraseFromParent(); 3565 return true; 3566 } 3567 3568 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3569 switch (IntrID) { 3570 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3571 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3572 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3573 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3574 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3575 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3576 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3577 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3578 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3579 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3580 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3581 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3582 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3583 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3584 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3585 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3586 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3587 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3588 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3589 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3590 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3591 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3592 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3593 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3594 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3595 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3596 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3597 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3598 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3599 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3600 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3601 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3602 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3603 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3604 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3605 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3606 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3607 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3608 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3609 default: 3610 llvm_unreachable("unhandled atomic opcode"); 3611 } 3612 } 3613 3614 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3615 MachineIRBuilder &B, 3616 Intrinsic::ID IID) const { 3617 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3618 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3619 3620 Register Dst = MI.getOperand(0).getReg(); 3621 Register VData = MI.getOperand(2).getReg(); 3622 3623 Register CmpVal; 
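  // The cmpswap variants carry an extra compare operand immediately after
  // vdata, which shifts every later operand index by one; OpOffset tracks
  // that adjustment.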
3624 int OpOffset = 0; 3625 3626 if (IsCmpSwap) { 3627 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3628 ++OpOffset; 3629 } 3630 3631 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3632 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3633 3634 // The struct intrinsic variants add one additional operand over raw. 3635 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3636 Register VIndex; 3637 if (HasVIndex) { 3638 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3639 ++OpOffset; 3640 } 3641 3642 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3643 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3644 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3645 3646 MachineMemOperand *MMO = *MI.memoperands_begin(); 3647 3648 unsigned ImmOffset; 3649 unsigned TotalOffset; 3650 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3651 if (TotalOffset != 0) 3652 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3653 3654 if (!VIndex) 3655 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3656 3657 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3658 .addDef(Dst) 3659 .addUse(VData); // vdata 3660 3661 if (IsCmpSwap) 3662 MIB.addReg(CmpVal); 3663 3664 MIB.addUse(RSrc) // rsrc 3665 .addUse(VIndex) // vindex 3666 .addUse(VOffset) // voffset 3667 .addUse(SOffset) // soffset 3668 .addImm(ImmOffset) // offset(imm) 3669 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3670 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3671 .addMemOperand(MMO); 3672 3673 MI.eraseFromParent(); 3674 return true; 3675 } 3676 3677 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3678 /// vector with s16 typed elements. 3679 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3680 SmallVectorImpl<Register> &PackedAddrs, 3681 int AddrIdx, int DimIdx, int EndIdx, 3682 int NumGradients) { 3683 const LLT S16 = LLT::scalar(16); 3684 const LLT V2S16 = LLT::vector(2, 16); 3685 3686 for (int I = AddrIdx; I < EndIdx; ++I) { 3687 MachineOperand &SrcOp = MI.getOperand(I); 3688 if (!SrcOp.isReg()) 3689 continue; // _L to _LZ may have eliminated this. 3690 3691 Register AddrReg = SrcOp.getReg(); 3692 3693 if (I < DimIdx) { 3694 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3695 PackedAddrs.push_back(AddrReg); 3696 } else { 3697 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3698 // derivatives dx/dh and dx/dv are packed with undef. 3699 if (((I + 1) >= EndIdx) || 3700 ((NumGradients / 2) % 2 == 1 && 3701 (I == DimIdx + (NumGradients / 2) - 1 || 3702 I == DimIdx + NumGradients - 1)) || 3703 // Check for _L to _LZ optimization 3704 !MI.getOperand(I + 1).isReg()) { 3705 PackedAddrs.push_back( 3706 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3707 .getReg(0)); 3708 } else { 3709 PackedAddrs.push_back( 3710 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3711 .getReg(0)); 3712 ++I; 3713 } 3714 } 3715 } 3716 } 3717 3718 /// Convert from separate vaddr components to a single vector address register, 3719 /// and replace the remaining operands with $noreg. 
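/// For example, three s32 address components are combined into a single
/// <3 x s32> build_vector placed in the first vaddr slot, and the remaining
/// two vaddr operands are replaced with $noreg.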
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Check for 16-bit addresses and pack them if present.
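  // The gradient operands (if any) start at DimIdx and are followed by the
  // coordinate operands; A16 means the coordinates are 16-bit, G16 means the
  // gradients are 16-bit.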
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // An error flag is still expected when TFE is on even though dmask is 0.
  // Force dmask to be at least 1; otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
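  // With 16-bit addressing, pairs of components are packed into v2s16
  // registers, e.g. a 2D sample's (u, v) coordinates become a single v2s16;
  // an odd trailing component is padded with undef.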
3894 if (IsA16 || IsG16) { 3895 if (IsA16) { 3896 // Target must support the feature and gradients need to be 16 bit too 3897 if (!ST.hasA16() || !IsG16) 3898 return false; 3899 } else if (!ST.hasG16()) 3900 return false; 3901 3902 if (NumVAddrs > 1) { 3903 SmallVector<Register, 4> PackedRegs; 3904 // Don't compress addresses for G16 3905 const int PackEndIdx = 3906 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3907 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3908 PackEndIdx, NumGradients); 3909 3910 if (!IsA16) { 3911 // Add uncompressed address 3912 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3913 int AddrReg = MI.getOperand(I).getReg(); 3914 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3915 PackedRegs.push_back(AddrReg); 3916 } 3917 } 3918 3919 // See also below in the non-a16 branch 3920 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3921 3922 if (!UseNSA && PackedRegs.size() > 1) { 3923 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3924 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3925 PackedRegs[0] = Concat.getReg(0); 3926 PackedRegs.resize(1); 3927 } 3928 3929 const int NumPacked = PackedRegs.size(); 3930 for (int I = 0; I != NumVAddrs; ++I) { 3931 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3932 if (!SrcOp.isReg()) { 3933 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3934 continue; 3935 } 3936 3937 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3938 3939 if (I < NumPacked) 3940 SrcOp.setReg(PackedRegs[I]); 3941 else 3942 SrcOp.setReg(AMDGPU::NoRegister); 3943 } 3944 } 3945 } else { 3946 // If the register allocator cannot place the address registers contiguously 3947 // without introducing moves, then using the non-sequential address encoding 3948 // is always preferable, since it saves VALU instructions and is usually a 3949 // wash in terms of code size or even better. 3950 // 3951 // However, we currently have no way of hinting to the register allocator 3952 // that MIMG addresses should be placed contiguously when it is possible to 3953 // do so, so force non-NSA for the common 2-address case as a heuristic. 3954 // 3955 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3956 // allocation when possible. 3957 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3958 3959 if (!UseNSA && NumVAddrs > 1) 3960 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3961 } 3962 3963 int Flags = 0; 3964 if (IsA16) 3965 Flags |= 1; 3966 if (IsG16) 3967 Flags |= 2; 3968 MI.addOperand(MachineOperand::CreateImm(Flags)); 3969 3970 if (BaseOpcode->Store) { // No TFE for stores? 3971 // TODO: Handle dmask trim 3972 Register VData = MI.getOperand(1).getReg(); 3973 LLT Ty = MRI->getType(VData); 3974 if (!Ty.isVector() || Ty.getElementType() != S16) 3975 return true; 3976 3977 Register RepackedReg = handleD16VData(B, *MRI, VData); 3978 if (RepackedReg != VData) { 3979 MI.getOperand(1).setReg(RepackedReg); 3980 } 3981 3982 return true; 3983 } 3984 3985 Register DstReg = MI.getOperand(0).getReg(); 3986 LLT Ty = MRI->getType(DstReg); 3987 const LLT EltTy = Ty.getScalarType(); 3988 const bool IsD16 = Ty.getScalarType() == S16; 3989 const int NumElts = Ty.isVector() ? 
    Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus the TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
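    // For example, a TFE d16 load of <4 x s16> on a packed subtarget uses a
    // <3 x s32> result: the first two s32 pieces hold the packed data and the
    // final piece is the status dword, which the unmerge above already wrote
    // into Dst1Reg.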
4081 if (IsTFE) 4082 ResultRegs.resize(NumDataRegs); 4083 } 4084 4085 // For an s16 scalar result, we form an s32 result with a truncate regardless 4086 // of packed vs. unpacked. 4087 if (IsD16 && !Ty.isVector()) { 4088 B.buildTrunc(DstReg, ResultRegs[0]); 4089 return true; 4090 } 4091 4092 // Avoid a build/concat_vector of 1 entry. 4093 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4094 B.buildBitcast(DstReg, ResultRegs[0]); 4095 return true; 4096 } 4097 4098 assert(Ty.isVector()); 4099 4100 if (IsD16) { 4101 // For packed D16 results with TFE enabled, all the data components are 4102 // S32. Cast back to the expected type. 4103 // 4104 // TODO: We don't really need to use load s32 elements. We would only need one 4105 // cast for the TFE result if a multiple of v2s16 was used. 4106 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4107 for (Register &Reg : ResultRegs) 4108 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4109 } else if (ST.hasUnpackedD16VMem()) { 4110 for (Register &Reg : ResultRegs) 4111 Reg = B.buildTrunc(S16, Reg).getReg(0); 4112 } 4113 } 4114 4115 auto padWithUndef = [&](LLT Ty, int NumElts) { 4116 if (NumElts == 0) 4117 return; 4118 Register Undef = B.buildUndef(Ty).getReg(0); 4119 for (int I = 0; I != NumElts; ++I) 4120 ResultRegs.push_back(Undef); 4121 }; 4122 4123 // Pad out any elements eliminated due to the dmask. 4124 LLT ResTy = MRI->getType(ResultRegs[0]); 4125 if (!ResTy.isVector()) { 4126 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4127 B.buildBuildVector(DstReg, ResultRegs); 4128 return true; 4129 } 4130 4131 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4132 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4133 4134 // Deal with the one annoying legal case. 4135 const LLT V3S16 = LLT::vector(3, 16); 4136 if (Ty == V3S16) { 4137 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4138 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4139 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4140 return true; 4141 } 4142 4143 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4144 B.buildConcatVectors(DstReg, ResultRegs); 4145 return true; 4146 } 4147 4148 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4149 LegalizerHelper &Helper, MachineInstr &MI) const { 4150 MachineIRBuilder &B = Helper.MIRBuilder; 4151 GISelChangeObserver &Observer = Helper.Observer; 4152 4153 Register Dst = MI.getOperand(0).getReg(); 4154 LLT Ty = B.getMRI()->getType(Dst); 4155 unsigned Size = Ty.getSizeInBits(); 4156 MachineFunction &MF = B.getMF(); 4157 4158 Observer.changingInstr(MI); 4159 4160 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4161 Ty = getBitcastRegisterType(Ty); 4162 Helper.bitcastDst(MI, Ty, 0); 4163 Dst = MI.getOperand(0).getReg(); 4164 B.setInsertPt(B.getMBB(), MI); 4165 } 4166 4167 // FIXME: We don't really need this intermediate instruction. The intrinsic 4168 // should be fixed to have a memory operand. Since it's readnone, we're not 4169 // allowed to add one. 4170 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4171 MI.RemoveOperand(1); // Remove intrinsic ID 4172 4173 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4174 // TODO: Should this use datalayout alignment? 
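  // Synthesize an invariant, dereferenceable load MMO for the constant buffer
  // read, rounding the result size up to whole bytes and assuming 4-byte
  // alignment.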
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
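  // amdgcn.if/else/loop are paired with a G_BRCOND on their boolean result;
  // the pair is rewritten into SI_IF/SI_ELSE/SI_LOOP pseudos operating on the
  // exec mask, with the branch targets swapped accordingly.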
4251 auto IntrID = MI.getIntrinsicID(); 4252 switch (IntrID) { 4253 case Intrinsic::amdgcn_if: 4254 case Intrinsic::amdgcn_else: { 4255 MachineInstr *Br = nullptr; 4256 MachineBasicBlock *UncondBrTarget = nullptr; 4257 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4258 const SIRegisterInfo *TRI 4259 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4260 4261 Register Def = MI.getOperand(1).getReg(); 4262 Register Use = MI.getOperand(3).getReg(); 4263 4264 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4265 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4266 if (IntrID == Intrinsic::amdgcn_if) { 4267 B.buildInstr(AMDGPU::SI_IF) 4268 .addDef(Def) 4269 .addUse(Use) 4270 .addMBB(UncondBrTarget); 4271 } else { 4272 B.buildInstr(AMDGPU::SI_ELSE) 4273 .addDef(Def) 4274 .addUse(Use) 4275 .addMBB(UncondBrTarget) 4276 .addImm(0); 4277 } 4278 4279 if (Br) { 4280 Br->getOperand(0).setMBB(CondBrTarget); 4281 } else { 4282 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4283 // since we're swapping branch targets it needs to be reinserted. 4284 // FIXME: IRTranslator should probably not do this 4285 B.buildBr(*CondBrTarget); 4286 } 4287 4288 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4289 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4290 MI.eraseFromParent(); 4291 BrCond->eraseFromParent(); 4292 return true; 4293 } 4294 4295 return false; 4296 } 4297 case Intrinsic::amdgcn_loop: { 4298 MachineInstr *Br = nullptr; 4299 MachineBasicBlock *UncondBrTarget = nullptr; 4300 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4301 const SIRegisterInfo *TRI 4302 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4303 4304 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4305 Register Reg = MI.getOperand(2).getReg(); 4306 4307 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4308 B.buildInstr(AMDGPU::SI_LOOP) 4309 .addUse(Reg) 4310 .addMBB(UncondBrTarget); 4311 4312 if (Br) 4313 Br->getOperand(0).setMBB(CondBrTarget); 4314 else 4315 B.buildBr(*CondBrTarget); 4316 4317 MI.eraseFromParent(); 4318 BrCond->eraseFromParent(); 4319 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4320 return true; 4321 } 4322 4323 return false; 4324 } 4325 case Intrinsic::amdgcn_kernarg_segment_ptr: 4326 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4327 // This only makes sense to call in a kernel, so just lower to null. 
4328 B.buildConstant(MI.getOperand(0).getReg(), 0); 4329 MI.eraseFromParent(); 4330 return true; 4331 } 4332 4333 return legalizePreloadedArgIntrin( 4334 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4335 case Intrinsic::amdgcn_implicitarg_ptr: 4336 return legalizeImplicitArgPtr(MI, MRI, B); 4337 case Intrinsic::amdgcn_workitem_id_x: 4338 return legalizePreloadedArgIntrin(MI, MRI, B, 4339 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4340 case Intrinsic::amdgcn_workitem_id_y: 4341 return legalizePreloadedArgIntrin(MI, MRI, B, 4342 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4343 case Intrinsic::amdgcn_workitem_id_z: 4344 return legalizePreloadedArgIntrin(MI, MRI, B, 4345 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4346 case Intrinsic::amdgcn_workgroup_id_x: 4347 return legalizePreloadedArgIntrin(MI, MRI, B, 4348 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4349 case Intrinsic::amdgcn_workgroup_id_y: 4350 return legalizePreloadedArgIntrin(MI, MRI, B, 4351 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4352 case Intrinsic::amdgcn_workgroup_id_z: 4353 return legalizePreloadedArgIntrin(MI, MRI, B, 4354 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4355 case Intrinsic::amdgcn_dispatch_ptr: 4356 return legalizePreloadedArgIntrin(MI, MRI, B, 4357 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4358 case Intrinsic::amdgcn_queue_ptr: 4359 return legalizePreloadedArgIntrin(MI, MRI, B, 4360 AMDGPUFunctionArgInfo::QUEUE_PTR); 4361 case Intrinsic::amdgcn_implicit_buffer_ptr: 4362 return legalizePreloadedArgIntrin( 4363 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4364 case Intrinsic::amdgcn_dispatch_id: 4365 return legalizePreloadedArgIntrin(MI, MRI, B, 4366 AMDGPUFunctionArgInfo::DISPATCH_ID); 4367 case Intrinsic::amdgcn_fdiv_fast: 4368 return legalizeFDIVFastIntrin(MI, MRI, B); 4369 case Intrinsic::amdgcn_is_shared: 4370 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4371 case Intrinsic::amdgcn_is_private: 4372 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4373 case Intrinsic::amdgcn_wavefrontsize: { 4374 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4375 MI.eraseFromParent(); 4376 return true; 4377 } 4378 case Intrinsic::amdgcn_s_buffer_load: 4379 return legalizeSBufferLoad(Helper, MI); 4380 case Intrinsic::amdgcn_raw_buffer_store: 4381 case Intrinsic::amdgcn_struct_buffer_store: 4382 return legalizeBufferStore(MI, MRI, B, false, false); 4383 case Intrinsic::amdgcn_raw_buffer_store_format: 4384 case Intrinsic::amdgcn_struct_buffer_store_format: 4385 return legalizeBufferStore(MI, MRI, B, false, true); 4386 case Intrinsic::amdgcn_raw_tbuffer_store: 4387 case Intrinsic::amdgcn_struct_tbuffer_store: 4388 return legalizeBufferStore(MI, MRI, B, true, true); 4389 case Intrinsic::amdgcn_raw_buffer_load: 4390 case Intrinsic::amdgcn_struct_buffer_load: 4391 return legalizeBufferLoad(MI, MRI, B, false, false); 4392 case Intrinsic::amdgcn_raw_buffer_load_format: 4393 case Intrinsic::amdgcn_struct_buffer_load_format: 4394 return legalizeBufferLoad(MI, MRI, B, true, false); 4395 case Intrinsic::amdgcn_raw_tbuffer_load: 4396 case Intrinsic::amdgcn_struct_tbuffer_load: 4397 return legalizeBufferLoad(MI, MRI, B, true, true); 4398 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4399 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4400 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4401 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4402 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4403 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4404 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4405 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4406 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4407 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4408 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4409 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4410 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4411 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4412 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4413 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4414 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4415 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4416 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4417 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4418 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4419 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4420 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4421 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4422 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4423 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4424 return legalizeBufferAtomic(MI, B, IntrID); 4425 case Intrinsic::amdgcn_atomic_inc: 4426 return legalizeAtomicIncDec(MI, B, true); 4427 case Intrinsic::amdgcn_atomic_dec: 4428 return legalizeAtomicIncDec(MI, B, false); 4429 case Intrinsic::trap: 4430 return legalizeTrapIntrinsic(MI, MRI, B); 4431 case Intrinsic::debugtrap: 4432 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4433 default: { 4434 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4435 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4436 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4437 return true; 4438 } 4439 } 4440 4441 return true; 4442 } 4443