//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
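
// Illustrative behavior of the helpers above (examples for clarity, not
// exhaustive):
//   getPow2VectorType(<3 x s32>) -> <4 x s32>
//   getPow2ScalarType(s20)       -> s32
//   isSmallOddVector matches e.g. <3 x s16> (odd count of sub-32-bit elements
//   whose total size is not a multiple of 32), and oneMoreElement would then
//   turn <3 x s16> into <4 x s16>.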

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32
// bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}
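
// Illustrative classifications (examples for clarity):
//   s96        - 96 % 32 == 0, so it is a register type
//   <4 x s16>  - 64 bits with an even number of 16-bit elements: register type
//   <3 x s8>   - 24 bits, not a multiple of 32: not a register type
//   getBitcastRegisterType(<4 x s8>)  -> s32
//   getBitcastRegisterType(<8 x s16>) -> <4 x s32>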

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
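
// For reference, the limits above mean, for example, that a 256-bit global
// load can be left whole (the load limit is 512 bits) while a 256-bit global
// store must be broken into 128-bit pieces, LDS accesses are capped at 64
// bits unless the subtarget supports 128-bit DS operations, and private
// accesses are split down to 32 bits.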

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
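
// Illustrative example of how these predicates interact (with the default
// EnableNewLegality == false): a 96-bit <6 x s16> access is a register type,
// but the current selector cannot handle it directly, so
// loadStoreBitcastWorkaround() fires and the access is instead bitcast to
// <3 x s32> by bitcastToRegisterType(). Extending loads are only accepted by
// isLoadStoreSizeLegal() for 8-bit and 16-bit memory sizes into 32-bit
// results; otherwise the register size must match the memory size.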

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };
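
  // Example of the predicate above (illustrative): a <3 x s32> global load
  // that is 128-bit aligned on a subtarget without dwordx3 load/stores has a
  // non-power-of-2 size (96), stays under the 512-bit global load limit, and
  // its alignment covers the rounded-up size, so it may be widened to
  // <4 x s32> by the moreElementsIf() rule below.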

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }
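
  // Two illustrative breakdowns implied by the rules above (assuming a
  // subtarget without dwordx3 load/stores and without 128-bit DS operations):
  //   - an s96 global load is narrowed to s64, and the s32 remainder is
  //     re-legalized on a later iteration.
  //   - a <4 x s32> local (LDS) store exceeds the 64-bit limit and is split
  //     into <2 x s32> pieces.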

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
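
  // Examples of how the shift amount is adjusted (illustrative): on a
  // subtarget with 16-bit instructions, an s16 shift by an s8 amount widens
  // the amount to s16, while an s64 shift always clamps the amount to s32.
  // Without 16-bit instructions, both the value and the amount are first
  // widened to at least s32.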

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
                 bitcastToVectorElement32(1))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      // TODO: Clamp the number of elements before resorting to stack lowering.
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
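
  // The widening rule above picks the smaller of the next power of 2 and the
  // next multiple of 64 once past 128 bits; for example (illustrative), an
  // s65 big type is widened to s128, while s257 is widened to s320 rather
  // than s512.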

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
1730 B.buildExtract(Dst, Src, 0); 1731 MI.eraseFromParent(); 1732 return true; 1733 } 1734 1735 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1736 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1737 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1738 1739 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1740 // another. Merge operands are required to be the same type, but creating an 1741 // extra ptrtoint would be kind of pointless. 1742 auto HighAddr = B.buildConstant( 1743 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1744 B.buildMerge(Dst, {Src, HighAddr}); 1745 MI.eraseFromParent(); 1746 return true; 1747 } 1748 1749 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1750 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1751 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1752 unsigned NullVal = TM.getNullPointerValue(DestAS); 1753 1754 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1755 auto FlatNull = B.buildConstant(SrcTy, 0); 1756 1757 // Extract low 32-bits of the pointer. 1758 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1759 1760 auto CmpRes = 1761 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1762 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1763 1764 MI.eraseFromParent(); 1765 return true; 1766 } 1767 1768 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1769 return false; 1770 1771 if (!ST.hasFlatAddressSpace()) 1772 return false; 1773 1774 auto SegmentNull = 1775 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1776 auto FlatNull = 1777 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1778 1779 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1780 if (!ApertureReg.isValid()) 1781 return false; 1782 1783 auto CmpRes = 1784 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1785 1786 // Coerce the type of the low half of the result so we can use merge_values. 1787 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1788 1789 // TODO: Should we allow mismatched types but matching sizes in merges to 1790 // avoid the ptrtoint? 1791 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1792 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1793 1794 MI.eraseFromParent(); 1795 return true; 1796 } 1797 1798 bool AMDGPULegalizerInfo::legalizeFrint( 1799 MachineInstr &MI, MachineRegisterInfo &MRI, 1800 MachineIRBuilder &B) const { 1801 Register Src = MI.getOperand(1).getReg(); 1802 LLT Ty = MRI.getType(Src); 1803 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1804 1805 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1806 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1807 1808 auto C1 = B.buildFConstant(Ty, C1Val); 1809 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1810 1811 // TODO: Should this propagate fast-math-flags? 
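// This is the usual add/subtract-2^52 trick for rounding an f64 to an integer:
//   tmp = (src + copysign(2^52, src)) - copysign(2^52, src)
// rounds src in the current rounding mode. Any value whose magnitude is
// greater than 0x1.fffffffffffffp+51 (i.e. at least 2^52) is already an
// integer, so the select below keeps the original source in that case.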
1812 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1813 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1814 1815 auto C2 = B.buildFConstant(Ty, C2Val); 1816 auto Fabs = B.buildFAbs(Ty, Src); 1817 1818 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1819 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1820 MI.eraseFromParent(); 1821 return true; 1822 } 1823 1824 bool AMDGPULegalizerInfo::legalizeFceil( 1825 MachineInstr &MI, MachineRegisterInfo &MRI, 1826 MachineIRBuilder &B) const { 1827 1828 const LLT S1 = LLT::scalar(1); 1829 const LLT S64 = LLT::scalar(64); 1830 1831 Register Src = MI.getOperand(1).getReg(); 1832 assert(MRI.getType(Src) == S64); 1833 1834 // result = trunc(src) 1835 // if (src > 0.0 && src != result) 1836 // result += 1.0 1837 1838 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1839 1840 const auto Zero = B.buildFConstant(S64, 0.0); 1841 const auto One = B.buildFConstant(S64, 1.0); 1842 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1843 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1844 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1845 auto Add = B.buildSelect(S64, And, One, Zero); 1846 1847 // TODO: Should this propagate fast-math-flags? 1848 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1849 return true; 1850 } 1851 1852 static MachineInstrBuilder extractF64Exponent(Register Hi, 1853 MachineIRBuilder &B) { 1854 const unsigned FractBits = 52; 1855 const unsigned ExpBits = 11; 1856 LLT S32 = LLT::scalar(32); 1857 1858 auto Const0 = B.buildConstant(S32, FractBits - 32); 1859 auto Const1 = B.buildConstant(S32, ExpBits); 1860 1861 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1862 .addUse(Hi) 1863 .addUse(Const0.getReg(0)) 1864 .addUse(Const1.getReg(0)); 1865 1866 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1867 } 1868 1869 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1870 MachineInstr &MI, MachineRegisterInfo &MRI, 1871 MachineIRBuilder &B) const { 1872 const LLT S1 = LLT::scalar(1); 1873 const LLT S32 = LLT::scalar(32); 1874 const LLT S64 = LLT::scalar(64); 1875 1876 Register Src = MI.getOperand(1).getReg(); 1877 assert(MRI.getType(Src) == S64); 1878 1879 // TODO: Should this use extract since the low half is unused? 1880 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1881 Register Hi = Unmerge.getReg(1); 1882 1883 // Extract the upper half, since this is where we will find the sign and 1884 // exponent. 1885 auto Exp = extractF64Exponent(Hi, B); 1886 1887 const unsigned FractBits = 52; 1888 1889 // Extract the sign bit. 1890 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1891 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1892 1893 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1894 1895 const auto Zero32 = B.buildConstant(S32, 0); 1896 1897 // Extend back to 64-bits. 
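// Merging {0, sign bit of the high word} rebuilds a 64-bit pattern that is
// either +0.0 or -0.0. It is selected below when the unbiased exponent is
// negative, i.e. |src| < 1.0, where the truncated result is a signed zero.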
1898 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1899 1900 auto Shr = B.buildAShr(S64, FractMask, Exp); 1901 auto Not = B.buildNot(S64, Shr); 1902 auto Tmp0 = B.buildAnd(S64, Src, Not); 1903 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1904 1905 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1906 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1907 1908 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1909 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1910 MI.eraseFromParent(); 1911 return true; 1912 } 1913 1914 bool AMDGPULegalizerInfo::legalizeITOFP( 1915 MachineInstr &MI, MachineRegisterInfo &MRI, 1916 MachineIRBuilder &B, bool Signed) const { 1917 1918 Register Dst = MI.getOperand(0).getReg(); 1919 Register Src = MI.getOperand(1).getReg(); 1920 1921 const LLT S64 = LLT::scalar(64); 1922 const LLT S32 = LLT::scalar(32); 1923 1924 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1925 1926 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1927 1928 auto CvtHi = Signed ? 1929 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1930 B.buildUITOFP(S64, Unmerge.getReg(1)); 1931 1932 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1933 1934 auto ThirtyTwo = B.buildConstant(S32, 32); 1935 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1936 .addUse(CvtHi.getReg(0)) 1937 .addUse(ThirtyTwo.getReg(0)); 1938 1939 // TODO: Should this propagate fast-math-flags? 1940 B.buildFAdd(Dst, LdExp, CvtLo); 1941 MI.eraseFromParent(); 1942 return true; 1943 } 1944 1945 // TODO: Copied from DAG implementation. Verify logic and document how this 1946 // actually works. 1947 bool AMDGPULegalizerInfo::legalizeFPTOI( 1948 MachineInstr &MI, MachineRegisterInfo &MRI, 1949 MachineIRBuilder &B, bool Signed) const { 1950 1951 Register Dst = MI.getOperand(0).getReg(); 1952 Register Src = MI.getOperand(1).getReg(); 1953 1954 const LLT S64 = LLT::scalar(64); 1955 const LLT S32 = LLT::scalar(32); 1956 1957 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1958 1959 unsigned Flags = MI.getFlags(); 1960 1961 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1962 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1963 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1964 1965 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1966 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1967 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1968 1969 auto Hi = Signed ? 
1970 B.buildFPTOSI(S32, FloorMul) : 1971 B.buildFPTOUI(S32, FloorMul); 1972 auto Lo = B.buildFPTOUI(S32, Fma); 1973 1974 B.buildMerge(Dst, { Lo, Hi }); 1975 MI.eraseFromParent(); 1976 1977 return true; 1978 } 1979 1980 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1981 MachineInstr &MI) const { 1982 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1983 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1984 1985 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1986 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1987 1988 // With ieee_mode disabled, the instructions have the correct behavior 1989 // already for G_FMINNUM/G_FMAXNUM 1990 if (!MFI->getMode().IEEE) 1991 return !IsIEEEOp; 1992 1993 if (IsIEEEOp) 1994 return true; 1995 1996 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1997 } 1998 1999 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2000 MachineInstr &MI, MachineRegisterInfo &MRI, 2001 MachineIRBuilder &B) const { 2002 // TODO: Should move some of this into LegalizerHelper. 2003 2004 // TODO: Promote dynamic indexing of s16 to s32 2005 2006 // FIXME: Artifact combiner probably should have replaced the truncated 2007 // constant before this, so we shouldn't need 2008 // getConstantVRegValWithLookThrough. 2009 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2010 MI.getOperand(2).getReg(), MRI); 2011 if (!IdxVal) // Dynamic case will be selected to register indexing. 2012 return true; 2013 2014 Register Dst = MI.getOperand(0).getReg(); 2015 Register Vec = MI.getOperand(1).getReg(); 2016 2017 LLT VecTy = MRI.getType(Vec); 2018 LLT EltTy = VecTy.getElementType(); 2019 assert(EltTy == MRI.getType(Dst)); 2020 2021 if (IdxVal->Value < VecTy.getNumElements()) 2022 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 2023 else 2024 B.buildUndef(Dst); 2025 2026 MI.eraseFromParent(); 2027 return true; 2028 } 2029 2030 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2031 MachineInstr &MI, MachineRegisterInfo &MRI, 2032 MachineIRBuilder &B) const { 2033 // TODO: Should move some of this into LegalizerHelper. 2034 2035 // TODO: Promote dynamic indexing of s16 to s32 2036 2037 // FIXME: Artifact combiner probably should have replaced the truncated 2038 // constant before this, so we shouldn't need 2039 // getConstantVRegValWithLookThrough. 2040 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 2041 MI.getOperand(3).getReg(), MRI); 2042 if (!IdxVal) // Dynamic case will be selected to register indexing. 
2043 return true; 2044 2045 Register Dst = MI.getOperand(0).getReg(); 2046 Register Vec = MI.getOperand(1).getReg(); 2047 Register Ins = MI.getOperand(2).getReg(); 2048 2049 LLT VecTy = MRI.getType(Vec); 2050 LLT EltTy = VecTy.getElementType(); 2051 assert(EltTy == MRI.getType(Ins)); 2052 2053 if (IdxVal->Value < VecTy.getNumElements()) 2054 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2055 else 2056 B.buildUndef(Dst); 2057 2058 MI.eraseFromParent(); 2059 return true; 2060 } 2061 2062 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2063 MachineInstr &MI, MachineRegisterInfo &MRI, 2064 MachineIRBuilder &B) const { 2065 const LLT V2S16 = LLT::vector(2, 16); 2066 2067 Register Dst = MI.getOperand(0).getReg(); 2068 Register Src0 = MI.getOperand(1).getReg(); 2069 LLT DstTy = MRI.getType(Dst); 2070 LLT SrcTy = MRI.getType(Src0); 2071 2072 if (SrcTy == V2S16 && DstTy == V2S16 && 2073 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2074 return true; 2075 2076 MachineIRBuilder HelperBuilder(MI); 2077 GISelObserverWrapper DummyObserver; 2078 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2079 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2080 } 2081 2082 bool AMDGPULegalizerInfo::legalizeSinCos( 2083 MachineInstr &MI, MachineRegisterInfo &MRI, 2084 MachineIRBuilder &B) const { 2085 2086 Register DstReg = MI.getOperand(0).getReg(); 2087 Register SrcReg = MI.getOperand(1).getReg(); 2088 LLT Ty = MRI.getType(DstReg); 2089 unsigned Flags = MI.getFlags(); 2090 2091 Register TrigVal; 2092 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2093 if (ST.hasTrigReducedRange()) { 2094 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2095 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2096 .addUse(MulVal.getReg(0)) 2097 .setMIFlags(Flags).getReg(0); 2098 } else 2099 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2100 2101 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2102 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2103 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2104 .addUse(TrigVal) 2105 .setMIFlags(Flags); 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2111 MachineIRBuilder &B, 2112 const GlobalValue *GV, 2113 int64_t Offset, 2114 unsigned GAFlags) const { 2115 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2116 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2117 // to the following code sequence: 2118 // 2119 // For constant address space: 2120 // s_getpc_b64 s[0:1] 2121 // s_add_u32 s0, s0, $symbol 2122 // s_addc_u32 s1, s1, 0 2123 // 2124 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2125 // a fixup or relocation is emitted to replace $symbol with a literal 2126 // constant, which is a pc-relative offset from the encoding of the $symbol 2127 // operand to the global variable. 
2128 // 2129 // For global address space: 2130 // s_getpc_b64 s[0:1] 2131 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2132 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2133 // 2134 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2135 // fixups or relocations are emitted to replace $symbol@*@lo and 2136 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2137 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2138 // operand to the global variable. 2139 // 2140 // What we want here is an offset from the value returned by s_getpc 2141 // (which is the address of the s_add_u32 instruction) to the global 2142 // variable, but since the encoding of $symbol starts 4 bytes after the start 2143 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2144 // small. This requires us to add 4 to the global variable offset in order to 2145 // compute the correct address. 2146 2147 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2148 2149 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2150 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2151 2152 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2153 .addDef(PCReg); 2154 2155 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2156 if (GAFlags == SIInstrInfo::MO_NONE) 2157 MIB.addImm(0); 2158 else 2159 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2160 2161 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2162 2163 if (PtrTy.getSizeInBits() == 32) 2164 B.buildExtract(DstReg, PCReg, 0); 2165 return true; 2166 } 2167 2168 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2169 MachineInstr &MI, MachineRegisterInfo &MRI, 2170 MachineIRBuilder &B) const { 2171 Register DstReg = MI.getOperand(0).getReg(); 2172 LLT Ty = MRI.getType(DstReg); 2173 unsigned AS = Ty.getAddressSpace(); 2174 2175 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2176 MachineFunction &MF = B.getMF(); 2177 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2178 2179 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2180 if (!MFI->isEntryFunction()) { 2181 const Function &Fn = MF.getFunction(); 2182 DiagnosticInfoUnsupported BadLDSDecl( 2183 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2184 DS_Warning); 2185 Fn.getContext().diagnose(BadLDSDecl); 2186 2187 // We currently don't have a way to correctly allocate LDS objects that 2188 // aren't directly associated with a kernel. We do force inlining of 2189 // functions that use local objects. However, if these dead functions are 2190 // not eliminated, we don't want a compile time error. Just emit a warning 2191 // and a trap, since there should be no callable path here. 2192 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2193 B.buildUndef(DstReg); 2194 MI.eraseFromParent(); 2195 return true; 2196 } 2197 2198 // TODO: We could emit code to handle the initialization somewhere. 
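// For LDS/region globals without a supported initializer, either leave the
// G_GLOBAL_VALUE in place tagged as an absolute 32-bit symbol, or assign the
// variable a static offset in the kernel's LDS block via allocateLDSGlobal
// and materialize that offset as a constant. Globals that do carry an
// initializer in these address spaces are diagnosed as unsupported below.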
2199 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2200 const SITargetLowering *TLI = ST.getTargetLowering(); 2201 if (!TLI->shouldUseLDSConstAddress(GV)) { 2202 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2203 return true; // Leave in place. 2204 } 2205 2206 B.buildConstant( 2207 DstReg, 2208 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2209 MI.eraseFromParent(); 2210 return true; 2211 } 2212 2213 const Function &Fn = MF.getFunction(); 2214 DiagnosticInfoUnsupported BadInit( 2215 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2216 Fn.getContext().diagnose(BadInit); 2217 return true; 2218 } 2219 2220 const SITargetLowering *TLI = ST.getTargetLowering(); 2221 2222 if (TLI->shouldEmitFixup(GV)) { 2223 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2224 MI.eraseFromParent(); 2225 return true; 2226 } 2227 2228 if (TLI->shouldEmitPCReloc(GV)) { 2229 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2230 MI.eraseFromParent(); 2231 return true; 2232 } 2233 2234 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2235 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2236 2237 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2238 MachinePointerInfo::getGOT(MF), 2239 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2240 MachineMemOperand::MOInvariant, 2241 8 /*Size*/, Align(8)); 2242 2243 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2244 2245 if (Ty.getSizeInBits() == 32) { 2246 // Truncate if this is a 32-bit constant address. 2247 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2248 B.buildExtract(DstReg, Load, 0); 2249 } else 2250 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2251 2252 MI.eraseFromParent(); 2253 return true; 2254 } 2255 2256 bool AMDGPULegalizerInfo::legalizeLoad( 2257 MachineInstr &MI, MachineRegisterInfo &MRI, 2258 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2259 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2260 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2261 Observer.changingInstr(MI); 2262 MI.getOperand(1).setReg(Cast.getReg(0)); 2263 Observer.changedInstr(MI); 2264 return true; 2265 } 2266 2267 bool AMDGPULegalizerInfo::legalizeFMad( 2268 MachineInstr &MI, MachineRegisterInfo &MRI, 2269 MachineIRBuilder &B) const { 2270 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2271 assert(Ty.isScalar()); 2272 2273 MachineFunction &MF = B.getMF(); 2274 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2275 2276 // TODO: Always legal with future ftz flag. 2277 // FIXME: Do we need just output?
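// G_FMAD is only kept legal when the corresponding denormal mode flushes
// denormals (the mad/mac instructions do not preserve denormal inputs);
// otherwise fall back to LegalizerHelper::lowerFMad, which expands it into an
// fmul + fadd pair.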
2278 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2279 return true; 2280 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2281 return true; 2282 2283 MachineIRBuilder HelperBuilder(MI); 2284 GISelObserverWrapper DummyObserver; 2285 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2286 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2287 } 2288 2289 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2290 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2291 Register DstReg = MI.getOperand(0).getReg(); 2292 Register PtrReg = MI.getOperand(1).getReg(); 2293 Register CmpVal = MI.getOperand(2).getReg(); 2294 Register NewVal = MI.getOperand(3).getReg(); 2295 2296 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2297 "this should not have been custom lowered"); 2298 2299 LLT ValTy = MRI.getType(CmpVal); 2300 LLT VecTy = LLT::vector(2, ValTy); 2301 2302 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2303 2304 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2305 .addDef(DstReg) 2306 .addUse(PtrReg) 2307 .addUse(PackedVal) 2308 .setMemRefs(MI.memoperands()); 2309 2310 MI.eraseFromParent(); 2311 return true; 2312 } 2313 2314 bool AMDGPULegalizerInfo::legalizeFlog( 2315 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2316 Register Dst = MI.getOperand(0).getReg(); 2317 Register Src = MI.getOperand(1).getReg(); 2318 LLT Ty = B.getMRI()->getType(Dst); 2319 unsigned Flags = MI.getFlags(); 2320 2321 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2322 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2323 2324 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2325 MI.eraseFromParent(); 2326 return true; 2327 } 2328 2329 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2330 MachineIRBuilder &B) const { 2331 Register Dst = MI.getOperand(0).getReg(); 2332 Register Src = MI.getOperand(1).getReg(); 2333 unsigned Flags = MI.getFlags(); 2334 LLT Ty = B.getMRI()->getType(Dst); 2335 2336 auto K = B.buildFConstant(Ty, numbers::log2e); 2337 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2338 B.buildFExp2(Dst, Mul, Flags); 2339 MI.eraseFromParent(); 2340 return true; 2341 } 2342 2343 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2344 MachineIRBuilder &B) const { 2345 Register Dst = MI.getOperand(0).getReg(); 2346 Register Src0 = MI.getOperand(1).getReg(); 2347 Register Src1 = MI.getOperand(2).getReg(); 2348 unsigned Flags = MI.getFlags(); 2349 LLT Ty = B.getMRI()->getType(Dst); 2350 const LLT S16 = LLT::scalar(16); 2351 const LLT S32 = LLT::scalar(32); 2352 2353 if (Ty == S32) { 2354 auto Log = B.buildFLog2(S32, Src0, Flags); 2355 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2356 .addUse(Log.getReg(0)) 2357 .addUse(Src1) 2358 .setMIFlags(Flags); 2359 B.buildFExp2(Dst, Mul, Flags); 2360 } else if (Ty == S16) { 2361 // There's no f16 fmul_legacy, so we need to convert for it. 
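// Same expansion as the f32 case above: pow(x, y) = exp2(y * log2(x)), except
// the multiply is done at f32 with fmul_legacy and then truncated back to f16.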
2362 auto Log = B.buildFLog2(S16, Src0, Flags); 2363 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2364 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2365 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2366 .addUse(Ext0.getReg(0)) 2367 .addUse(Ext1.getReg(0)) 2368 .setMIFlags(Flags); 2369 2370 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2371 } else 2372 return false; 2373 2374 MI.eraseFromParent(); 2375 return true; 2376 } 2377 2378 // Find a source register, ignoring any possible source modifiers. 2379 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2380 Register ModSrc = OrigSrc; 2381 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2382 ModSrc = SrcFNeg->getOperand(1).getReg(); 2383 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2384 ModSrc = SrcFAbs->getOperand(1).getReg(); 2385 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2386 ModSrc = SrcFAbs->getOperand(1).getReg(); 2387 return ModSrc; 2388 } 2389 2390 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2391 MachineRegisterInfo &MRI, 2392 MachineIRBuilder &B) const { 2393 2394 const LLT S1 = LLT::scalar(1); 2395 const LLT S64 = LLT::scalar(64); 2396 Register Dst = MI.getOperand(0).getReg(); 2397 Register OrigSrc = MI.getOperand(1).getReg(); 2398 unsigned Flags = MI.getFlags(); 2399 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2400 "this should not have been custom lowered"); 2401 2402 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2403 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2404 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2405 // V_FRACT bug is: 2406 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2407 // 2408 // Convert floor(x) to (x - fract(x)) 2409 2410 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2411 .addUse(OrigSrc) 2412 .setMIFlags(Flags); 2413 2414 // Give source modifier matching some assistance before obscuring a foldable 2415 // pattern. 2416 2417 // TODO: We can avoid the neg on the fract? The input sign to fract 2418 // shouldn't matter? 2419 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2420 2421 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2422 2423 Register Min = MRI.createGenericVirtualRegister(S64); 2424 2425 // We don't need to concern ourselves with the snan handling difference, so 2426 // use the one which will directly select. 2427 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2428 if (MFI->getMode().IEEE) 2429 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2430 else 2431 B.buildFMinNum(Min, Fract, Const, Flags); 2432 2433 Register CorrectedFract = Min; 2434 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2435 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2436 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2437 } 2438 2439 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2440 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2441 2442 MI.eraseFromParent(); 2443 return true; 2444 } 2445 2446 // Turn an illegal packed v2s16 build vector into bit operations. 2447 // TODO: This should probably be a bitcast action in LegalizerHelper. 
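// Roughly, for the generic MIR:
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// this produces:
//   %merge:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)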
2448 bool AMDGPULegalizerInfo::legalizeBuildVector( 2449 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2450 Register Dst = MI.getOperand(0).getReg(); 2451 const LLT S32 = LLT::scalar(32); 2452 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2453 2454 Register Src0 = MI.getOperand(1).getReg(); 2455 Register Src1 = MI.getOperand(2).getReg(); 2456 assert(MRI.getType(Src0) == LLT::scalar(16)); 2457 2458 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2459 B.buildBitcast(Dst, Merge); 2460 2461 MI.eraseFromParent(); 2462 return true; 2463 } 2464 2465 // Return the use branch instruction, otherwise null if the usage is invalid. 2466 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2467 MachineRegisterInfo &MRI, 2468 MachineInstr *&Br, 2469 MachineBasicBlock *&UncondBrTarget) { 2470 Register CondDef = MI.getOperand(0).getReg(); 2471 if (!MRI.hasOneNonDBGUse(CondDef)) 2472 return nullptr; 2473 2474 MachineBasicBlock *Parent = MI.getParent(); 2475 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2476 if (UseMI.getParent() != Parent || 2477 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2478 return nullptr; 2479 2480 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2481 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2482 if (Next == Parent->end()) { 2483 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2484 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2485 return nullptr; 2486 UncondBrTarget = &*NextMBB; 2487 } else { 2488 if (Next->getOpcode() != AMDGPU::G_BR) 2489 return nullptr; 2490 Br = &*Next; 2491 UncondBrTarget = Br->getOperand(0).getMBB(); 2492 } 2493 2494 return &UseMI; 2495 } 2496 2497 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2498 const ArgDescriptor *Arg, 2499 const TargetRegisterClass *ArgRC, 2500 LLT ArgTy) const { 2501 MCRegister SrcReg = Arg->getRegister(); 2502 assert(SrcReg.isPhysical() && "Physical register expected"); 2503 assert(DstReg.isVirtual() && "Virtual register expected"); 2504 2505 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2506 ArgTy); 2507 if (Arg->isMasked()) { 2508 // TODO: Should we try to emit this once in the entry block? 
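// The mask describes a bitfield within the incoming register (e.g. packed
// work item IDs). For example, with Mask = 0x3ff0 the field is isolated by
// shifting the live-in right by 4 and then ANDing with 0x3ff.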
2509 const LLT S32 = LLT::scalar(32); 2510 const unsigned Mask = Arg->getMask(); 2511 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2512 2513 Register AndMaskSrc = LiveIn; 2514 2515 if (Shift != 0) { 2516 auto ShiftAmt = B.buildConstant(S32, Shift); 2517 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2518 } 2519 2520 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2521 } else { 2522 B.buildCopy(DstReg, LiveIn); 2523 } 2524 2525 return true; 2526 } 2527 2528 bool AMDGPULegalizerInfo::loadInputValue( 2529 Register DstReg, MachineIRBuilder &B, 2530 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2531 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2532 const ArgDescriptor *Arg; 2533 const TargetRegisterClass *ArgRC; 2534 LLT ArgTy; 2535 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2536 2537 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2538 return false; // TODO: Handle these 2539 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2540 } 2541 2542 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2543 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2544 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2545 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2546 return false; 2547 2548 MI.eraseFromParent(); 2549 return true; 2550 } 2551 2552 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2553 MachineRegisterInfo &MRI, 2554 MachineIRBuilder &B) const { 2555 Register Dst = MI.getOperand(0).getReg(); 2556 LLT DstTy = MRI.getType(Dst); 2557 LLT S16 = LLT::scalar(16); 2558 LLT S32 = LLT::scalar(32); 2559 LLT S64 = LLT::scalar(64); 2560 2561 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2562 return true; 2563 2564 if (DstTy == S16) 2565 return legalizeFDIV16(MI, MRI, B); 2566 if (DstTy == S32) 2567 return legalizeFDIV32(MI, MRI, B); 2568 if (DstTy == S64) 2569 return legalizeFDIV64(MI, MRI, B); 2570 2571 return false; 2572 } 2573 2574 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2575 Register DstReg, 2576 Register X, 2577 Register Y, 2578 bool IsDiv) const { 2579 const LLT S1 = LLT::scalar(1); 2580 const LLT S32 = LLT::scalar(32); 2581 2582 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2583 // algorithm used here. 2584 2585 // Initial estimate of inv(y). 2586 auto FloatY = B.buildUITOFP(S32, Y); 2587 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2588 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2589 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2590 auto Z = B.buildFPTOUI(S32, ScaledY); 2591 2592 // One round of UNR. 2593 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2594 auto NegYZ = B.buildMul(S32, NegY, Z); 2595 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2596 2597 // Quotient/remainder estimate. 2598 auto Q = B.buildUMulH(S32, X, Z); 2599 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2600 2601 // First quotient/remainder refinement. 2602 auto One = B.buildConstant(S32, 1); 2603 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2604 if (IsDiv) 2605 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2606 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2607 2608 // Second quotient/remainder refinement. 
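// The quotient estimate from the single Newton-Raphson step can still be up
// to two short of the exact result, so the remainder is compared against the
// divisor once more and a final conditional correction is applied.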
2609 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2610 if (IsDiv) 2611 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2612 else 2613 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2614 } 2615 2616 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2617 MachineRegisterInfo &MRI, 2618 MachineIRBuilder &B) const { 2619 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2620 Register DstReg = MI.getOperand(0).getReg(); 2621 Register Num = MI.getOperand(1).getReg(); 2622 Register Den = MI.getOperand(2).getReg(); 2623 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2624 MI.eraseFromParent(); 2625 return true; 2626 } 2627 2628 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2629 // 2630 // Return lo, hi of result 2631 // 2632 // %cvt.lo = G_UITOFP Val.lo 2633 // %cvt.hi = G_UITOFP Val.hi 2634 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2635 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2636 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2637 // %mul2 = G_FMUL %mul1, 2**(-32) 2638 // %trunc = G_INTRINSIC_TRUNC %mul2 2639 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2640 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2641 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2642 Register Val) { 2643 const LLT S32 = LLT::scalar(32); 2644 auto Unmerge = B.buildUnmerge(S32, Val); 2645 2646 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2647 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2648 2649 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2650 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2651 2652 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2653 auto Mul1 = 2654 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2655 2656 // 2**(-32) 2657 auto Mul2 = 2658 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2659 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2660 2661 // -(2**32) 2662 auto Mad2 = B.buildFMAD(S32, Trunc, 2663 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2664 2665 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2666 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2667 2668 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2669 } 2670 2671 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2672 Register DstReg, 2673 Register Numer, 2674 Register Denom, 2675 bool IsDiv) const { 2676 const LLT S32 = LLT::scalar(32); 2677 const LLT S64 = LLT::scalar(64); 2678 const LLT S1 = LLT::scalar(1); 2679 Register RcpLo, RcpHi; 2680 2681 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2682 2683 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2684 2685 auto Zero64 = B.buildConstant(S64, 0); 2686 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2687 2688 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2689 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2690 2691 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2692 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2693 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2694 2695 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2696 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2697 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2698 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2699 2700 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2701 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2702 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2703 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2704 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2705 2706 auto Zero32 =
B.buildConstant(S32, 0); 2707 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2708 auto Add2_HiC = 2709 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2710 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2711 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2712 2713 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2714 Register NumerLo = UnmergeNumer.getReg(0); 2715 Register NumerHi = UnmergeNumer.getReg(1); 2716 2717 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2718 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2719 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2720 Register Mul3_Lo = UnmergeMul3.getReg(0); 2721 Register Mul3_Hi = UnmergeMul3.getReg(1); 2722 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2723 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2724 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2725 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2726 2727 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2728 Register DenomLo = UnmergeDenom.getReg(0); 2729 Register DenomHi = UnmergeDenom.getReg(1); 2730 2731 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2732 auto C1 = B.buildSExt(S32, CmpHi); 2733 2734 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2735 auto C2 = B.buildSExt(S32, CmpLo); 2736 2737 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2738 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2739 2740 // TODO: Here and below portions of the code can be enclosed into if/endif. 2741 // Currently control flow is unconditional and we have 4 selects after 2742 // potential endif to substitute PHIs. 2743 2744 // if C3 != 0 ... 2745 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2746 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2747 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2748 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2749 2750 auto One64 = B.buildConstant(S64, 1); 2751 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2752 2753 auto C4 = 2754 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2755 auto C5 = 2756 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2757 auto C6 = B.buildSelect( 2758 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2759 2760 // if (C6 != 0) 2761 auto Add4 = B.buildAdd(S64, Add3, One64); 2762 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2763 2764 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2765 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2766 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2767 2768 // endif C6 2769 // endif C3 2770 2771 if (IsDiv) { 2772 auto Sel1 = B.buildSelect( 2773 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2774 B.buildSelect(DstReg, 2775 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2776 } else { 2777 auto Sel2 = B.buildSelect( 2778 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2779 B.buildSelect(DstReg, 2780 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2781 } 2782 } 2783 2784 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2785 MachineRegisterInfo &MRI, 2786 MachineIRBuilder &B) const { 2787 const LLT S64 = LLT::scalar(64); 2788 const LLT S32 = LLT::scalar(32); 2789 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2790 Register DstReg = MI.getOperand(0).getReg(); 2791 Register Num 
= MI.getOperand(1).getReg(); 2792 Register Den = MI.getOperand(2).getReg(); 2793 LLT Ty = MRI.getType(DstReg); 2794 2795 if (Ty == S32) 2796 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2797 else if (Ty == S64) 2798 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2799 else 2800 return false; 2801 2802 MI.eraseFromParent(); 2803 return true; 2804 2805 } 2806 2807 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2808 MachineRegisterInfo &MRI, 2809 MachineIRBuilder &B) const { 2810 const LLT S64 = LLT::scalar(64); 2811 const LLT S32 = LLT::scalar(32); 2812 2813 Register DstReg = MI.getOperand(0).getReg(); 2814 const LLT Ty = MRI.getType(DstReg); 2815 if (Ty != S32 && Ty != S64) 2816 return false; 2817 2818 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2819 2820 Register LHS = MI.getOperand(1).getReg(); 2821 Register RHS = MI.getOperand(2).getReg(); 2822 2823 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2824 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2825 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2826 2827 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2828 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2829 2830 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2831 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2832 2833 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2834 if (Ty == S32) 2835 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2836 else 2837 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2838 2839 Register Sign; 2840 if (IsDiv) 2841 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2842 else 2843 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2844 2845 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2846 B.buildSub(DstReg, UDivRem, Sign); 2847 2848 MI.eraseFromParent(); 2849 return true; 2850 } 2851 2852 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2853 MachineRegisterInfo &MRI, 2854 MachineIRBuilder &B) const { 2855 Register Res = MI.getOperand(0).getReg(); 2856 Register LHS = MI.getOperand(1).getReg(); 2857 Register RHS = MI.getOperand(2).getReg(); 2858 2859 uint16_t Flags = MI.getFlags(); 2860 2861 LLT ResTy = MRI.getType(Res); 2862 LLT S32 = LLT::scalar(32); 2863 LLT S64 = LLT::scalar(64); 2864 2865 const MachineFunction &MF = B.getMF(); 2866 bool Unsafe = 2867 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2868 2869 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2870 return false; 2871 2872 if (!Unsafe && ResTy == S32 && 2873 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2874 return false; 2875 2876 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2877 // 1 / x -> RCP(x) 2878 if (CLHS->isExactlyValue(1.0)) { 2879 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2880 .addUse(RHS) 2881 .setMIFlags(Flags); 2882 2883 MI.eraseFromParent(); 2884 return true; 2885 } 2886 2887 // -1 / x -> RCP( FNEG(x) ) 2888 if (CLHS->isExactlyValue(-1.0)) { 2889 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2890 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2891 .addUse(FNeg.getReg(0)) 2892 .setMIFlags(Flags); 2893 2894 MI.eraseFromParent(); 2895 return true; 2896 } 2897 } 2898 2899 // x / y -> x * (1.0 / y) 2900 if (Unsafe) { 2901 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2902 .addUse(RHS) 2903 .setMIFlags(Flags); 2904 B.buildFMul(Res, LHS, RCP, Flags); 2905 2906 MI.eraseFromParent(); 2907 return true; 2908 } 2909 2910 return false; 2911 } 2912 2913 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2914 MachineRegisterInfo &MRI, 2915 MachineIRBuilder &B) const { 2916 Register Res = MI.getOperand(0).getReg(); 2917 Register LHS = MI.getOperand(1).getReg(); 2918 Register RHS = MI.getOperand(2).getReg(); 2919 2920 uint16_t Flags = MI.getFlags(); 2921 2922 LLT S16 = LLT::scalar(16); 2923 LLT S32 = LLT::scalar(32); 2924 2925 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2926 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2927 2928 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2929 .addUse(RHSExt.getReg(0)) 2930 .setMIFlags(Flags); 2931 2932 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2933 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2934 2935 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2936 .addUse(RDst.getReg(0)) 2937 .addUse(RHS) 2938 .addUse(LHS) 2939 .setMIFlags(Flags); 2940 2941 MI.eraseFromParent(); 2942 return true; 2943 } 2944 2945 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2946 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2947 static void toggleSPDenormMode(bool Enable, 2948 MachineIRBuilder &B, 2949 const GCNSubtarget &ST, 2950 AMDGPU::SIModeRegisterDefaults Mode) { 2951 // Set SP denorm mode to this value. 2952 unsigned SPDenormMode = 2953 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2954 2955 if (ST.hasDenormModeInst()) { 2956 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2957 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2958 2959 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2960 B.buildInstr(AMDGPU::S_DENORM_MODE) 2961 .addImm(NewDenormModeValue); 2962 2963 } else { 2964 // Select FP32 bit field in mode register. 2965 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2966 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2967 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2968 2969 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2970 .addImm(SPDenormMode) 2971 .addImm(SPDenormModeBitField); 2972 } 2973 } 2974 2975 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2976 MachineRegisterInfo &MRI, 2977 MachineIRBuilder &B) const { 2978 Register Res = MI.getOperand(0).getReg(); 2979 Register LHS = MI.getOperand(1).getReg(); 2980 Register RHS = MI.getOperand(2).getReg(); 2981 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2982 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2983 2984 uint16_t Flags = MI.getFlags(); 2985 2986 LLT S32 = LLT::scalar(32); 2987 LLT S1 = LLT::scalar(1); 2988 2989 auto One = B.buildFConstant(S32, 1.0f); 2990 2991 auto DenominatorScaled = 2992 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2993 .addUse(LHS) 2994 .addUse(RHS) 2995 .addImm(0) 2996 .setMIFlags(Flags); 2997 auto NumeratorScaled = 2998 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2999 .addUse(LHS) 3000 .addUse(RHS) 3001 .addImm(1) 3002 .setMIFlags(Flags); 3003 3004 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3005 .addUse(DenominatorScaled.getReg(0)) 3006 .setMIFlags(Flags); 3007 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3008 3009 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3010 // aren't modeled as reading it. 
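// FP32 denormals are temporarily enabled around the core sequence when they
// are off by default: the fmas below run a Newton-Raphson refinement of the
// scaled reciprocal and quotient, and the scaled intermediates rely on
// denormal values not being flushed. div_fmas then re-applies the scaling
// chosen by div_scale and div_fixup produces the final quotient.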
3011 if (!Mode.allFP32Denormals()) 3012 toggleSPDenormMode(true, B, ST, Mode); 3013 3014 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3015 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3016 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3017 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3018 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3019 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3020 3021 if (!Mode.allFP32Denormals()) 3022 toggleSPDenormMode(false, B, ST, Mode); 3023 3024 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3025 .addUse(Fma4.getReg(0)) 3026 .addUse(Fma1.getReg(0)) 3027 .addUse(Fma3.getReg(0)) 3028 .addUse(NumeratorScaled.getReg(1)) 3029 .setMIFlags(Flags); 3030 3031 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3032 .addUse(Fmas.getReg(0)) 3033 .addUse(RHS) 3034 .addUse(LHS) 3035 .setMIFlags(Flags); 3036 3037 MI.eraseFromParent(); 3038 return true; 3039 } 3040 3041 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3042 MachineRegisterInfo &MRI, 3043 MachineIRBuilder &B) const { 3044 Register Res = MI.getOperand(0).getReg(); 3045 Register LHS = MI.getOperand(1).getReg(); 3046 Register RHS = MI.getOperand(2).getReg(); 3047 3048 uint16_t Flags = MI.getFlags(); 3049 3050 LLT S64 = LLT::scalar(64); 3051 LLT S1 = LLT::scalar(1); 3052 3053 auto One = B.buildFConstant(S64, 1.0); 3054 3055 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3056 .addUse(LHS) 3057 .addUse(RHS) 3058 .addImm(0) 3059 .setMIFlags(Flags); 3060 3061 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3062 3063 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3064 .addUse(DivScale0.getReg(0)) 3065 .setMIFlags(Flags); 3066 3067 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3068 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3069 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3070 3071 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3072 .addUse(LHS) 3073 .addUse(RHS) 3074 .addImm(1) 3075 .setMIFlags(Flags); 3076 3077 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3078 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3079 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3080 3081 Register Scale; 3082 if (!ST.hasUsableDivScaleConditionOutput()) { 3083 // Workaround a hardware bug on SI where the condition output from div_scale 3084 // is not usable. 
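// Recover that flag manually: compare the high 32 bits of the numerator and
// denominator with the high halves of the two div_scale results to work out
// which operand was scaled, and xor the comparisons to get the value the
// condition output would have supplied to div_fmas.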
3085 3086 LLT S32 = LLT::scalar(32); 3087 3088 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3089 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3090 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3091 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3092 3093 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3094 Scale1Unmerge.getReg(1)); 3095 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3096 Scale0Unmerge.getReg(1)); 3097 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3098 } else { 3099 Scale = DivScale1.getReg(1); 3100 } 3101 3102 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3103 .addUse(Fma4.getReg(0)) 3104 .addUse(Fma3.getReg(0)) 3105 .addUse(Mul.getReg(0)) 3106 .addUse(Scale) 3107 .setMIFlags(Flags); 3108 3109 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3110 .addUse(Fmas.getReg(0)) 3111 .addUse(RHS) 3112 .addUse(LHS) 3113 .setMIFlags(Flags); 3114 3115 MI.eraseFromParent(); 3116 return true; 3117 } 3118 3119 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3120 MachineRegisterInfo &MRI, 3121 MachineIRBuilder &B) const { 3122 Register Res = MI.getOperand(0).getReg(); 3123 Register LHS = MI.getOperand(2).getReg(); 3124 Register RHS = MI.getOperand(3).getReg(); 3125 uint16_t Flags = MI.getFlags(); 3126 3127 LLT S32 = LLT::scalar(32); 3128 LLT S1 = LLT::scalar(1); 3129 3130 auto Abs = B.buildFAbs(S32, RHS, Flags); 3131 const APFloat C0Val(1.0f); 3132 3133 auto C0 = B.buildConstant(S32, 0x6f800000); 3134 auto C1 = B.buildConstant(S32, 0x2f800000); 3135 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3136 3137 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3138 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3139 3140 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3141 3142 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3143 .addUse(Mul0.getReg(0)) 3144 .setMIFlags(Flags); 3145 3146 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3147 3148 B.buildFMul(Res, Sel, Mul1, Flags); 3149 3150 MI.eraseFromParent(); 3151 return true; 3152 } 3153 3154 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3155 MachineRegisterInfo &MRI, 3156 MachineIRBuilder &B) const { 3157 uint64_t Offset = 3158 ST.getTargetLowering()->getImplicitParameterOffset( 3159 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3160 LLT DstTy = MRI.getType(DstReg); 3161 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3162 3163 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3164 if (!loadInputValue(KernargPtrReg, B, 3165 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3166 return false; 3167 3168 // FIXME: This should be nuw 3169 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3170 return true; 3171 } 3172 3173 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3174 MachineRegisterInfo &MRI, 3175 MachineIRBuilder &B) const { 3176 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3177 if (!MFI->isEntryFunction()) { 3178 return legalizePreloadedArgIntrin(MI, MRI, B, 3179 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3180 } 3181 3182 Register DstReg = MI.getOperand(0).getReg(); 3183 if (!getImplicitArgPtr(DstReg, MRI, B)) 3184 return false; 3185 3186 MI.eraseFromParent(); 3187 return true; 3188 } 3189 3190 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3191 MachineRegisterInfo &MRI, 3192 MachineIRBuilder &B, 3193 unsigned AddrSpace) const { 3194 
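// A flat pointer lands in the given segment exactly when its high 32 bits
// match that segment's aperture base, so compare the extracted high half of
// the pointer against the aperture register.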
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3195 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3196 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3197 MI.eraseFromParent(); 3198 return true; 3199 } 3200 3201 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3202 // offset (the offset that is included in bounds checking and swizzling, to be 3203 // split between the instruction's voffset and immoffset fields) and soffset 3204 // (the offset that is excluded from bounds checking and swizzling, to go in 3205 // the instruction's soffset field). This function takes the first kind of 3206 // offset and figures out how to split it between voffset and immoffset. 3207 std::tuple<Register, unsigned, unsigned> 3208 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3209 Register OrigOffset) const { 3210 const unsigned MaxImm = 4095; 3211 Register BaseReg; 3212 unsigned TotalConstOffset; 3213 MachineInstr *OffsetDef; 3214 const LLT S32 = LLT::scalar(32); 3215 3216 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3217 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3218 3219 unsigned ImmOffset = TotalConstOffset; 3220 3221 // If the immediate value is too big for the immoffset field, put the value 3222 // and -4096 into the immoffset field so that the value that is copied/added 3223 // for the voffset field is a multiple of 4096, and it stands more chance 3224 // of being CSEd with the copy/add for another similar load/store. 3225 // However, do not do that rounding down to a multiple of 4096 if that is a 3226 // negative number, as it appears to be illegal to have a negative offset 3227 // in the vgpr, even if adding the immediate offset makes it positive. 3228 unsigned Overflow = ImmOffset & ~MaxImm; 3229 ImmOffset -= Overflow; 3230 if ((int32_t)Overflow < 0) { 3231 Overflow += ImmOffset; 3232 ImmOffset = 0; 3233 } 3234 3235 if (Overflow != 0) { 3236 if (!BaseReg) { 3237 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3238 } else { 3239 auto OverflowVal = B.buildConstant(S32, Overflow); 3240 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3241 } 3242 } 3243 3244 if (!BaseReg) 3245 BaseReg = B.buildConstant(S32, 0).getReg(0); 3246 3247 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3248 } 3249 3250 /// Handle register layout difference for f16 images for some subtargets. 3251 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3252 MachineRegisterInfo &MRI, 3253 Register Reg) const { 3254 if (!ST.hasUnpackedD16VMem()) 3255 return Reg; 3256 3257 const LLT S16 = LLT::scalar(16); 3258 const LLT S32 = LLT::scalar(32); 3259 LLT StoreVT = MRI.getType(Reg); 3260 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3261 3262 auto Unmerge = B.buildUnmerge(S16, Reg); 3263 3264 SmallVector<Register, 4> WideRegs; 3265 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3266 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3267 3268 int NumElts = StoreVT.getNumElements(); 3269 3270 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3271 } 3272 3273 Register AMDGPULegalizerInfo::fixStoreSourceType( 3274 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3275 MachineRegisterInfo *MRI = B.getMRI(); 3276 LLT Ty = MRI->getType(VData); 3277 3278 const LLT S16 = LLT::scalar(16); 3279 3280 // Fixup illegal register types for i8 stores. 
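// s8 and s16 store data is any-extended to a 32-bit register; short <N x s16>
// sources of format stores go through handleD16VData, which widens each
// element to 32 bits on subtargets with unpacked d16 memory instructions.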
3281 if (Ty == LLT::scalar(8) || Ty == S16) { 3282 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3283 return AnyExt; 3284 } 3285 3286 if (Ty.isVector()) { 3287 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3288 if (IsFormat) 3289 return handleD16VData(B, *MRI, VData); 3290 } 3291 } 3292 3293 return VData; 3294 } 3295 3296 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3297 MachineRegisterInfo &MRI, 3298 MachineIRBuilder &B, 3299 bool IsTyped, 3300 bool IsFormat) const { 3301 Register VData = MI.getOperand(1).getReg(); 3302 LLT Ty = MRI.getType(VData); 3303 LLT EltTy = Ty.getScalarType(); 3304 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3305 const LLT S32 = LLT::scalar(32); 3306 3307 VData = fixStoreSourceType(B, VData, IsFormat); 3308 Register RSrc = MI.getOperand(2).getReg(); 3309 3310 MachineMemOperand *MMO = *MI.memoperands_begin(); 3311 const int MemSize = MMO->getSize(); 3312 3313 unsigned ImmOffset; 3314 unsigned TotalOffset; 3315 3316 // The typed intrinsics add an immediate after the registers. 3317 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3318 3319 // The struct intrinsic variants add one additional operand over raw. 3320 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3321 Register VIndex; 3322 int OpOffset = 0; 3323 if (HasVIndex) { 3324 VIndex = MI.getOperand(3).getReg(); 3325 OpOffset = 1; 3326 } 3327 3328 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3329 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3330 3331 unsigned Format = 0; 3332 if (IsTyped) { 3333 Format = MI.getOperand(5 + OpOffset).getImm(); 3334 ++OpOffset; 3335 } 3336 3337 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3338 3339 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3340 if (TotalOffset != 0) 3341 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3342 3343 unsigned Opc; 3344 if (IsTyped) { 3345 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3346 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3347 } else if (IsFormat) { 3348 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3349 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3350 } else { 3351 switch (MemSize) { 3352 case 1: 3353 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3354 break; 3355 case 2: 3356 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3357 break; 3358 default: 3359 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3360 break; 3361 } 3362 } 3363 3364 if (!VIndex) 3365 VIndex = B.buildConstant(S32, 0).getReg(0); 3366 3367 auto MIB = B.buildInstr(Opc) 3368 .addUse(VData) // vdata 3369 .addUse(RSrc) // rsrc 3370 .addUse(VIndex) // vindex 3371 .addUse(VOffset) // voffset 3372 .addUse(SOffset) // soffset 3373 .addImm(ImmOffset); // offset(imm) 3374 3375 if (IsTyped) 3376 MIB.addImm(Format); 3377 3378 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3379 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3380 .addMemOperand(MMO); 3381 3382 MI.eraseFromParent(); 3383 return true; 3384 } 3385 3386 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3387 MachineRegisterInfo &MRI, 3388 MachineIRBuilder &B, 3389 bool IsFormat, 3390 bool IsTyped) const { 3391 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
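// Operand layout (operand 1 being the intrinsic ID): dst, rsrc, an optional
// vindex for the struct variants, voffset, soffset, an optional format
// immediate for the typed variants, and the cachepolicy/swizzle immediate.
// Raw and struct forms are told apart below by the total operand count.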
3392   MachineMemOperand *MMO = *MI.memoperands_begin();
3393   const int MemSize = MMO->getSize();
3394   const LLT S32 = LLT::scalar(32);
3395
3396   Register Dst = MI.getOperand(0).getReg();
3397   Register RSrc = MI.getOperand(2).getReg();
3398
3399   // The typed intrinsics add an immediate after the registers.
3400   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3401
3402   // The struct intrinsic variants add one additional operand over raw.
3403   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3404   Register VIndex;
3405   int OpOffset = 0;
3406   if (HasVIndex) {
3407     VIndex = MI.getOperand(3).getReg();
3408     OpOffset = 1;
3409   }
3410
3411   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3412   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3413
3414   unsigned Format = 0;
3415   if (IsTyped) {
3416     Format = MI.getOperand(5 + OpOffset).getImm();
3417     ++OpOffset;
3418   }
3419
3420   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3421   unsigned ImmOffset;
3422   unsigned TotalOffset;
3423
3424   LLT Ty = MRI.getType(Dst);
3425   LLT EltTy = Ty.getScalarType();
3426   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3427   const bool Unpacked = ST.hasUnpackedD16VMem();
3428
3429   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3430   if (TotalOffset != 0)
3431     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3432
3433   unsigned Opc;
3434
3435   if (IsTyped) {
3436     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3437                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3438   } else if (IsFormat) {
3439     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3440                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3441   } else {
3442     switch (MemSize) {
3443     case 1:
3444       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3445       break;
3446     case 2:
3447       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3448       break;
3449     default:
3450       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3451       break;
3452     }
3453   }
3454
3455   Register LoadDstReg;
3456
3457   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3458   LLT UnpackedTy = Ty.changeElementSize(32);
3459
3460   if (IsExtLoad)
3461     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3462   else if (Unpacked && IsD16 && Ty.isVector())
3463     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3464   else
3465     LoadDstReg = Dst;
3466
3467   if (!VIndex)
3468     VIndex = B.buildConstant(S32, 0).getReg(0);
3469
3470   auto MIB = B.buildInstr(Opc)
3471     .addDef(LoadDstReg) // vdata
3472     .addUse(RSrc)       // rsrc
3473     .addUse(VIndex)     // vindex
3474     .addUse(VOffset)    // voffset
3475     .addUse(SOffset)    // soffset
3476     .addImm(ImmOffset); // offset(imm)
3477
3478   if (IsTyped)
3479     MIB.addImm(Format);
3480
3481   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
3482     .addImm(HasVIndex ? -1 : 0)   // idxen(imm)
3483     .addMemOperand(MMO);
3484
3485   if (LoadDstReg != Dst) {
3486     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3487
3488     // Widen result for extending loads.
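    // e.g. a sub-dword (s8/s16) non-format load is performed as an s32 load
    // and truncated below, while an unpacked-d16 <4 x s16> format load is
    // performed as <4 x s32> and repacked element by element.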
3489 if (IsExtLoad) 3490 B.buildTrunc(Dst, LoadDstReg); 3491 else { 3492 // Repack to original 16-bit vector result 3493 // FIXME: G_TRUNC should work, but legalization currently fails 3494 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3495 SmallVector<Register, 4> Repack; 3496 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3497 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3498 B.buildMerge(Dst, Repack); 3499 } 3500 } 3501 3502 MI.eraseFromParent(); 3503 return true; 3504 } 3505 3506 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3507 MachineIRBuilder &B, 3508 bool IsInc) const { 3509 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3510 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3511 B.buildInstr(Opc) 3512 .addDef(MI.getOperand(0).getReg()) 3513 .addUse(MI.getOperand(2).getReg()) 3514 .addUse(MI.getOperand(3).getReg()) 3515 .cloneMemRefs(MI); 3516 MI.eraseFromParent(); 3517 return true; 3518 } 3519 3520 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3521 switch (IntrID) { 3522 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3523 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3524 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3525 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3526 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3527 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3528 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3529 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3530 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3531 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3532 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3533 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3534 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3535 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3536 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3537 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3538 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3539 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3540 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3541 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3542 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3543 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3544 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3545 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3546 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3547 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3548 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3549 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3550 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3551 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3552 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3553 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3554 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3555 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3556 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3557 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3558 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3559 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3560 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3561 default: 3562 llvm_unreachable("unhandled atomic opcode"); 3563 } 3564 } 3565 3566 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3567 MachineIRBuilder &B, 3568 Intrinsic::ID IID) const { 3569 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3570 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3571 3572 Register Dst = MI.getOperand(0).getReg(); 3573 Register VData = MI.getOperand(2).getReg(); 3574 3575 Register CmpVal; 
3576 int OpOffset = 0; 3577 3578 if (IsCmpSwap) { 3579 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3580 ++OpOffset; 3581 } 3582 3583 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3584 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3585 3586 // The struct intrinsic variants add one additional operand over raw. 3587 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3588 Register VIndex; 3589 if (HasVIndex) { 3590 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3591 ++OpOffset; 3592 } 3593 3594 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3595 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3596 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3597 3598 MachineMemOperand *MMO = *MI.memoperands_begin(); 3599 3600 unsigned ImmOffset; 3601 unsigned TotalOffset; 3602 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3603 if (TotalOffset != 0) 3604 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3605 3606 if (!VIndex) 3607 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3608 3609 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3610 .addDef(Dst) 3611 .addUse(VData); // vdata 3612 3613 if (IsCmpSwap) 3614 MIB.addReg(CmpVal); 3615 3616 MIB.addUse(RSrc) // rsrc 3617 .addUse(VIndex) // vindex 3618 .addUse(VOffset) // voffset 3619 .addUse(SOffset) // soffset 3620 .addImm(ImmOffset) // offset(imm) 3621 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3622 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3623 .addMemOperand(MMO); 3624 3625 MI.eraseFromParent(); 3626 return true; 3627 } 3628 3629 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3630 /// vector with s16 typed elements. 3631 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3632 SmallVectorImpl<Register> &PackedAddrs, 3633 int AddrIdx, int DimIdx, int EndIdx, 3634 int NumGradients) { 3635 const LLT S16 = LLT::scalar(16); 3636 const LLT V2S16 = LLT::vector(2, 16); 3637 3638 for (int I = AddrIdx; I < EndIdx; ++I) { 3639 MachineOperand &SrcOp = MI.getOperand(I); 3640 if (!SrcOp.isReg()) 3641 continue; // _L to _LZ may have eliminated this. 3642 3643 Register AddrReg = SrcOp.getReg(); 3644 3645 if (I < DimIdx) { 3646 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3647 PackedAddrs.push_back(AddrReg); 3648 } else { 3649 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3650 // derivatives dx/dh and dx/dv are packed with undef. 3651 if (((I + 1) >= EndIdx) || 3652 ((NumGradients / 2) % 2 == 1 && 3653 (I == DimIdx + (NumGradients / 2) - 1 || 3654 I == DimIdx + NumGradients - 1)) || 3655 // Check for _L to _LZ optimization 3656 !MI.getOperand(I + 1).isReg()) { 3657 PackedAddrs.push_back( 3658 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3659 .getReg(0)); 3660 } else { 3661 PackedAddrs.push_back( 3662 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3663 .getReg(0)); 3664 ++I; 3665 } 3666 } 3667 } 3668 } 3669 3670 /// Convert from separate vaddr components to a single vector address register, 3671 /// and replace the remaining operands with $noreg. 
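/// For example, five s32 address components become a single <8 x s32>
/// G_BUILD_VECTOR (padded with three undef elements), and the now-redundant
/// trailing vaddr operands of the intrinsic are replaced with $noreg.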
3672 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3673                                      int DimIdx, int NumVAddrs) {
3674   const LLT S32 = LLT::scalar(32);
3675
3676   SmallVector<Register, 8> AddrRegs;
3677   for (int I = 0; I != NumVAddrs; ++I) {
3678     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3679     if (SrcOp.isReg()) {
3680       AddrRegs.push_back(SrcOp.getReg());
3681       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3682     }
3683   }
3684
3685   int NumAddrRegs = AddrRegs.size();
3686   if (NumAddrRegs != 1) {
3687     // Round up to 8 elements for v5-v7
3688     // FIXME: Missing intermediate sized register classes and instructions.
3689     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3690       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3691       auto Undef = B.buildUndef(S32);
3692       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3693       NumAddrRegs = RoundedNumRegs;
3694     }
3695
3696     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3697     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3698   }
3699
3700   for (int I = 1; I != NumVAddrs; ++I) {
3701     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3702     if (SrcOp.isReg())
3703       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3704   }
3705 }
3706
3707 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3708 ///
3709 /// Depending on the subtarget, load/store with 16-bit element data need to be
3710 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3711 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3712 /// registers.
3713 ///
3714 /// We don't want to directly select image instructions just yet, but also want
3715 /// to expose all register repacking to the legalizer/combiners. We also don't
3716 /// want a selected instruction entering RegBankSelect. In order to avoid
3717 /// defining a multitude of intermediate image instructions, directly hack on
3718 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3719 /// padding now unnecessary arguments with $noreg.
3720 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3721     MachineInstr &MI, MachineIRBuilder &B,
3722     GISelChangeObserver &Observer,
3723     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3724
3725   const int NumDefs = MI.getNumExplicitDefs();
3726   bool IsTFE = NumDefs == 2;
3727   // We are only processing the operands of d16 image operations on subtargets
3728   // that use the unpacked register layout, or need to repack the TFE result.
3729
3730   // TODO: Do we need to guard against already legalized intrinsics?
3731   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3732       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3733
3734   MachineRegisterInfo *MRI = B.getMRI();
3735   const LLT S32 = LLT::scalar(32);
3736   const LLT S16 = LLT::scalar(16);
3737   const LLT V2S16 = LLT::vector(2, 16);
3738
3739   // Index of first address argument
3740   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3741
3742   int NumVAddrs, NumGradients;
3743   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3744   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3745       getDMaskIdx(BaseOpcode, NumDefs);
3746   unsigned DMask = 0;
3747
3748   // Check for 16 bit addresses and pack if true.
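  // The gradient type is sampled at DimIdx and the coordinate type at
  // DimIdx + NumGradients: a16 requires 16-bit coordinates (and, as checked
  // below, 16-bit gradients as well), while g16 only requires 16-bit
  // derivatives.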
3749   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3750   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3751   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3752   const bool IsG16 = GradTy == S16;
3753   const bool IsA16 = AddrTy == S16;
3754
3755   int DMaskLanes = 0;
3756   if (!BaseOpcode->Atomic) {
3757     DMask = MI.getOperand(DMaskIdx).getImm();
3758     if (BaseOpcode->Gather4) {
3759       DMaskLanes = 4;
3760     } else if (DMask != 0) {
3761       DMaskLanes = countPopulation(DMask);
3762     } else if (!IsTFE && !BaseOpcode->Store) {
3763       // If dmask is 0, this is a no-op load. This can be eliminated.
3764       B.buildUndef(MI.getOperand(0));
3765       MI.eraseFromParent();
3766       return true;
3767     }
3768   }
3769
3770   Observer.changingInstr(MI);
3771   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3772
3773   unsigned NewOpcode = NumDefs == 0 ?
3774     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3775
3776   // Track that we legalized this
3777   MI.setDesc(B.getTII().get(NewOpcode));
3778
3779   // Expecting to get an error flag since TFC is on and dmask is 0. Force
3780   // dmask to be at least 1, otherwise the instruction will fail.
3781   if (IsTFE && DMask == 0) {
3782     DMask = 0x1;
3783     DMaskLanes = 1;
3784     MI.getOperand(DMaskIdx).setImm(DMask);
3785   }
3786
3787   if (BaseOpcode->Atomic) {
3788     Register VData0 = MI.getOperand(2).getReg();
3789     LLT Ty = MRI->getType(VData0);
3790
3791     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3792     if (Ty.isVector())
3793       return false;
3794
3795     if (BaseOpcode->AtomicX2) {
3796       Register VData1 = MI.getOperand(3).getReg();
3797       // The two values are packed in one register.
3798       LLT PackedTy = LLT::vector(2, Ty);
3799       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3800       MI.getOperand(2).setReg(Concat.getReg(0));
3801       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3802     }
3803   }
3804
3805   int CorrectedNumVAddrs = NumVAddrs;
3806
3807   // Optimize _L to _LZ when _L is zero
3808   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3809       AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3810     const ConstantFP *ConstantLod;
3811     const int LodIdx = AddrIdx + NumVAddrs - 1;
3812
3813     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3814       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3815         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3816         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3817           LZMappingInfo->LZ, ImageDimIntr->Dim);
3818
3819         // The starting indexes should remain in the same place.
3820         --NumVAddrs;
3821         --CorrectedNumVAddrs;
3822
3823         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3824           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3825         MI.RemoveOperand(LodIdx);
3826       }
3827     }
3828   }
3829
3830   // Optimize _mip away, when 'lod' is zero
3831   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3832     int64_t ConstantLod;
3833     const int LodIdx = AddrIdx + NumVAddrs - 1;
3834
3835     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3836       if (ConstantLod == 0) {
3837         // TODO: Change intrinsic opcode and remove operand instead of
3838         // replacing it with 0, as the _L to _LZ handling is done above.
3839         MI.getOperand(LodIdx).ChangeToImmediate(0);
3840         --CorrectedNumVAddrs;
3841       }
3842     }
3843   }
3844
3845   // Rewrite the addressing register layout before doing anything else.
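  // For example, with a16 the s16 coordinates of a 2D sample are packed
  // pairwise by packImageA16AddressToDwords, roughly:
  //   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
  // with an odd trailing coordinate paired with an undef half instead.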
3846 if (IsA16 || IsG16) { 3847 if (IsA16) { 3848 // Target must support the feature and gradients need to be 16 bit too 3849 if (!ST.hasA16() || !IsG16) 3850 return false; 3851 } else if (!ST.hasG16()) 3852 return false; 3853 3854 if (NumVAddrs > 1) { 3855 SmallVector<Register, 4> PackedRegs; 3856 // Don't compress addresses for G16 3857 const int PackEndIdx = 3858 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3859 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3860 PackEndIdx, NumGradients); 3861 3862 if (!IsA16) { 3863 // Add uncompressed address 3864 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3865 int AddrReg = MI.getOperand(I).getReg(); 3866 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3867 PackedRegs.push_back(AddrReg); 3868 } 3869 } 3870 3871 // See also below in the non-a16 branch 3872 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3873 3874 if (!UseNSA && PackedRegs.size() > 1) { 3875 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3876 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3877 PackedRegs[0] = Concat.getReg(0); 3878 PackedRegs.resize(1); 3879 } 3880 3881 const int NumPacked = PackedRegs.size(); 3882 for (int I = 0; I != NumVAddrs; ++I) { 3883 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3884 if (!SrcOp.isReg()) { 3885 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3886 continue; 3887 } 3888 3889 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3890 3891 if (I < NumPacked) 3892 SrcOp.setReg(PackedRegs[I]); 3893 else 3894 SrcOp.setReg(AMDGPU::NoRegister); 3895 } 3896 } 3897 } else { 3898 // If the register allocator cannot place the address registers contiguously 3899 // without introducing moves, then using the non-sequential address encoding 3900 // is always preferable, since it saves VALU instructions and is usually a 3901 // wash in terms of code size or even better. 3902 // 3903 // However, we currently have no way of hinting to the register allocator 3904 // that MIMG addresses should be placed contiguously when it is possible to 3905 // do so, so force non-NSA for the common 2-address case as a heuristic. 3906 // 3907 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3908 // allocation when possible. 3909 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3910 3911 if (!UseNSA && NumVAddrs > 1) 3912 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3913 } 3914 3915 int Flags = 0; 3916 if (IsA16) 3917 Flags |= 1; 3918 if (IsG16) 3919 Flags |= 2; 3920 MI.addOperand(MachineOperand::CreateImm(Flags)); 3921 3922 if (BaseOpcode->Store) { // No TFE for stores? 3923 // TODO: Handle dmask trim 3924 Register VData = MI.getOperand(1).getReg(); 3925 LLT Ty = MRI->getType(VData); 3926 if (!Ty.isVector() || Ty.getElementType() != S16) 3927 return true; 3928 3929 Register RepackedReg = handleD16VData(B, *MRI, VData); 3930 if (RepackedReg != VData) { 3931 MI.getOperand(1).setReg(RepackedReg); 3932 } 3933 3934 return true; 3935 } 3936 3937 Register DstReg = MI.getOperand(0).getReg(); 3938 LLT Ty = MRI->getType(DstReg); 3939 const LLT EltTy = Ty.getScalarType(); 3940 const bool IsD16 = Ty.getScalarType() == S16; 3941 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
3942
3943   // Confirm that the return type is large enough for the dmask specified
3944   if (NumElts < DMaskLanes)
3945     return false;
3946
3947   if (NumElts > 4 || DMaskLanes > 4)
3948     return false;
3949
3950   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3951   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3952
3953   // The raw dword aligned data component of the load. The only legal cases
3954   // where this matters should be when using the packed D16 format, for
3955   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3956   LLT RoundedTy;
3957
3958   // S32 vector to cover all data, plus TFE result element.
3959   LLT TFETy;
3960
3961   // Register type to use for each loaded component. Will be S32 or V2S16.
3962   LLT RegTy;
3963
3964   if (IsD16 && ST.hasUnpackedD16VMem()) {
3965     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3966     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3967     RegTy = S32;
3968   } else {
3969     unsigned EltSize = EltTy.getSizeInBits();
3970     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3971     unsigned RoundedSize = 32 * RoundedElts;
3972     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3973     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3974     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3975   }
3976
3977   // The return type does not need adjustment.
3978   // TODO: Should we change s16 case to s32 or <2 x s16>?
3979   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3980     return true;
3981
3982   Register Dst1Reg;
3983
3984   // Insert after the instruction.
3985   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3986
3987   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3988   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3989   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3990   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3991
3992   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3993
3994   MI.getOperand(0).setReg(NewResultReg);
3995
3996   // In the IR, TFE is supposed to be used with a 2 element struct return
3997   // type. The instruction really returns these two values in one contiguous
3998   // register, with one additional dword beyond the loaded data. Rewrite the
3999   // return type to use a single register result.
4000
4001   if (IsTFE) {
4002     Dst1Reg = MI.getOperand(1).getReg();
4003     if (MRI->getType(Dst1Reg) != S32)
4004       return false;
4005
4006     // TODO: Make sure the TFE operand bit is set.
4007     MI.RemoveOperand(1);
4008
4009     // Handle the easy case that requires no repack instructions.
4010     if (Ty == S32) {
4011       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4012       return true;
4013     }
4014   }
4015
4016   // Now figure out how to copy the new result register back into the old
4017   // result.
4018   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4019
4020   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4021
4022   if (ResultNumRegs == 1) {
4023     assert(!IsTFE);
4024     ResultRegs[0] = NewResultReg;
4025   } else {
4026     // We have to repack into a new vector of some kind.
4027     for (int I = 0; I != NumDataRegs; ++I)
4028       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4029     B.buildUnmerge(ResultRegs, NewResultReg);
4030
4031     // Drop the final TFE element to get the data part. The TFE result is
4032     // directly written to the right place already.
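    // e.g. a TFE d16 <4 x s16> load on a packed subtarget unmerges the
    // <3 x s32> raw result into two s32 data registers plus the s32 status
    // register; the data registers are bitcast back to <2 x s16> further
    // below.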
4033 if (IsTFE) 4034 ResultRegs.resize(NumDataRegs); 4035 } 4036 4037 // For an s16 scalar result, we form an s32 result with a truncate regardless 4038 // of packed vs. unpacked. 4039 if (IsD16 && !Ty.isVector()) { 4040 B.buildTrunc(DstReg, ResultRegs[0]); 4041 return true; 4042 } 4043 4044 // Avoid a build/concat_vector of 1 entry. 4045 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4046 B.buildBitcast(DstReg, ResultRegs[0]); 4047 return true; 4048 } 4049 4050 assert(Ty.isVector()); 4051 4052 if (IsD16) { 4053 // For packed D16 results with TFE enabled, all the data components are 4054 // S32. Cast back to the expected type. 4055 // 4056 // TODO: We don't really need to use load s32 elements. We would only need one 4057 // cast for the TFE result if a multiple of v2s16 was used. 4058 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4059 for (Register &Reg : ResultRegs) 4060 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4061 } else if (ST.hasUnpackedD16VMem()) { 4062 for (Register &Reg : ResultRegs) 4063 Reg = B.buildTrunc(S16, Reg).getReg(0); 4064 } 4065 } 4066 4067 auto padWithUndef = [&](LLT Ty, int NumElts) { 4068 if (NumElts == 0) 4069 return; 4070 Register Undef = B.buildUndef(Ty).getReg(0); 4071 for (int I = 0; I != NumElts; ++I) 4072 ResultRegs.push_back(Undef); 4073 }; 4074 4075 // Pad out any elements eliminated due to the dmask. 4076 LLT ResTy = MRI->getType(ResultRegs[0]); 4077 if (!ResTy.isVector()) { 4078 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4079 B.buildBuildVector(DstReg, ResultRegs); 4080 return true; 4081 } 4082 4083 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4084 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4085 4086 // Deal with the one annoying legal case. 4087 const LLT V3S16 = LLT::vector(3, 16); 4088 if (Ty == V3S16) { 4089 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4090 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4091 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4092 return true; 4093 } 4094 4095 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4096 B.buildConcatVectors(DstReg, ResultRegs); 4097 return true; 4098 } 4099 4100 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4101 LegalizerHelper &Helper, MachineInstr &MI) const { 4102 MachineIRBuilder &B = Helper.MIRBuilder; 4103 GISelChangeObserver &Observer = Helper.Observer; 4104 4105 Register Dst = MI.getOperand(0).getReg(); 4106 LLT Ty = B.getMRI()->getType(Dst); 4107 unsigned Size = Ty.getSizeInBits(); 4108 MachineFunction &MF = B.getMF(); 4109 4110 Observer.changingInstr(MI); 4111 4112 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4113 Ty = getBitcastRegisterType(Ty); 4114 Helper.bitcastDst(MI, Ty, 0); 4115 Dst = MI.getOperand(0).getReg(); 4116 B.setInsertPt(B.getMBB(), MI); 4117 } 4118 4119 // FIXME: We don't really need this intermediate instruction. The intrinsic 4120 // should be fixed to have a memory operand. Since it's readnone, we're not 4121 // allowed to add one. 4122 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4123 MI.RemoveOperand(1); // Remove intrinsic ID 4124 4125 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4126 // TODO: Should this use datalayout alignment? 
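  // e.g. an s_buffer_load returning <3 x s32> gets a 12-byte, 4-aligned MMO
  // here and is then widened to <4 x s32> below, since there is no 96-bit
  // scalar load result.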
4127 const unsigned MemSize = (Size + 7) / 8; 4128 const Align MemAlign(4); 4129 MachineMemOperand *MMO = MF.getMachineMemOperand( 4130 MachinePointerInfo(), 4131 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4132 MachineMemOperand::MOInvariant, 4133 MemSize, MemAlign); 4134 MI.addMemOperand(MF, MMO); 4135 4136 // There are no 96-bit result scalar loads, but widening to 128-bit should 4137 // always be legal. We may need to restore this to a 96-bit result if it turns 4138 // out this needs to be converted to a vector load during RegBankSelect. 4139 if (!isPowerOf2_32(Size)) { 4140 if (Ty.isVector()) 4141 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4142 else 4143 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4144 } 4145 4146 Observer.changedInstr(MI); 4147 return true; 4148 } 4149 4150 // TODO: Move to selection 4151 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4152 MachineRegisterInfo &MRI, 4153 MachineIRBuilder &B) const { 4154 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4155 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4156 !ST.isTrapHandlerEnabled()) { 4157 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4158 } else { 4159 // Pass queue pointer to trap handler as input, and insert trap instruction 4160 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4161 MachineRegisterInfo &MRI = *B.getMRI(); 4162 4163 Register LiveIn = 4164 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4165 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4166 return false; 4167 4168 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4169 B.buildCopy(SGPR01, LiveIn); 4170 B.buildInstr(AMDGPU::S_TRAP) 4171 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4172 .addReg(SGPR01, RegState::Implicit); 4173 } 4174 4175 MI.eraseFromParent(); 4176 return true; 4177 } 4178 4179 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4180 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4181 // Is non-HSA path or trap-handler disabled? then, report a warning 4182 // accordingly 4183 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4184 !ST.isTrapHandlerEnabled()) { 4185 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4186 "debugtrap handler not supported", 4187 MI.getDebugLoc(), DS_Warning); 4188 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4189 Ctx.diagnose(NoTrap); 4190 } else { 4191 // Insert debug-trap instruction 4192 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4193 } 4194 4195 MI.eraseFromParent(); 4196 return true; 4197 } 4198 4199 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4200 MachineInstr &MI) const { 4201 MachineIRBuilder &B = Helper.MIRBuilder; 4202 MachineRegisterInfo &MRI = *B.getMRI(); 4203 4204 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
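  // e.g. a G_BRCOND on the boolean result of llvm.amdgcn.if becomes an SI_IF
  // pseudo that defines the saved exec mask and takes over the branch, with
  // the original conditional target reached through a plain unconditional
  // G_BR.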
4205 auto IntrID = MI.getIntrinsicID(); 4206 switch (IntrID) { 4207 case Intrinsic::amdgcn_if: 4208 case Intrinsic::amdgcn_else: { 4209 MachineInstr *Br = nullptr; 4210 MachineBasicBlock *UncondBrTarget = nullptr; 4211 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4212 const SIRegisterInfo *TRI 4213 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4214 4215 Register Def = MI.getOperand(1).getReg(); 4216 Register Use = MI.getOperand(3).getReg(); 4217 4218 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4219 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4220 if (IntrID == Intrinsic::amdgcn_if) { 4221 B.buildInstr(AMDGPU::SI_IF) 4222 .addDef(Def) 4223 .addUse(Use) 4224 .addMBB(UncondBrTarget); 4225 } else { 4226 B.buildInstr(AMDGPU::SI_ELSE) 4227 .addDef(Def) 4228 .addUse(Use) 4229 .addMBB(UncondBrTarget) 4230 .addImm(0); 4231 } 4232 4233 if (Br) { 4234 Br->getOperand(0).setMBB(CondBrTarget); 4235 } else { 4236 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4237 // since we're swapping branch targets it needs to be reinserted. 4238 // FIXME: IRTranslator should probably not do this 4239 B.buildBr(*CondBrTarget); 4240 } 4241 4242 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4243 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4244 MI.eraseFromParent(); 4245 BrCond->eraseFromParent(); 4246 return true; 4247 } 4248 4249 return false; 4250 } 4251 case Intrinsic::amdgcn_loop: { 4252 MachineInstr *Br = nullptr; 4253 MachineBasicBlock *UncondBrTarget = nullptr; 4254 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4255 const SIRegisterInfo *TRI 4256 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4257 4258 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4259 Register Reg = MI.getOperand(2).getReg(); 4260 4261 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4262 B.buildInstr(AMDGPU::SI_LOOP) 4263 .addUse(Reg) 4264 .addMBB(UncondBrTarget); 4265 4266 if (Br) 4267 Br->getOperand(0).setMBB(CondBrTarget); 4268 else 4269 B.buildBr(*CondBrTarget); 4270 4271 MI.eraseFromParent(); 4272 BrCond->eraseFromParent(); 4273 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4274 return true; 4275 } 4276 4277 return false; 4278 } 4279 case Intrinsic::amdgcn_kernarg_segment_ptr: 4280 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4281 // This only makes sense to call in a kernel, so just lower to null. 
4282 B.buildConstant(MI.getOperand(0).getReg(), 0); 4283 MI.eraseFromParent(); 4284 return true; 4285 } 4286 4287 return legalizePreloadedArgIntrin( 4288 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4289 case Intrinsic::amdgcn_implicitarg_ptr: 4290 return legalizeImplicitArgPtr(MI, MRI, B); 4291 case Intrinsic::amdgcn_workitem_id_x: 4292 return legalizePreloadedArgIntrin(MI, MRI, B, 4293 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4294 case Intrinsic::amdgcn_workitem_id_y: 4295 return legalizePreloadedArgIntrin(MI, MRI, B, 4296 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4297 case Intrinsic::amdgcn_workitem_id_z: 4298 return legalizePreloadedArgIntrin(MI, MRI, B, 4299 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4300 case Intrinsic::amdgcn_workgroup_id_x: 4301 return legalizePreloadedArgIntrin(MI, MRI, B, 4302 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4303 case Intrinsic::amdgcn_workgroup_id_y: 4304 return legalizePreloadedArgIntrin(MI, MRI, B, 4305 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4306 case Intrinsic::amdgcn_workgroup_id_z: 4307 return legalizePreloadedArgIntrin(MI, MRI, B, 4308 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4309 case Intrinsic::amdgcn_dispatch_ptr: 4310 return legalizePreloadedArgIntrin(MI, MRI, B, 4311 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4312 case Intrinsic::amdgcn_queue_ptr: 4313 return legalizePreloadedArgIntrin(MI, MRI, B, 4314 AMDGPUFunctionArgInfo::QUEUE_PTR); 4315 case Intrinsic::amdgcn_implicit_buffer_ptr: 4316 return legalizePreloadedArgIntrin( 4317 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4318 case Intrinsic::amdgcn_dispatch_id: 4319 return legalizePreloadedArgIntrin(MI, MRI, B, 4320 AMDGPUFunctionArgInfo::DISPATCH_ID); 4321 case Intrinsic::amdgcn_fdiv_fast: 4322 return legalizeFDIVFastIntrin(MI, MRI, B); 4323 case Intrinsic::amdgcn_is_shared: 4324 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4325 case Intrinsic::amdgcn_is_private: 4326 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4327 case Intrinsic::amdgcn_wavefrontsize: { 4328 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4329 MI.eraseFromParent(); 4330 return true; 4331 } 4332 case Intrinsic::amdgcn_s_buffer_load: 4333 return legalizeSBufferLoad(Helper, MI); 4334 case Intrinsic::amdgcn_raw_buffer_store: 4335 case Intrinsic::amdgcn_struct_buffer_store: 4336 return legalizeBufferStore(MI, MRI, B, false, false); 4337 case Intrinsic::amdgcn_raw_buffer_store_format: 4338 case Intrinsic::amdgcn_struct_buffer_store_format: 4339 return legalizeBufferStore(MI, MRI, B, false, true); 4340 case Intrinsic::amdgcn_raw_tbuffer_store: 4341 case Intrinsic::amdgcn_struct_tbuffer_store: 4342 return legalizeBufferStore(MI, MRI, B, true, true); 4343 case Intrinsic::amdgcn_raw_buffer_load: 4344 case Intrinsic::amdgcn_struct_buffer_load: 4345 return legalizeBufferLoad(MI, MRI, B, false, false); 4346 case Intrinsic::amdgcn_raw_buffer_load_format: 4347 case Intrinsic::amdgcn_struct_buffer_load_format: 4348 return legalizeBufferLoad(MI, MRI, B, true, false); 4349 case Intrinsic::amdgcn_raw_tbuffer_load: 4350 case Intrinsic::amdgcn_struct_tbuffer_load: 4351 return legalizeBufferLoad(MI, MRI, B, true, true); 4352 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4353 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4354 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4355 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4356 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4357 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4358 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 
4359 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4360 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4361 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4362 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4363 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4364 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4365 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4366 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4367 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4368 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4369 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4370 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4371 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4372 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4373 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4374 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4375 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4376 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4377 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4378 return legalizeBufferAtomic(MI, B, IntrID); 4379 case Intrinsic::amdgcn_atomic_inc: 4380 return legalizeAtomicIncDec(MI, B, true); 4381 case Intrinsic::amdgcn_atomic_dec: 4382 return legalizeAtomicIncDec(MI, B, false); 4383 case Intrinsic::trap: 4384 return legalizeTrapIntrinsic(MI, MRI, B); 4385 case Intrinsic::debugtrap: 4386 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4387 default: { 4388 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4389 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4390 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4391 return true; 4392 } 4393 } 4394 4395 return true; 4396 } 4397