//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality( 41 "amdgpu-global-isel-new-legality", 42 cl::desc("Use GlobalISel desired legality, rather than try to use" 43 "rules compatible with selection patterns"), 44 cl::init(false), 45 cl::ReallyHidden); 46 47 static constexpr unsigned MaxRegisterSize = 1024; 48 49 // Round the number of elements to the next power of two elements 50 static LLT getPow2VectorType(LLT Ty) { 51 unsigned NElts = Ty.getNumElements(); 52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 53 return Ty.changeNumElements(Pow2NElts); 54 } 55 56 // Round the number of bits to the next power of two bits 57 static LLT getPow2ScalarType(LLT Ty) { 58 unsigned Bits = Ty.getSizeInBits(); 59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 60 return LLT::scalar(Pow2Bits); 61 } 62 63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 64 return [=](const LegalityQuery &Query) { 65 const LLT Ty = Query.Types[TypeIdx]; 66 return Ty.isVector() && 67 Ty.getNumElements() % 2 != 0 && 68 Ty.getElementType().getSizeInBits() < 32 && 69 Ty.getSizeInBits() % 32 != 0; 70 }; 71 } 72 73 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 74 return [=](const LegalityQuery &Query) { 75 const LLT Ty = Query.Types[TypeIdx]; 76 const LLT EltTy = Ty.getScalarType(); 77 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 78 }; 79 } 80 81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 82 return [=](const LegalityQuery &Query) { 83 const LLT Ty = Query.Types[TypeIdx]; 84 const LLT EltTy = Ty.getElementType(); 85 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 86 }; 87 } 88 89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 90 return [=](const LegalityQuery &Query) { 91 const LLT Ty = Query.Types[TypeIdx]; 92 const LLT EltTy = Ty.getElementType(); 93 unsigned Size = Ty.getSizeInBits(); 94 unsigned Pieces = (Size + 63) / 64; 95 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 96 return 
std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 97 }; 98 } 99 100 // Increase the number of vector elements to reach the next multiple of 32-bit 101 // type. 102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 103 return [=](const LegalityQuery &Query) { 104 const LLT Ty = Query.Types[TypeIdx]; 105 106 const LLT EltTy = Ty.getElementType(); 107 const int Size = Ty.getSizeInBits(); 108 const int EltSize = EltTy.getSizeInBits(); 109 const int NextMul32 = (Size + 31) / 32; 110 111 assert(EltSize < 32); 112 113 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 114 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 115 }; 116 } 117 118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 119 return [=](const LegalityQuery &Query) { 120 const LLT Ty = Query.Types[TypeIdx]; 121 unsigned Size = Ty.getSizeInBits(); 122 123 LLT CoercedTy; 124 if (Size <= 32) { 125 // <2 x s8> -> s16 126 // <4 x s8> -> s32 127 CoercedTy = LLT::scalar(Size); 128 } else 129 CoercedTy = LLT::scalarOrVector(Size / 32, 32); 130 131 return std::make_pair(TypeIdx, CoercedTy); 132 }; 133 } 134 135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 136 return [=](const LegalityQuery &Query) { 137 const LLT QueryTy = Query.Types[TypeIdx]; 138 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 139 }; 140 } 141 142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 143 return [=](const LegalityQuery &Query) { 144 const LLT QueryTy = Query.Types[TypeIdx]; 145 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 146 }; 147 } 148 149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 150 return [=](const LegalityQuery &Query) { 151 const LLT QueryTy = Query.Types[TypeIdx]; 152 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 153 }; 154 } 155 156 static bool isRegisterSize(unsigned Size) { 157 return Size % 32 == 0 && Size <= 
MaxRegisterSize; 158 } 159 160 static bool isRegisterVectorElementType(LLT EltTy) { 161 const int EltSize = EltTy.getSizeInBits(); 162 return EltSize == 16 || EltSize % 32 == 0; 163 } 164 165 static bool isRegisterVectorType(LLT Ty) { 166 const int EltSize = Ty.getElementType().getSizeInBits(); 167 return EltSize == 32 || EltSize == 64 || 168 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 169 EltSize == 128 || EltSize == 256; 170 } 171 172 static bool isRegisterType(LLT Ty) { 173 if (!isRegisterSize(Ty.getSizeInBits())) 174 return false; 175 176 if (Ty.isVector()) 177 return isRegisterVectorType(Ty); 178 179 return true; 180 } 181 182 // Any combination of 32 or 64-bit elements up the maximum register size, and 183 // multiples of v2s16. 184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 
216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 
262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 282 break; 283 default: 284 return false; 285 } 286 287 assert(RegSize >= MemSize); 288 289 if (Align < MemSize) { 290 const SITargetLowering *TLI = ST.getTargetLowering(); 291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 292 return false; 293 } 294 295 return true; 296 } 297 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 299 // workaround this. Eventually it should ignore the type for loads and only care 300 // about the size. Return true in cases where we will workaround this for now by 301 // bitcasting. 302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = 
LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 380 const LLT FlatPtr = 
GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .legalIf(isPointer(0)) 419 .clampScalar(0, S32, S256) 420 .widenScalarToNextPow2(0, 32) 421 .clampMaxNumElements(0, S32, 16) 422 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 423 .scalarize(0); 424 425 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 426 // Full set of gfx9 features. 
427 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 428 .legalFor({S32, S16, V2S16}) 429 .clampScalar(0, S16, S32) 430 .clampMaxNumElements(0, S16, 2) 431 .scalarize(0) 432 .widenScalarToNextPow2(0, 32); 433 434 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 435 .legalFor({S32, S16, V2S16}) // Clamp modifier 436 .minScalar(0, S16) 437 .clampMaxNumElements(0, S16, 2) 438 .scalarize(0) 439 .widenScalarToNextPow2(0, 32) 440 .lower(); 441 } else if (ST.has16BitInsts()) { 442 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 443 .legalFor({S32, S16}) 444 .clampScalar(0, S16, S32) 445 .scalarize(0) 446 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 447 448 // Technically the saturating operations require clamp bit support, but this 449 // was introduced at the same time as 16-bit operations. 450 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 451 .legalFor({S32, S16}) // Clamp modifier 452 .minScalar(0, S16) 453 .scalarize(0) 454 .widenScalarToNextPow2(0, 16) 455 .lower(); 456 457 // We're just lowering this, but it helps get a better result to try to 458 // coerce to the desired type first. 459 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 460 .minScalar(0, S16) 461 .scalarize(0) 462 .lower(); 463 } else { 464 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 465 .legalFor({S32}) 466 .clampScalar(0, S32, S32) 467 .scalarize(0); 468 469 if (ST.hasIntClamp()) { 470 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 471 .legalFor({S32}) // Clamp modifier. 472 .scalarize(0) 473 .minScalarOrElt(0, S32) 474 .lower(); 475 } else { 476 // Clamp bit support was added in VI, along with 16-bit operations. 477 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 478 .minScalar(0, S32) 479 .scalarize(0) 480 .lower(); 481 } 482 483 // FIXME: DAG expansion gets better results. The widening uses the smaller 484 // range values and goes for the min/max lowering directly. 
485 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 486 .minScalar(0, S32) 487 .scalarize(0) 488 .lower(); 489 } 490 491 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 492 .customFor({S32, S64}) 493 .clampScalar(0, S32, S64) 494 .widenScalarToNextPow2(0, 32) 495 .scalarize(0); 496 497 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 498 .legalFor({S32}) 499 .clampScalar(0, S32, S32) 500 .scalarize(0); 501 502 // Report legal for any types we can handle anywhere. For the cases only legal 503 // on the SALU, RegBankSelect will be able to re-legalize. 504 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 505 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 506 .clampScalar(0, S32, S64) 507 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 508 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 509 .widenScalarToNextPow2(0) 510 .scalarize(0); 511 512 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 513 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 514 .legalFor({{S32, S1}, {S32, S32}}) 515 .minScalar(0, S32) 516 // TODO: .scalarize(0) 517 .lower(); 518 519 getActionDefinitionsBuilder(G_BITCAST) 520 // Don't worry about the size constraint. 521 .legalIf(all(isRegisterType(0), isRegisterType(1))) 522 .lower(); 523 524 525 getActionDefinitionsBuilder(G_CONSTANT) 526 .legalFor({S1, S32, S64, S16, GlobalPtr, 527 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 528 .legalIf(isPointer(0)) 529 .clampScalar(0, S32, S64) 530 .widenScalarToNextPow2(0); 531 532 getActionDefinitionsBuilder(G_FCONSTANT) 533 .legalFor({S32, S64, S16}) 534 .clampScalar(0, S16, S64); 535 536 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 537 .legalIf(isRegisterType(0)) 538 // s1 and s16 are special cases because they have legal operations on 539 // them, but don't really occupy registers in the normal way. 
540 .legalFor({S1, S16}) 541 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 542 .clampScalarOrElt(0, S32, MaxScalar) 543 .widenScalarToNextPow2(0, 32) 544 .clampMaxNumElements(0, S32, 16); 545 546 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 547 548 // If the amount is divergent, we have to do a wave reduction to get the 549 // maximum value, so this is expanded during RegBankSelect. 550 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 551 .legalFor({{PrivatePtr, S32}}); 552 553 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 554 .customIf(typeIsNot(0, PrivatePtr)); 555 556 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 557 558 auto &FPOpActions = getActionDefinitionsBuilder( 559 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 560 .legalFor({S32, S64}); 561 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 562 .customFor({S32, S64}); 563 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 564 .customFor({S32, S64}); 565 566 if (ST.has16BitInsts()) { 567 if (ST.hasVOP3PInsts()) 568 FPOpActions.legalFor({S16, V2S16}); 569 else 570 FPOpActions.legalFor({S16}); 571 572 TrigActions.customFor({S16}); 573 FDIVActions.customFor({S16}); 574 } 575 576 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 577 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 578 579 if (ST.hasVOP3PInsts()) { 580 MinNumMaxNum.customFor(FPTypesPK16) 581 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 582 .clampMaxNumElements(0, S16, 2) 583 .clampScalar(0, S16, S64) 584 .scalarize(0); 585 } else if (ST.has16BitInsts()) { 586 MinNumMaxNum.customFor(FPTypes16) 587 .clampScalar(0, S16, S64) 588 .scalarize(0); 589 } else { 590 MinNumMaxNum.customFor(FPTypesBase) 591 .clampScalar(0, S32, S64) 592 .scalarize(0); 593 } 594 595 if (ST.hasVOP3PInsts()) 596 FPOpActions.clampMaxNumElements(0, S16, 2); 597 598 FPOpActions 599 .scalarize(0) 600 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 601 602 TrigActions 603 .scalarize(0) 604 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 605 606 FDIVActions 607 .scalarize(0) 608 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 609 610 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 611 .legalFor(FPTypesPK16) 612 .clampMaxNumElements(0, S16, 2) 613 .scalarize(0) 614 .clampScalar(0, S16, S64); 615 616 if (ST.has16BitInsts()) { 617 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 618 .legalFor({S32, S64, S16}) 619 .scalarize(0) 620 .clampScalar(0, S16, S64); 621 } else { 622 getActionDefinitionsBuilder(G_FSQRT) 623 .legalFor({S32, S64}) 624 .scalarize(0) 625 .clampScalar(0, S32, S64); 626 627 if (ST.hasFractBug()) { 628 getActionDefinitionsBuilder(G_FFLOOR) 629 .customFor({S64}) 630 .legalFor({S32, S64}) 631 .scalarize(0) 632 .clampScalar(0, S32, S64); 633 } else { 634 getActionDefinitionsBuilder(G_FFLOOR) 635 .legalFor({S32, S64}) 636 .scalarize(0) 637 .clampScalar(0, S32, S64); 638 } 639 } 640 641 getActionDefinitionsBuilder(G_FPTRUNC) 642 .legalFor({{S32, S64}, {S16, S32}}) 643 .scalarize(0) 644 .lower(); 645 646 getActionDefinitionsBuilder(G_FPEXT) 647 .legalFor({{S64, S32}, {S32, S16}}) 648 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 649 .scalarize(0); 650 651 getActionDefinitionsBuilder(G_FSUB) 652 // Use actual fsub instruction 653 .legalFor({S32}) 654 // Must use fadd + fneg 655 .lowerFor({S64, S16, V2S16}) 656 .scalarize(0) 657 .clampScalar(0, S32, S64); 658 659 // Whether this is legal depends on the floating point mode for the function. 660 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 661 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 662 FMad.customFor({S32, S16}); 663 else if (ST.hasMadMacF32Insts()) 664 FMad.customFor({S32}); 665 else if (ST.hasMadF16()) 666 FMad.customFor({S16}); 667 FMad.scalarize(0) 668 .lower(); 669 670 // TODO: Do we need to clamp maximum bitwidth? 
671 getActionDefinitionsBuilder(G_TRUNC) 672 .legalIf(isScalar(0)) 673 .legalFor({{V2S16, V2S32}}) 674 .clampMaxNumElements(0, S16, 2) 675 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 676 // situations (like an invalid implicit use), we don't want to infinite loop 677 // in the legalizer. 678 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 679 .alwaysLegal(); 680 681 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 682 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 683 {S32, S1}, {S64, S1}, {S16, S1}}) 684 .scalarize(0) 685 .clampScalar(0, S32, S64) 686 .widenScalarToNextPow2(1, 32); 687 688 // TODO: Split s1->s64 during regbankselect for VALU. 689 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 690 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 691 .lowerFor({{S32, S64}}) 692 .lowerIf(typeIs(1, S1)) 693 .customFor({{S64, S64}}); 694 if (ST.has16BitInsts()) 695 IToFP.legalFor({{S16, S16}}); 696 IToFP.clampScalar(1, S32, S64) 697 .minScalar(0, S32) 698 .scalarize(0) 699 .widenScalarToNextPow2(1); 700 701 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 702 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 703 .customFor({{S64, S64}}) 704 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 705 if (ST.has16BitInsts()) 706 FPToI.legalFor({{S16, S16}}); 707 else 708 FPToI.minScalar(1, S32); 709 710 FPToI.minScalar(0, S32) 711 .scalarize(0) 712 .lower(); 713 714 // Lower roundeven into G_FRINT 715 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 716 .scalarize(0) 717 .lower(); 718 719 if (ST.has16BitInsts()) { 720 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 721 .legalFor({S16, S32, S64}) 722 .clampScalar(0, S16, S64) 723 .scalarize(0); 724 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 725 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 726 .legalFor({S32, S64}) 727 .clampScalar(0, S32, S64) 728 
.scalarize(0); 729 } else { 730 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 731 .legalFor({S32}) 732 .customFor({S64}) 733 .clampScalar(0, S32, S64) 734 .scalarize(0); 735 } 736 737 getActionDefinitionsBuilder(G_PTR_ADD) 738 .legalIf(all(isPointer(0), sameSize(0, 1))) 739 .scalarize(0) 740 .scalarSameSizeAs(1, 0); 741 742 getActionDefinitionsBuilder(G_PTRMASK) 743 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 744 .scalarSameSizeAs(1, 0) 745 .scalarize(0); 746 747 auto &CmpBuilder = 748 getActionDefinitionsBuilder(G_ICMP) 749 // The compare output type differs based on the register bank of the output, 750 // so make both s1 and s32 legal. 751 // 752 // Scalar compares producing output in scc will be promoted to s32, as that 753 // is the allocatable register type that will be needed for the copy from 754 // scc. This will be promoted during RegBankSelect, and we assume something 755 // before that won't try to use s32 result types. 756 // 757 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 758 // bank. 759 .legalForCartesianProduct( 760 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 761 .legalForCartesianProduct( 762 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 763 if (ST.has16BitInsts()) { 764 CmpBuilder.legalFor({{S1, S16}}); 765 } 766 767 CmpBuilder 768 .widenScalarToNextPow2(1) 769 .clampScalar(1, S32, S64) 770 .scalarize(0) 771 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 772 773 getActionDefinitionsBuilder(G_FCMP) 774 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 775 .widenScalarToNextPow2(1) 776 .clampScalar(1, S32, S64) 777 .scalarize(0); 778 779 // FIXME: fpow has a selection pattern that should move to custom lowering. 
780 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 781 if (ST.has16BitInsts()) 782 Exp2Ops.legalFor({S32, S16}); 783 else 784 Exp2Ops.legalFor({S32}); 785 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 786 Exp2Ops.scalarize(0); 787 788 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 789 if (ST.has16BitInsts()) 790 ExpOps.customFor({{S32}, {S16}}); 791 else 792 ExpOps.customFor({S32}); 793 ExpOps.clampScalar(0, MinScalarFPTy, S32) 794 .scalarize(0); 795 796 getActionDefinitionsBuilder(G_FPOWI) 797 .clampScalar(0, MinScalarFPTy, S32) 798 .lower(); 799 800 // The 64-bit versions produce 32-bit results, but only on the SALU. 801 getActionDefinitionsBuilder(G_CTPOP) 802 .legalFor({{S32, S32}, {S32, S64}}) 803 .clampScalar(0, S32, S32) 804 .clampScalar(1, S32, S64) 805 .scalarize(0) 806 .widenScalarToNextPow2(0, 32) 807 .widenScalarToNextPow2(1, 32); 808 809 // The hardware instructions return a different result on 0 than the generic 810 // instructions expect. The hardware produces -1, but these produce the 811 // bitwidth. 812 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 813 .scalarize(0) 814 .clampScalar(0, S32, S32) 815 .clampScalar(1, S32, S64) 816 .widenScalarToNextPow2(0, 32) 817 .widenScalarToNextPow2(1, 32) 818 .lower(); 819 820 // The 64-bit versions produce 32-bit results, but only on the SALU. 821 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 822 .legalFor({{S32, S32}, {S32, S64}}) 823 .clampScalar(0, S32, S32) 824 .clampScalar(1, S32, S64) 825 .scalarize(0) 826 .widenScalarToNextPow2(0, 32) 827 .widenScalarToNextPow2(1, 32); 828 829 getActionDefinitionsBuilder(G_BITREVERSE) 830 .legalFor({S32}) 831 .clampScalar(0, S32, S32) 832 .scalarize(0); 833 834 if (ST.has16BitInsts()) { 835 getActionDefinitionsBuilder(G_BSWAP) 836 .legalFor({S16, S32, V2S16}) 837 .clampMaxNumElements(0, S16, 2) 838 // FIXME: Fixing non-power-of-2 before clamp is workaround for 839 // narrowScalar limitation. 
840 .widenScalarToNextPow2(0) 841 .clampScalar(0, S16, S32) 842 .scalarize(0); 843 844 if (ST.hasVOP3PInsts()) { 845 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 846 .legalFor({S32, S16, V2S16}) 847 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 848 .clampMaxNumElements(0, S16, 2) 849 .minScalar(0, S16) 850 .widenScalarToNextPow2(0) 851 .scalarize(0) 852 .lower(); 853 } else { 854 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 855 .legalFor({S32, S16}) 856 .widenScalarToNextPow2(0) 857 .minScalar(0, S16) 858 .scalarize(0) 859 .lower(); 860 } 861 } else { 862 // TODO: Should have same legality without v_perm_b32 863 getActionDefinitionsBuilder(G_BSWAP) 864 .legalFor({S32}) 865 .lowerIf(scalarNarrowerThan(0, 32)) 866 // FIXME: Fixing non-power-of-2 before clamp is workaround for 867 // narrowScalar limitation. 868 .widenScalarToNextPow2(0) 869 .maxScalar(0, S32) 870 .scalarize(0) 871 .lower(); 872 873 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 874 .legalFor({S32}) 875 .minScalar(0, S32) 876 .widenScalarToNextPow2(0) 877 .scalarize(0) 878 .lower(); 879 } 880 881 getActionDefinitionsBuilder(G_INTTOPTR) 882 // List the common cases 883 .legalForCartesianProduct(AddrSpaces64, {S64}) 884 .legalForCartesianProduct(AddrSpaces32, {S32}) 885 .scalarize(0) 886 // Accept any address space as long as the size matches 887 .legalIf(sameSize(0, 1)) 888 .widenScalarIf(smallerThan(1, 0), 889 [](const LegalityQuery &Query) { 890 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 891 }) 892 .narrowScalarIf(largerThan(1, 0), 893 [](const LegalityQuery &Query) { 894 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 895 }); 896 897 getActionDefinitionsBuilder(G_PTRTOINT) 898 // List the common cases 899 .legalForCartesianProduct(AddrSpaces64, {S64}) 900 .legalForCartesianProduct(AddrSpaces32, {S32}) 901 .scalarize(0) 902 // Accept any address space as long as the size matches 903 
.legalIf(sameSize(0, 1)) 904 .widenScalarIf(smallerThan(0, 1), 905 [](const LegalityQuery &Query) { 906 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 907 }) 908 .narrowScalarIf( 909 largerThan(0, 1), 910 [](const LegalityQuery &Query) { 911 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 912 }); 913 914 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 915 .scalarize(0) 916 .custom(); 917 918 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 919 bool IsLoad) -> bool { 920 const LLT DstTy = Query.Types[0]; 921 922 // Split vector extloads. 923 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 924 unsigned Align = Query.MMODescrs[0].AlignInBits; 925 926 if (MemSize < DstTy.getSizeInBits()) 927 MemSize = std::max(MemSize, Align); 928 929 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 930 return true; 931 932 const LLT PtrTy = Query.Types[1]; 933 unsigned AS = PtrTy.getAddressSpace(); 934 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 935 return true; 936 937 // Catch weird sized loads that don't evenly divide into the access sizes 938 // TODO: May be able to widen depending on alignment etc. 939 unsigned NumRegs = (MemSize + 31) / 32; 940 if (NumRegs == 3) { 941 if (!ST.hasDwordx3LoadStores()) 942 return true; 943 } else { 944 // If the alignment allows, these should have been widened. 
945 if (!isPowerOf2_32(NumRegs)) 946 return true; 947 } 948 949 if (Align < MemSize) { 950 const SITargetLowering *TLI = ST.getTargetLowering(); 951 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 952 } 953 954 return false; 955 }; 956 957 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 958 unsigned Opc) -> bool { 959 unsigned Size = Query.Types[0].getSizeInBits(); 960 if (isPowerOf2_32(Size)) 961 return false; 962 963 if (Size == 96 && ST.hasDwordx3LoadStores()) 964 return false; 965 966 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 967 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 968 return false; 969 970 unsigned Align = Query.MMODescrs[0].AlignInBits; 971 unsigned RoundedSize = NextPowerOf2(Size); 972 return (Align >= RoundedSize); 973 }; 974 975 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 976 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 977 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 978 979 // TODO: Refine based on subtargets which support unaligned access or 128-bit 980 // LDS 981 // TODO: Unsupported flat for SI. 982 983 for (unsigned Op : {G_LOAD, G_STORE}) { 984 const bool IsStore = Op == G_STORE; 985 986 auto &Actions = getActionDefinitionsBuilder(Op); 987 // Explicitly list some common cases. 988 // TODO: Does this help compile time at all? 
989 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 990 {V2S32, GlobalPtr, 64, GlobalAlign32}, 991 {V4S32, GlobalPtr, 128, GlobalAlign32}, 992 {S64, GlobalPtr, 64, GlobalAlign32}, 993 {V2S64, GlobalPtr, 128, GlobalAlign32}, 994 {V2S16, GlobalPtr, 32, GlobalAlign32}, 995 {S32, GlobalPtr, 8, GlobalAlign8}, 996 {S32, GlobalPtr, 16, GlobalAlign16}, 997 998 {S32, LocalPtr, 32, 32}, 999 {S64, LocalPtr, 64, 32}, 1000 {V2S32, LocalPtr, 64, 32}, 1001 {S32, LocalPtr, 8, 8}, 1002 {S32, LocalPtr, 16, 16}, 1003 {V2S16, LocalPtr, 32, 32}, 1004 1005 {S32, PrivatePtr, 32, 32}, 1006 {S32, PrivatePtr, 8, 8}, 1007 {S32, PrivatePtr, 16, 16}, 1008 {V2S16, PrivatePtr, 32, 32}, 1009 1010 {S32, ConstantPtr, 32, GlobalAlign32}, 1011 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1012 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1013 {S64, ConstantPtr, 64, GlobalAlign32}, 1014 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1015 Actions.legalIf( 1016 [=](const LegalityQuery &Query) -> bool { 1017 return isLoadStoreLegal(ST, Query, Op); 1018 }); 1019 1020 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1021 // 64-bits. 1022 // 1023 // TODO: Should generalize bitcast action into coerce, which will also cover 1024 // inserting addrspacecasts. 1025 Actions.customIf(typeIs(1, Constant32Ptr)); 1026 1027 // Turn any illegal element vectors into something easier to deal 1028 // with. These will ultimately produce 32-bit scalar shifts to extract the 1029 // parts anyway. 1030 // 1031 // For odd 16-bit element vectors, prefer to split those into pieces with 1032 // 16-bit vector parts. 
1033 Actions.bitcastIf( 1034 [=](const LegalityQuery &Query) -> bool { 1035 const LLT Ty = Query.Types[0]; 1036 const unsigned Size = Ty.getSizeInBits(); 1037 1038 if (Size != Query.MMODescrs[0].SizeInBits) 1039 return Size <= 32 && Ty.isVector(); 1040 1041 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 1042 return true; 1043 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 1044 !isRegisterVectorElementType(Ty.getElementType()); 1045 }, bitcastToRegisterType(0)); 1046 1047 Actions 1048 .customIf(typeIs(1, Constant32Ptr)) 1049 // Widen suitably aligned loads by loading extra elements. 1050 .moreElementsIf([=](const LegalityQuery &Query) { 1051 const LLT Ty = Query.Types[0]; 1052 return Op == G_LOAD && Ty.isVector() && 1053 shouldWidenLoadResult(Query, Op); 1054 }, moreElementsToNextPow2(0)) 1055 .widenScalarIf([=](const LegalityQuery &Query) { 1056 const LLT Ty = Query.Types[0]; 1057 return Op == G_LOAD && !Ty.isVector() && 1058 shouldWidenLoadResult(Query, Op); 1059 }, widenScalarOrEltToNextPow2(0)) 1060 .narrowScalarIf( 1061 [=](const LegalityQuery &Query) -> bool { 1062 return !Query.Types[0].isVector() && 1063 needToSplitMemOp(Query, Op == G_LOAD); 1064 }, 1065 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1066 const LLT DstTy = Query.Types[0]; 1067 const LLT PtrTy = Query.Types[1]; 1068 1069 const unsigned DstSize = DstTy.getSizeInBits(); 1070 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1071 1072 // Split extloads. 1073 if (DstSize > MemSize) 1074 return std::make_pair(0, LLT::scalar(MemSize)); 1075 1076 if (!isPowerOf2_32(DstSize)) { 1077 // We're probably decomposing an odd sized store. Try to split 1078 // to the widest type. TODO: Account for alignment. As-is it 1079 // should be OK, since the new parts will be further legalized. 
1080 unsigned FloorSize = PowerOf2Floor(DstSize); 1081 return std::make_pair(0, LLT::scalar(FloorSize)); 1082 } 1083 1084 if (DstSize > 32 && (DstSize % 32 != 0)) { 1085 // FIXME: Need a way to specify non-extload of larger size if 1086 // suitably aligned. 1087 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1088 } 1089 1090 unsigned MaxSize = maxSizeForAddrSpace(ST, 1091 PtrTy.getAddressSpace(), 1092 Op == G_LOAD); 1093 if (MemSize > MaxSize) 1094 return std::make_pair(0, LLT::scalar(MaxSize)); 1095 1096 unsigned Align = Query.MMODescrs[0].AlignInBits; 1097 return std::make_pair(0, LLT::scalar(Align)); 1098 }) 1099 .fewerElementsIf( 1100 [=](const LegalityQuery &Query) -> bool { 1101 return Query.Types[0].isVector() && 1102 needToSplitMemOp(Query, Op == G_LOAD); 1103 }, 1104 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1105 const LLT DstTy = Query.Types[0]; 1106 const LLT PtrTy = Query.Types[1]; 1107 1108 LLT EltTy = DstTy.getElementType(); 1109 unsigned MaxSize = maxSizeForAddrSpace(ST, 1110 PtrTy.getAddressSpace(), 1111 Op == G_LOAD); 1112 1113 // FIXME: Handle widened to power of 2 results better. This ends 1114 // up scalarizing. 1115 // FIXME: 3 element stores scalarized on SI 1116 1117 // Split if it's too large for the address space. 1118 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1119 unsigned NumElts = DstTy.getNumElements(); 1120 unsigned EltSize = EltTy.getSizeInBits(); 1121 1122 if (MaxSize % EltSize == 0) { 1123 return std::make_pair( 1124 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1125 } 1126 1127 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1128 1129 // FIXME: Refine when odd breakdowns handled 1130 // The scalars will need to be re-legalized. 
1131 if (NumPieces == 1 || NumPieces >= NumElts || 1132 NumElts % NumPieces != 0) 1133 return std::make_pair(0, EltTy); 1134 1135 return std::make_pair(0, 1136 LLT::vector(NumElts / NumPieces, EltTy)); 1137 } 1138 1139 // FIXME: We could probably handle weird extending loads better. 1140 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1141 if (DstTy.getSizeInBits() > MemSize) 1142 return std::make_pair(0, EltTy); 1143 1144 unsigned EltSize = EltTy.getSizeInBits(); 1145 unsigned DstSize = DstTy.getSizeInBits(); 1146 if (!isPowerOf2_32(DstSize)) { 1147 // We're probably decomposing an odd sized store. Try to split 1148 // to the widest type. TODO: Account for alignment. As-is it 1149 // should be OK, since the new parts will be further legalized. 1150 unsigned FloorSize = PowerOf2Floor(DstSize); 1151 return std::make_pair( 1152 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1153 } 1154 1155 // Need to split because of alignment. 1156 unsigned Align = Query.MMODescrs[0].AlignInBits; 1157 if (EltSize > Align && 1158 (EltSize / Align < DstTy.getNumElements())) { 1159 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1160 } 1161 1162 // May need relegalization for the scalars. 1163 return std::make_pair(0, EltTy); 1164 }) 1165 .minScalar(0, S32); 1166 1167 if (IsStore) 1168 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1169 1170 // TODO: Need a bitcast lower option? 
1171 Actions 1172 .widenScalarToNextPow2(0) 1173 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1174 } 1175 1176 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1177 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1178 {S32, GlobalPtr, 16, 2 * 8}, 1179 {S32, LocalPtr, 8, 8}, 1180 {S32, LocalPtr, 16, 16}, 1181 {S32, PrivatePtr, 8, 8}, 1182 {S32, PrivatePtr, 16, 16}, 1183 {S32, ConstantPtr, 8, 8}, 1184 {S32, ConstantPtr, 16, 2 * 8}}); 1185 if (ST.hasFlatAddressSpace()) { 1186 ExtLoads.legalForTypesWithMemDesc( 1187 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1188 } 1189 1190 ExtLoads.clampScalar(0, S32, S32) 1191 .widenScalarToNextPow2(0) 1192 .unsupportedIfMemSizeNotPow2() 1193 .lower(); 1194 1195 auto &Atomics = getActionDefinitionsBuilder( 1196 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1197 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1198 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1199 G_ATOMICRMW_UMIN}) 1200 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1201 {S64, GlobalPtr}, {S64, LocalPtr}, 1202 {S32, RegionPtr}, {S64, RegionPtr}}); 1203 if (ST.hasFlatAddressSpace()) { 1204 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1205 } 1206 1207 if (ST.hasLDSFPAtomics()) { 1208 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1209 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1210 } 1211 1212 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1213 // demarshalling 1214 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1215 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1216 {S32, FlatPtr}, {S64, FlatPtr}}) 1217 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1218 {S32, RegionPtr}, {S64, RegionPtr}}); 1219 // TODO: Pointer types, any 32-bit or 64-bit vector 1220 1221 // Condition should be s32 for scalar, s1 for vector. 
1222 getActionDefinitionsBuilder(G_SELECT) 1223 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1224 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1225 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1226 .clampScalar(0, S16, S64) 1227 .scalarize(1) 1228 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1229 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1230 .clampMaxNumElements(0, S32, 2) 1231 .clampMaxNumElements(0, LocalPtr, 2) 1232 .clampMaxNumElements(0, PrivatePtr, 2) 1233 .scalarize(0) 1234 .widenScalarToNextPow2(0) 1235 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1236 1237 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1238 // be more flexible with the shift amount type. 1239 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1240 .legalFor({{S32, S32}, {S64, S32}}); 1241 if (ST.has16BitInsts()) { 1242 if (ST.hasVOP3PInsts()) { 1243 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1244 .clampMaxNumElements(0, S16, 2); 1245 } else 1246 Shifts.legalFor({{S16, S16}}); 1247 1248 // TODO: Support 16-bit shift amounts for all types 1249 Shifts.widenScalarIf( 1250 [=](const LegalityQuery &Query) { 1251 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1252 // 32-bit amount. 1253 const LLT ValTy = Query.Types[0]; 1254 const LLT AmountTy = Query.Types[1]; 1255 return ValTy.getSizeInBits() <= 16 && 1256 AmountTy.getSizeInBits() < 16; 1257 }, changeTo(1, S16)); 1258 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1259 Shifts.clampScalar(1, S32, S32); 1260 Shifts.clampScalar(0, S16, S64); 1261 Shifts.widenScalarToNextPow2(0, 16); 1262 } else { 1263 // Make sure we legalize the shift amount type first, as the general 1264 // expansion for the shifted type will produce much worse code if it hasn't 1265 // been truncated already. 
1266 Shifts.clampScalar(1, S32, S32); 1267 Shifts.clampScalar(0, S32, S64); 1268 Shifts.widenScalarToNextPow2(0, 32); 1269 } 1270 Shifts.scalarize(0); 1271 1272 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1273 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1274 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1275 unsigned IdxTypeIdx = 2; 1276 1277 getActionDefinitionsBuilder(Op) 1278 .customIf([=](const LegalityQuery &Query) { 1279 const LLT EltTy = Query.Types[EltTypeIdx]; 1280 const LLT VecTy = Query.Types[VecTypeIdx]; 1281 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1282 return (EltTy.getSizeInBits() == 16 || 1283 EltTy.getSizeInBits() % 32 == 0) && 1284 VecTy.getSizeInBits() % 32 == 0 && 1285 VecTy.getSizeInBits() <= MaxRegisterSize && 1286 IdxTy.getSizeInBits() == 32; 1287 }) 1288 .clampScalar(EltTypeIdx, S32, S64) 1289 .clampScalar(VecTypeIdx, S32, S64) 1290 .clampScalar(IdxTypeIdx, S32, S32) 1291 // TODO: Clamp the number of elements before resorting to stack lowering. 1292 // It should only be necessary with variable indexes. 1293 // As a last resort, lower to the stack 1294 .lower(); 1295 } 1296 1297 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1298 .unsupportedIf([=](const LegalityQuery &Query) { 1299 const LLT &EltTy = Query.Types[1].getElementType(); 1300 return Query.Types[0] != EltTy; 1301 }); 1302 1303 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1304 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1305 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1306 1307 // FIXME: Doesn't handle extract of illegal sizes. 1308 getActionDefinitionsBuilder(Op) 1309 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1310 // FIXME: Multiples of 16 should not be legal. 
1311 .legalIf([=](const LegalityQuery &Query) { 1312 const LLT BigTy = Query.Types[BigTyIdx]; 1313 const LLT LitTy = Query.Types[LitTyIdx]; 1314 return (BigTy.getSizeInBits() % 32 == 0) && 1315 (LitTy.getSizeInBits() % 16 == 0); 1316 }) 1317 .widenScalarIf( 1318 [=](const LegalityQuery &Query) { 1319 const LLT BigTy = Query.Types[BigTyIdx]; 1320 return (BigTy.getScalarSizeInBits() < 16); 1321 }, 1322 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1323 .widenScalarIf( 1324 [=](const LegalityQuery &Query) { 1325 const LLT LitTy = Query.Types[LitTyIdx]; 1326 return (LitTy.getScalarSizeInBits() < 16); 1327 }, 1328 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1329 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1330 .widenScalarToNextPow2(BigTyIdx, 32); 1331 1332 } 1333 1334 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1335 .legalForCartesianProduct(AllS32Vectors, {S32}) 1336 .legalForCartesianProduct(AllS64Vectors, {S64}) 1337 .clampNumElements(0, V16S32, V32S32) 1338 .clampNumElements(0, V2S64, V16S64) 1339 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1340 1341 if (ST.hasScalarPackInsts()) { 1342 BuildVector 1343 // FIXME: Should probably widen s1 vectors straight to s32 1344 .minScalarOrElt(0, S16) 1345 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1346 .minScalar(1, S32); 1347 1348 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1349 .legalFor({V2S16, S32}) 1350 .lower(); 1351 BuildVector.minScalarOrElt(0, S32); 1352 } else { 1353 BuildVector.customFor({V2S16, S16}); 1354 BuildVector.minScalarOrElt(0, S32); 1355 1356 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1357 .customFor({V2S16, S32}) 1358 .lower(); 1359 } 1360 1361 BuildVector.legalIf(isRegisterType(0)); 1362 1363 // FIXME: Clamp maximum size 1364 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1365 .legalIf(isRegisterType(0)); 1366 1367 // TODO: Don't fully scalarize v2s16 pieces? 
Or combine out thosse 1368 // pre-legalize. 1369 if (ST.hasVOP3PInsts()) { 1370 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1371 .customFor({V2S16, V2S16}) 1372 .lower(); 1373 } else 1374 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1375 1376 // Merge/Unmerge 1377 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1378 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1379 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1380 1381 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1382 const LLT Ty = Query.Types[TypeIdx]; 1383 if (Ty.isVector()) { 1384 const LLT &EltTy = Ty.getElementType(); 1385 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1386 return true; 1387 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1388 return true; 1389 } 1390 return false; 1391 }; 1392 1393 auto &Builder = getActionDefinitionsBuilder(Op) 1394 .lowerFor({{S16, V2S16}}) 1395 .lowerIf([=](const LegalityQuery &Query) { 1396 const LLT BigTy = Query.Types[BigTyIdx]; 1397 return BigTy.getSizeInBits() == 32; 1398 }) 1399 // Try to widen to s16 first for small types. 1400 // TODO: Only do this on targets with legal s16 shifts 1401 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1402 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1403 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1404 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1405 elementTypeIs(1, S16)), 1406 changeTo(1, V2S16)) 1407 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1408 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1409 // valid. 
1410 .clampScalar(LitTyIdx, S32, S512) 1411 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1412 // Break up vectors with weird elements into scalars 1413 .fewerElementsIf( 1414 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1415 scalarize(0)) 1416 .fewerElementsIf( 1417 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1418 scalarize(1)) 1419 .clampScalar(BigTyIdx, S32, MaxScalar); 1420 1421 if (Op == G_MERGE_VALUES) { 1422 Builder.widenScalarIf( 1423 // TODO: Use 16-bit shifts if legal for 8-bit values? 1424 [=](const LegalityQuery &Query) { 1425 const LLT Ty = Query.Types[LitTyIdx]; 1426 return Ty.getSizeInBits() < 32; 1427 }, 1428 changeTo(LitTyIdx, S32)); 1429 } 1430 1431 Builder.widenScalarIf( 1432 [=](const LegalityQuery &Query) { 1433 const LLT Ty = Query.Types[BigTyIdx]; 1434 return !isPowerOf2_32(Ty.getSizeInBits()) && 1435 Ty.getSizeInBits() % 16 != 0; 1436 }, 1437 [=](const LegalityQuery &Query) { 1438 // Pick the next power of 2, or a multiple of 64 over 128. 1439 // Whichever is smaller. 1440 const LLT &Ty = Query.Types[BigTyIdx]; 1441 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1442 if (NewSizeInBits >= 256) { 1443 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1444 if (RoundedTo < NewSizeInBits) 1445 NewSizeInBits = RoundedTo; 1446 } 1447 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1448 }) 1449 .legalIf([=](const LegalityQuery &Query) { 1450 const LLT &BigTy = Query.Types[BigTyIdx]; 1451 const LLT &LitTy = Query.Types[LitTyIdx]; 1452 1453 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1454 return false; 1455 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1456 return false; 1457 1458 return BigTy.getSizeInBits() % 16 == 0 && 1459 LitTy.getSizeInBits() % 16 == 0 && 1460 BigTy.getSizeInBits() <= MaxRegisterSize; 1461 }) 1462 // Any vectors left are the wrong size. Scalarize them. 
      // Any leftover odd/weird vectors fall back to per-element handling.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // Funnel shift right: only the fully 32-bit form maps to hardware.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  // Operations with no direct instruction that are expanded by the generic
  // lowering.
  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  // Operations the target never supports in any form.
  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

/// Dispatch an instruction marked Custom by the rules above to the matching
/// target-specific legalizer. Returns true if the instruction was legalized
/// (the callee erases or mutates MI); false signals UnableToLegalize.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    // log(x) = log2(x) * ln(2)
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    // log10(x) = log2(x) * (ln(2) / ln(10))
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

/// Return an s32 register holding the high 32 bits of the flat aperture base
/// for address space \p AS (LOCAL or PRIVATE), or an invalid Register on
/// failure.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Read the aperture directly from the hardware register via S_GETREG.
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, bit offset and width-minus-one into the S_GETREG
    // immediate encoding.
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // Shift the field back up into the high bits of the 32-bit aperture.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers: load the aperture from the queue pointer instead.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

/// Custom-legalize G_ADDRSPACE_CAST for the combinations the generic
/// legalizer cannot handle. Returns false for casts this target cannot do.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts between address spaces with identical representations are just
  // bitcasts; mutate the instruction in place.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A null flat pointer must map to the segment null value.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  // Segment -> flat: combine the 32-bit segment offset with the aperture base
  // high bits, mapping segment null to flat null.
  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
1748 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1749 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1750 1751 MI.eraseFromParent(); 1752 return true; 1753 } 1754 1755 bool AMDGPULegalizerInfo::legalizeFrint( 1756 MachineInstr &MI, MachineRegisterInfo &MRI, 1757 MachineIRBuilder &B) const { 1758 Register Src = MI.getOperand(1).getReg(); 1759 LLT Ty = MRI.getType(Src); 1760 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1761 1762 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1763 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1764 1765 auto C1 = B.buildFConstant(Ty, C1Val); 1766 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1767 1768 // TODO: Should this propagate fast-math-flags? 1769 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1770 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1771 1772 auto C2 = B.buildFConstant(Ty, C2Val); 1773 auto Fabs = B.buildFAbs(Ty, Src); 1774 1775 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1776 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1777 MI.eraseFromParent(); 1778 return true; 1779 } 1780 1781 bool AMDGPULegalizerInfo::legalizeFceil( 1782 MachineInstr &MI, MachineRegisterInfo &MRI, 1783 MachineIRBuilder &B) const { 1784 1785 const LLT S1 = LLT::scalar(1); 1786 const LLT S64 = LLT::scalar(64); 1787 1788 Register Src = MI.getOperand(1).getReg(); 1789 assert(MRI.getType(Src) == S64); 1790 1791 // result = trunc(src) 1792 // if (src > 0.0 && src != result) 1793 // result += 1.0 1794 1795 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1796 1797 const auto Zero = B.buildFConstant(S64, 0.0); 1798 const auto One = B.buildFConstant(S64, 1.0); 1799 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1800 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1801 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1802 auto Add = B.buildSelect(S64, And, One, Zero); 1803 1804 // TODO: Should this propagate fast-math-flags? 
1805 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1806 return true; 1807 } 1808 1809 static MachineInstrBuilder extractF64Exponent(Register Hi, 1810 MachineIRBuilder &B) { 1811 const unsigned FractBits = 52; 1812 const unsigned ExpBits = 11; 1813 LLT S32 = LLT::scalar(32); 1814 1815 auto Const0 = B.buildConstant(S32, FractBits - 32); 1816 auto Const1 = B.buildConstant(S32, ExpBits); 1817 1818 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1819 .addUse(Hi) 1820 .addUse(Const0.getReg(0)) 1821 .addUse(Const1.getReg(0)); 1822 1823 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1824 } 1825 1826 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1827 MachineInstr &MI, MachineRegisterInfo &MRI, 1828 MachineIRBuilder &B) const { 1829 const LLT S1 = LLT::scalar(1); 1830 const LLT S32 = LLT::scalar(32); 1831 const LLT S64 = LLT::scalar(64); 1832 1833 Register Src = MI.getOperand(1).getReg(); 1834 assert(MRI.getType(Src) == S64); 1835 1836 // TODO: Should this use extract since the low half is unused? 1837 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1838 Register Hi = Unmerge.getReg(1); 1839 1840 // Extract the upper half, since this is where we will find the sign and 1841 // exponent. 1842 auto Exp = extractF64Exponent(Hi, B); 1843 1844 const unsigned FractBits = 52; 1845 1846 // Extract the sign bit. 1847 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1848 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1849 1850 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1851 1852 const auto Zero32 = B.buildConstant(S32, 0); 1853 1854 // Extend back to 64-bits. 
1855 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1856 1857 auto Shr = B.buildAShr(S64, FractMask, Exp); 1858 auto Not = B.buildNot(S64, Shr); 1859 auto Tmp0 = B.buildAnd(S64, Src, Not); 1860 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1861 1862 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1863 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1864 1865 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1866 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1867 MI.eraseFromParent(); 1868 return true; 1869 } 1870 1871 bool AMDGPULegalizerInfo::legalizeITOFP( 1872 MachineInstr &MI, MachineRegisterInfo &MRI, 1873 MachineIRBuilder &B, bool Signed) const { 1874 1875 Register Dst = MI.getOperand(0).getReg(); 1876 Register Src = MI.getOperand(1).getReg(); 1877 1878 const LLT S64 = LLT::scalar(64); 1879 const LLT S32 = LLT::scalar(32); 1880 1881 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1882 1883 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1884 1885 auto CvtHi = Signed ? 1886 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1887 B.buildUITOFP(S64, Unmerge.getReg(1)); 1888 1889 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1890 1891 auto ThirtyTwo = B.buildConstant(S32, 32); 1892 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1893 .addUse(CvtHi.getReg(0)) 1894 .addUse(ThirtyTwo.getReg(0)); 1895 1896 // TODO: Should this propagate fast-math-flags? 1897 B.buildFAdd(Dst, LdExp, CvtLo); 1898 MI.eraseFromParent(); 1899 return true; 1900 } 1901 1902 // TODO: Copied from DAG implementation. Verify logic and document how this 1903 // actually works. 
// Lower f64 G_FPTOSI/G_FPTOUI to a 64-bit integer without a native 64-bit
// conversion: K0 is 2^-32 and K1 is -2^32, so FloorMul is the high 32-bit
// word of the truncated value (as a double) and the fma recovers the low
// word. The two words are converted separately and merged.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  // K0 = 2^-32, K1 = -2^32 (f64 bit patterns).
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  // Fma = FloorMul * -2^32 + Trunc, i.e. the low 32 bits as a double.
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  // Only the high word carries the sign of the result.
  auto Hi = Signed ?
    B.buildFPTOSI(S32, FloorMul) :
    B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  B.buildMerge(Dst, { Lo, Hi });
  MI.eraseFromParent();

  return true;
}

// Report whether a min/max instruction is already legal, lowering it when it
// is not. With ieee_mode off, plain G_FMINNUM/G_FMAXNUM are already correct
// and the _IEEE variants are rejected (returns false). With ieee_mode on,
// the _IEEE variants are legal and the plain forms are expanded through
// LegalizerHelper::lowerFMinNumMaxNum.
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

// Lower G_EXTRACT_VECTOR_ELT with a constant index to a bit-offset
// G_EXTRACT. Out-of-range constant indices produce undef; dynamic indices
// are left in place for selection to register indexing.
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.
  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
    MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // In-bounds index: extract at the element's bit offset. Out of bounds is
  // undefined.
  if (IdxVal->Value < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Lower G_INSERT_VECTOR_ELT with a constant index to a bit-offset G_INSERT.
// Out-of-range constant indices produce undef; dynamic indices are left in
// place for selection to register indexing.
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
    MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // In-bounds index: insert at the element's bit offset. Out of bounds is
  // undefined.
  if (IdxVal->Value < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Keep v2s16 shuffles whose mask is directly selectable to a VOP3P
// operation; lower everything else through LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

// Lower G_FSIN/G_FCOS to the amdgcn.sin/amdgcn.cos intrinsics. The input is
// prescaled by 1/(2*pi) (0.5 * numbers::inv_pi); subtargets with a reduced
// trig input range additionally reduce the scaled value with amdgcn.fract
// first.
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin =
    MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

// Emit a SI_PC_ADD_REL_OFFSET pseudo computing \p GV's address pc-relative
// into \p DstReg. \p GAFlags selects the relocation flavor (MO_NONE gets a
// zero high operand; otherwise GAFlags/GAFlags+1 form the lo/hi pair). For
// 32-bit pointer types the 64-bit result is computed into a scratch register
// and truncated with G_EXTRACT.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // 64-bit destinations receive the result directly; 32-bit destinations go
  // through a 64-bit scratch register first.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

// Lower G_GLOBAL_VALUE. LDS/region globals become constants allocated by the
// machine function info (with diagnostics for unsupported cases); other
// globals are materialized with a pc-relative fixup, a pc-relative
// relocation, or a load from the GOT, depending on target-lowering policy.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        // Select as an absolute 32-bit address instead of a constant.
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      // The LDS object's address is its offset assigned by the function
      // info's allocator.
      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise load the 64-bit address of the global out of the GOT.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

// Custom legalize a load by rewriting its pointer operand with an address
// space cast to the 64-bit constant address space.
bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

// G_FMAD is kept as-is when the denormal mode for the result type is
// disabled; otherwise it is expanded via LegalizerHelper::lowerFMad.
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Custom lower G_ATOMIC_CMPXCHG on flat/global pointers: pack {new, cmp}
// into a two-element vector operand for the target's
// G_AMDGPU_ATOMIC_CMPXCHG instruction, preserving the memory operands.
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  // Note the operand order: the new value comes first.
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Lower a logarithm of arbitrary base as
// log_b(x) = log2(x) * (1 / log2(b)); the caller provides Log2BaseInverted.
bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}

// Lower G_FEXP as exp(x) = exp2(x * log2(e)).
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}

// Lower G_FPOW as pow(x, y) = exp2(log2(x) * y), with the multiply done via
// amdgcn.fmul.legacy. f16 is computed in f32 since there is no f16
// fmul_legacy. Other types fail legalization.
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
      .addUse(Src1)
      .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  // Peel a leading G_FNEG, then a G_FABS under it; or a bare G_FABS.
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}

// Custom lower f64 G_FFLOOR on subtargets with the V_FRACT bug as
// x - fract(x), applying the documented clamp-and-select-on-nan workaround
// to the fract result.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
    .addUse(OrigSrc)
    .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest f64 below 1.0.
  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // NaN inputs must pass through unchanged: select the (modifier-stripped)
    // source when it is unordered with itself.
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  // Merge the two s16 halves into an s32, then reinterpret as v2s16.
  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
// The condition output of \p MI must have exactly one non-debug use: a
// G_BRCOND in the same block, optionally followed by a G_BR. On success \p Br
// is set to that trailing G_BR (if present) and \p UncondBrTarget to the
// block control flow falls to when the condition branch is not taken.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br,
                                       MachineBasicBlock *&UncondBrTarget) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != Parent ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next == Parent->end()) {
    // Fall through to the layout successor.
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return &UseMI;
}

// Ensure there is a COPY from physical register \p PhyReg into the virtual
// live-in register \p LiveIn at the top of the entry block, creating it if
// \p LiveIn has no defining instruction yet. Returns \p LiveIn.
Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
                                               MachineRegisterInfo &MRI,
                                               Register LiveIn,
                                               Register PhyReg) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Insert the live-in copy, if required, by defining destination virtual
  // register.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    // Restore the caller's insertion point.
    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

// Return the virtual register representing the live-in physical register
// \p PhyReg, creating one of type \p Ty if none exists, and optionally
// materializing the entry-block copy (see insertLiveInCopy).
Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create virtual live-in register
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), live-in copy insertion from physical
  // register to virtual register is not required
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

// Copy the preloaded argument described by \p Arg into \p DstReg. Masked
// arguments (values packed into part of a register) are extracted with a
// shift-right by the mask's trailing zero count and an AND of the shifted
// mask.
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

// Convenience overload: look up the argument descriptor for \p ArgType from
// the function info and copy it into \p DstReg. Fails for arguments that are
// not in a valid register.
bool AMDGPULegalizerInfo::loadInputValue(
  Register DstReg, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

// Replace an intrinsic reading a preloaded argument with a copy of that
// argument's value.
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

// Custom lower G_FDIV: try the fast/unsafe expansions first, then dispatch
// to the full per-type expansions.
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

// Expand 32-bit unsigned division/remainder of \p X by \p Y into \p DstReg
// using a hardware reciprocal estimate (G_AMDGPU_RCP_IFLAG), one round of
// unsigned Newton-Raphson, and two quotient/remainder refinement steps.
// \p IsDiv selects whether the quotient or the remainder is written.
void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register X,
                                                  Register Y,
                                                  bool IsDiv) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}

// Lower 32-bit G_UDIV/G_UREM through the shared expansion above.
bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32,
                                           BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

// Expand 64-bit unsigned division/remainder of \p Numer by \p Denom into
// \p DstReg: start from the 64-bit reciprocal estimate built by
// emitReciprocalU64, refine it with two rounds of 32-bit multiply/add carry
// chains, then correct the quotient (or remainder) with up to two
// conditional adjustment steps implemented with selects.
void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement round of the reciprocal estimate.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  // High word without the carry, used by the second round's carry chain.
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement round.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate and corresponding remainder: Numer - Denom * Q.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // High word without the borrow, feeding the later adjustment chains.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 != 0 <=> remainder >= denominator (compare hi words, tie-break on lo).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 <=> the once-adjusted remainder is still >= denominator.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }
}

// Lower G_UDIV/G_UREM by dispatching to the 32-bit or 64-bit expansion.
bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(DstReg);

  if (Ty == S32)
legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2802 else if (Ty == S64) 2803 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2804 else 2805 return false; 2806 2807 MI.eraseFromParent(); 2808 return true; 2809 2810 } 2811 2812 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2813 MachineRegisterInfo &MRI, 2814 MachineIRBuilder &B) const { 2815 const LLT S64 = LLT::scalar(64); 2816 const LLT S32 = LLT::scalar(32); 2817 2818 Register DstReg = MI.getOperand(0).getReg(); 2819 const LLT Ty = MRI.getType(DstReg); 2820 if (Ty != S32 && Ty != S64) 2821 return false; 2822 2823 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2824 2825 Register LHS = MI.getOperand(1).getReg(); 2826 Register RHS = MI.getOperand(2).getReg(); 2827 2828 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2829 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2830 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2831 2832 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2833 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2834 2835 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2836 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2837 2838 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2839 if (Ty == S32) 2840 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2841 else 2842 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2843 2844 Register Sign; 2845 if (IsDiv) 2846 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2847 else 2848 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2849 2850 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2851 B.buildSub(DstReg, UDivRem, Sign); 2852 2853 MI.eraseFromParent(); 2854 return true; 2855 } 2856 2857 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2858 MachineRegisterInfo &MRI, 2859 MachineIRBuilder &B) const { 2860 Register Res = MI.getOperand(0).getReg(); 2861 Register LHS = MI.getOperand(1).getReg(); 2862 Register RHS = MI.getOperand(2).getReg(); 2863 2864 uint16_t 
Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  // The arcp flag alone is enough to use rcp for the x * (1/y) form.
  bool Unsafe =
      MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  // f64 is only handled here under global unsafe-fp-math.
  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  // With FP32 denormals enabled, rcp is not accurate enough unless unsafe
  // math allows it; fall back to the full f32 expansion.
  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

// Legalize 16-bit G_FDIV: extend both operands to f32, multiply the extended
// LHS by rcp of the extended RHS, truncate back to f16, and apply div_fixup
// for the final quotient adjustment.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  // div_fixup operand order is (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    // hwreg(HW_REG_MODE, offset 4, width 2): the two FP32 denorm bits of the
    // MODE register (width is encoded as width-1).
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

// Legalize 32-bit G_FDIV with the full-precision div_scale / rcp / div_fmas /
// div_fixup sequence. The FMA chain refines the reciprocal estimate; when
// FP32 denormals are flushed by default, denorm mode is toggled on before the
// chain and back off afterwards (see toggleSPDenormMode).
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale's last immediate selects which operand the scaled result
  // corresponds to (0 = denominator, 1 = numerator); the second result is
  // the scale-applied flag consumed by div_fmas below.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                    .addUse(DenominatorScaled.getReg(0))
                    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Newton-Raphson style refinement of the reciprocal, then of the quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Legalize 64-bit G_FDIV with the div_scale / rcp / div_fmas / div_fixup
// sequence (no denorm-mode toggling; only FP32 mode is toggled elsewhere).
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 =
      B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Reconstruct the flag by checking which operand's exponent was scaled:
    // compare the high halves of the inputs against the div_scale results.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // NOTE(review): other call sites pass Res directly; makeArrayRef(Res) is
  // equivalent here but inconsistent with the rest of the file.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Lower the amdgcn.fdiv.fast intrinsic: scale the denominator by 2^-32 when
// |RHS| is huge so rcp does not overflow, then undo the scale on the result.
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  // NOTE(review): C0Val is unused (C2 is built from FloatToBits(1.0f)
  // directly); candidate for removal.
  const APFloat C0Val(1.0f);

  // 0x6f800000 is the f32 bit pattern for 2^+96, 0x2f800000 for 2^-32.
  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  // Sel = |RHS| > 2^96 ? 2^-32 : 1.0. Scaling both the rcp input and the
  // final product by Sel cancels out: Sel*LHS / (Sel*RHS) == LHS/RHS.
  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

// Materialize the implicit kernel argument pointer into DstReg: the kernarg
// segment pointer plus the offset of the first implicit argument.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}

// Legalize the implicit-arg-ptr intrinsic. In an entry function it is
// computed from the kernarg segment pointer; otherwise it is a preloaded
// function argument.
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

// Legalize the address-space query intrinsics by comparing the high 32 bits
// of the pointer (operand 2) against the aperture base for AddrSpace.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Maximum value representable in the instruction's 12-bit immoffset field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, TotalConstOffset, OffsetDef)
      = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Negative rounded-down value: move the whole constant into the vgpr.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The voffset field is required, so materialize a zero if there is no base.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
/// On subtargets with unpacked D16 memory instructions, each s16 element is
/// any-extended into its own s32 lane.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  // Operand 0 of the unmerge is the source; the rest are the results.
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

// Widen illegal buffer-store source types (s8/s16 scalars; d16 vectors on
// unpacked subtargets) to a register layout the selector can handle.
Register AMDGPULegalizerInfo::fixStoreSourceType(
  MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}

// Rewrite the raw/struct (t)buffer store intrinsics into the target's
// G_AMDGPU_BUFFER_STORE* pseudos, splitting the offset operand into
// voffset + immoffset and normalizing the operand list.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  // Fold the constant part of the offset into the memory operand so alias
  // analysis still sees the right address.
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  // Pick the pseudo: typed/format variants by D16-ness, plain stores by the
  // memory access size (sub-dword stores get byte/short forms).
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  // Raw variants have no vindex; use a constant zero so the pseudo always
  // has the full operand list.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

// Rewrite the raw/struct (t)buffer load intrinsics into the target's
// G_AMDGPU_BUFFER_LOAD* pseudos, widening sub-dword and unpacked-d16 results
// and narrowing them back after the load.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  // Fold the constant offset into the memory operand.
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  // Pick the pseudo, mirroring the store path: typed/format variants by
  // D16-ness, plain loads by memory size (sub-dword loads are zero-extending).
  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar results load into a full s32; unpacked d16 vectors load
  // into an s32-per-element vector. Both are narrowed back below.
  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // Insert the narrowing code after the widened load.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // Narrow the widened result back to the requested type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

// Replace the atomic inc/dec intrinsic with the corresponding target pseudo,
// copying the destination, operands 2 and 3, and the memory operands.
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

// Map a raw/struct buffer atomic intrinsic ID to its target pseudo opcode.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case
Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

// Rewrite raw/struct buffer atomic intrinsics into G_AMDGPU_BUFFER_ATOMIC_*
// pseudos, splitting the offset operand like the load/store paths do.
// cmpswap carries one extra operand (the compare value) before the rsrc.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  // Raw variants have no vindex; normalize with a constant zero.
  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int EndIdx,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < EndIdx; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      // Pre-dim arguments (e.g. bias/offset) are already dword sized; just
      // reinterpret them as v2s16.
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand with the next one and skip it.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  // The remaining address operands are now dead; replace them with $noreg.
  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
3736 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3737 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3738 3739 MachineRegisterInfo *MRI = B.getMRI(); 3740 const LLT S32 = LLT::scalar(32); 3741 const LLT S16 = LLT::scalar(16); 3742 const LLT V2S16 = LLT::vector(2, 16); 3743 3744 // Index of first address argument 3745 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3746 3747 int NumVAddrs, NumGradients; 3748 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3749 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3750 getDMaskIdx(BaseOpcode, NumDefs); 3751 unsigned DMask = 0; 3752 3753 // Check for 16 bit addresses and pack if true. 3754 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3755 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3756 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3757 const bool IsG16 = GradTy == S16; 3758 const bool IsA16 = AddrTy == S16; 3759 3760 int DMaskLanes = 0; 3761 if (!BaseOpcode->Atomic) { 3762 DMask = MI.getOperand(DMaskIdx).getImm(); 3763 if (BaseOpcode->Gather4) { 3764 DMaskLanes = 4; 3765 } else if (DMask != 0) { 3766 DMaskLanes = countPopulation(DMask); 3767 } else if (!IsTFE && !BaseOpcode->Store) { 3768 // If dmask is 0, this is a no-op load. This can be eliminated. 3769 B.buildUndef(MI.getOperand(0)); 3770 MI.eraseFromParent(); 3771 return true; 3772 } 3773 } 3774 3775 Observer.changingInstr(MI); 3776 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3777 3778 unsigned NewOpcode = NumDefs == 0 ? 
3779 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3780 3781 // Track that we legalized this 3782 MI.setDesc(B.getTII().get(NewOpcode)); 3783 3784 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3785 // dmask to be at least 1 otherwise the instruction will fail 3786 if (IsTFE && DMask == 0) { 3787 DMask = 0x1; 3788 DMaskLanes = 1; 3789 MI.getOperand(DMaskIdx).setImm(DMask); 3790 } 3791 3792 if (BaseOpcode->Atomic) { 3793 Register VData0 = MI.getOperand(2).getReg(); 3794 LLT Ty = MRI->getType(VData0); 3795 3796 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3797 if (Ty.isVector()) 3798 return false; 3799 3800 if (BaseOpcode->AtomicX2) { 3801 Register VData1 = MI.getOperand(3).getReg(); 3802 // The two values are packed in one register. 3803 LLT PackedTy = LLT::vector(2, Ty); 3804 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3805 MI.getOperand(2).setReg(Concat.getReg(0)); 3806 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3807 } 3808 } 3809 3810 int CorrectedNumVAddrs = NumVAddrs; 3811 3812 // Optimize _L to _LZ when _L is zero 3813 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3814 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3815 const ConstantFP *ConstantLod; 3816 const int LodIdx = AddrIdx + NumVAddrs - 1; 3817 3818 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3819 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3820 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3821 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3822 LZMappingInfo->LZ, ImageDimIntr->Dim); 3823 3824 // The starting indexes should remain in the same place. 
3825 --NumVAddrs; 3826 --CorrectedNumVAddrs; 3827 3828 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3829 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3830 MI.RemoveOperand(LodIdx); 3831 } 3832 } 3833 } 3834 3835 // Optimize _mip away, when 'lod' is zero 3836 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3837 int64_t ConstantLod; 3838 const int LodIdx = AddrIdx + NumVAddrs - 1; 3839 3840 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3841 if (ConstantLod == 0) { 3842 // TODO: Change intrinsic opcode and remove operand instead or replacing 3843 // it with 0, as the _L to _LZ handling is done above. 3844 MI.getOperand(LodIdx).ChangeToImmediate(0); 3845 --CorrectedNumVAddrs; 3846 } 3847 } 3848 } 3849 3850 // Rewrite the addressing register layout before doing anything else. 3851 if (IsA16 || IsG16) { 3852 if (IsA16) { 3853 // Target must support the feature and gradients need to be 16 bit too 3854 if (!ST.hasA16() || !IsG16) 3855 return false; 3856 } else if (!ST.hasG16()) 3857 return false; 3858 3859 if (NumVAddrs > 1) { 3860 SmallVector<Register, 4> PackedRegs; 3861 // Don't compress addresses for G16 3862 const int PackEndIdx = 3863 IsA16 ? 
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3864 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3865 PackEndIdx, NumGradients); 3866 3867 if (!IsA16) { 3868 // Add uncompressed address 3869 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3870 int AddrReg = MI.getOperand(I).getReg(); 3871 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3872 PackedRegs.push_back(AddrReg); 3873 } 3874 } 3875 3876 // See also below in the non-a16 branch 3877 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3878 3879 if (!UseNSA && PackedRegs.size() > 1) { 3880 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3881 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3882 PackedRegs[0] = Concat.getReg(0); 3883 PackedRegs.resize(1); 3884 } 3885 3886 const int NumPacked = PackedRegs.size(); 3887 for (int I = 0; I != NumVAddrs; ++I) { 3888 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3889 if (!SrcOp.isReg()) { 3890 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3891 continue; 3892 } 3893 3894 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3895 3896 if (I < NumPacked) 3897 SrcOp.setReg(PackedRegs[I]); 3898 else 3899 SrcOp.setReg(AMDGPU::NoRegister); 3900 } 3901 } 3902 } else { 3903 // If the register allocator cannot place the address registers contiguously 3904 // without introducing moves, then using the non-sequential address encoding 3905 // is always preferable, since it saves VALU instructions and is usually a 3906 // wash in terms of code size or even better. 3907 // 3908 // However, we currently have no way of hinting to the register allocator 3909 // that MIMG addresses should be placed contiguously when it is possible to 3910 // do so, so force non-NSA for the common 2-address case as a heuristic. 3911 // 3912 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3913 // allocation when possible. 
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  // Record which of a16 (bit 0) / g16 (bit 1) applied in a trailing immediate
  // flags operand.
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    // Only vectors of 16-bit elements need store data repacking.
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    // Unpacked D16: one 32-bit register per 16-bit component.
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    // Packed layout: round total size up to a whole number of dwords.
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  // Append NumElts copies of an undef value of type Ty to ResultRegs. (Note
  // the lambda parameters shadow the enclosing Ty/NumElts locals.)
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    // Build a <6 x s16> and split off the desired <3 x s16>; the other half
    // of the unmerge is discarded.
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

/// Lower amdgcn.s.buffer.load to G_AMDGPU_S_BUFFER_LOAD with an attached
/// memory operand, widening non-power-of-2 sized results.
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

/// Lower llvm.trap: emit s_endpgm when no HSA trap handler is usable,
/// otherwise pass the queue pointer in SGPR0_SGPR1 and emit s_trap.
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // If this is the non-HSA path, or the trap handler is disabled, just insert
  // an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    // NOTE(review): this local shadows the MRI parameter; they presumably
    // refer to the same MachineRegisterInfo -- confirm and drop one.
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

/// Lower llvm.debugtrap: emit s_trap with the debug trap ID, or diagnose a
/// warning when no usable trap handler exists.
bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is the non-HSA path, or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

/// Central dispatch for AMDGPU intrinsic legalization; each case either
/// lowers in place or forwards to a dedicated legalize* helper.
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      // Replace the conditional branch with the SI_IF/SI_ELSE pseudo.
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // The condition outputs/inputs live in the wave mask (exec-sized)
      // register class.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      // Replace the conditional branch with the SI_LOOP pseudo.
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Fold to the subtarget's compile-time wavefront size.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    // All image-dim intrinsics share a single legalization path.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}