//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
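// For example, v8s32 (256 bits) and v4s16 (64 bits) are register types, while
// v3s8 (24 bits) is not.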
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
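    // (Only loads get this far, since maxSizeForAddrSpace caps stores at 128
    // bits; RegBankSelect may still split them based on the pointer bank.)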
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .lowerFor({S32, S16, V2S16}) // FIXME: legal and merge with add/sub/mul
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .lowerFor({S32, S16}) // FIXME: legal with clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .lowerFor({S32}) // FIXME: legal with clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
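      // Without it, fall back to the generic expansion.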
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
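    //
    // (For example, a full <6 x s16> load is bitcast to <3 x s32> here, since
    // the current selector only handles wide vectors with 32-bit or 64-bit
    // elements.)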
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        const LLT Ty = Query.Types[0];
        const unsigned Size = Ty.getSizeInBits();

        if (Size != Query.MMODescrs[0].SizeInBits)
          return Size <= 32 && Ty.isVector();

        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
          return true;
        return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
      .customIf(typeIs(1, Constant32Ptr))
      // Widen suitably aligned loads by loading extra elements.
      .moreElementsIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, moreElementsToNextPow2(0))
      .widenScalarIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && !Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, widenScalarOrEltToNextPow2(0))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
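          // (e.g. an <8 x s32> private store, where MaxSize is 32, is broken
          // into s32 pieces.)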
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
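    // Widen scalars to the next power of 2, and pad sub-32-bit vectors out to
    // a full 32 bits.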
    Actions
      .widenScalarToNextPow2(0)
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
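      // (e.g. extracting an s16 from an s96 is accepted by the rule below.)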
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
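    // The 32-bit constant pointer is just the low half of the 64-bit pointer.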
1676 B.buildExtract(Dst, Src, 0); 1677 MI.eraseFromParent(); 1678 return true; 1679 } 1680 1681 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1682 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1683 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1684 1685 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1686 // another. Merge operands are required to be the same type, but creating an 1687 // extra ptrtoint would be kind of pointless. 1688 auto HighAddr = B.buildConstant( 1689 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1690 B.buildMerge(Dst, {Src, HighAddr}); 1691 MI.eraseFromParent(); 1692 return true; 1693 } 1694 1695 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1696 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1697 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1698 unsigned NullVal = TM.getNullPointerValue(DestAS); 1699 1700 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1701 auto FlatNull = B.buildConstant(SrcTy, 0); 1702 1703 // Extract low 32-bits of the pointer. 1704 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1705 1706 auto CmpRes = 1707 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1708 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1709 1710 MI.eraseFromParent(); 1711 return true; 1712 } 1713 1714 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1715 return false; 1716 1717 if (!ST.hasFlatAddressSpace()) 1718 return false; 1719 1720 auto SegmentNull = 1721 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1722 auto FlatNull = 1723 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1724 1725 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1726 if (!ApertureReg.isValid()) 1727 return false; 1728 1729 auto CmpRes = 1730 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1731 1732 // Coerce the type of the low half of the result so we can use merge_values. 1733 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1734 1735 // TODO: Should we allow mismatched types but matching sizes in merges to 1736 // avoid the ptrtoint? 1737 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1738 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1739 1740 MI.eraseFromParent(); 1741 return true; 1742 } 1743 1744 bool AMDGPULegalizerInfo::legalizeFrint( 1745 MachineInstr &MI, MachineRegisterInfo &MRI, 1746 MachineIRBuilder &B) const { 1747 Register Src = MI.getOperand(1).getReg(); 1748 LLT Ty = MRI.getType(Src); 1749 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1750 1751 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1752 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1753 1754 auto C1 = B.buildFConstant(Ty, C1Val); 1755 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1756 1757 // TODO: Should this propagate fast-math-flags? 
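// Rounding trick: C1 is 2^52, the smallest f64 magnitude whose ulp is 1.0, so
// adding and then subtracting copysign(2^52, src) forces the fractional bits
// to be rounded away, leaving Tmp2 as src rounded to an integer. Inputs with
// |src| > C2 (which is 2^52 - 0.5) are already integral and are passed
// through unchanged by the final select.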
1758 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1759 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1760 1761 auto C2 = B.buildFConstant(Ty, C2Val); 1762 auto Fabs = B.buildFAbs(Ty, Src); 1763 1764 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1765 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1766 MI.eraseFromParent(); 1767 return true; 1768 } 1769 1770 bool AMDGPULegalizerInfo::legalizeFceil( 1771 MachineInstr &MI, MachineRegisterInfo &MRI, 1772 MachineIRBuilder &B) const { 1773 1774 const LLT S1 = LLT::scalar(1); 1775 const LLT S64 = LLT::scalar(64); 1776 1777 Register Src = MI.getOperand(1).getReg(); 1778 assert(MRI.getType(Src) == S64); 1779 1780 // result = trunc(src) 1781 // if (src > 0.0 && src != result) 1782 // result += 1.0 1783 1784 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1785 1786 const auto Zero = B.buildFConstant(S64, 0.0); 1787 const auto One = B.buildFConstant(S64, 1.0); 1788 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1789 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1790 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1791 auto Add = B.buildSelect(S64, And, One, Zero); 1792 1793 // TODO: Should this propagate fast-math-flags? 1794 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1795 return true; 1796 } 1797 1798 static MachineInstrBuilder extractF64Exponent(Register Hi, 1799 MachineIRBuilder &B) { 1800 const unsigned FractBits = 52; 1801 const unsigned ExpBits = 11; 1802 LLT S32 = LLT::scalar(32); 1803 1804 auto Const0 = B.buildConstant(S32, FractBits - 32); 1805 auto Const1 = B.buildConstant(S32, ExpBits); 1806 1807 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1808 .addUse(Hi) 1809 .addUse(Const0.getReg(0)) 1810 .addUse(Const1.getReg(0)); 1811 1812 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1813 } 1814 1815 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1816 MachineInstr &MI, MachineRegisterInfo &MRI, 1817 MachineIRBuilder &B) const { 1818 const LLT S1 = LLT::scalar(1); 1819 const LLT S32 = LLT::scalar(32); 1820 const LLT S64 = LLT::scalar(64); 1821 1822 Register Src = MI.getOperand(1).getReg(); 1823 assert(MRI.getType(Src) == S64); 1824 1825 // TODO: Should this use extract since the low half is unused? 1826 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1827 Register Hi = Unmerge.getReg(1); 1828 1829 // Extract the upper half, since this is where we will find the sign and 1830 // exponent. 1831 auto Exp = extractF64Exponent(Hi, B); 1832 1833 const unsigned FractBits = 52; 1834 1835 // Extract the sign bit. 1836 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1837 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1838 1839 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1840 1841 const auto Zero32 = B.buildConstant(S32, 0); 1842 1843 // Extend back to 64-bits. 
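// (The sign ends up in the high word so that, when the exponent is negative,
// the select below produces a correctly signed zero.)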
1844 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1845 1846 auto Shr = B.buildAShr(S64, FractMask, Exp); 1847 auto Not = B.buildNot(S64, Shr); 1848 auto Tmp0 = B.buildAnd(S64, Src, Not); 1849 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1850 1851 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1852 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1853 1854 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1855 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1856 MI.eraseFromParent(); 1857 return true; 1858 } 1859 1860 bool AMDGPULegalizerInfo::legalizeITOFP( 1861 MachineInstr &MI, MachineRegisterInfo &MRI, 1862 MachineIRBuilder &B, bool Signed) const { 1863 1864 Register Dst = MI.getOperand(0).getReg(); 1865 Register Src = MI.getOperand(1).getReg(); 1866 1867 const LLT S64 = LLT::scalar(64); 1868 const LLT S32 = LLT::scalar(32); 1869 1870 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1871 1872 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1873 1874 auto CvtHi = Signed ? 1875 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1876 B.buildUITOFP(S64, Unmerge.getReg(1)); 1877 1878 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1879 1880 auto ThirtyTwo = B.buildConstant(S32, 32); 1881 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1882 .addUse(CvtHi.getReg(0)) 1883 .addUse(ThirtyTwo.getReg(0)); 1884 1885 // TODO: Should this propagate fast-math-flags? 1886 B.buildFAdd(Dst, LdExp, CvtLo); 1887 MI.eraseFromParent(); 1888 return true; 1889 } 1890 1891 // TODO: Copied from DAG implementation. Verify logic and document how this 1892 // actually works. 1893 bool AMDGPULegalizerInfo::legalizeFPTOI( 1894 MachineInstr &MI, MachineRegisterInfo &MRI, 1895 MachineIRBuilder &B, bool Signed) const { 1896 1897 Register Dst = MI.getOperand(0).getReg(); 1898 Register Src = MI.getOperand(1).getReg(); 1899 1900 const LLT S64 = LLT::scalar(64); 1901 const LLT S32 = LLT::scalar(32); 1902 1903 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1904 1905 unsigned Flags = MI.getFlags(); 1906 1907 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1908 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1909 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1910 1911 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1912 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1913 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1914 1915 auto Hi = Signed ? 
1916 B.buildFPTOSI(S32, FloorMul) : 1917 B.buildFPTOUI(S32, FloorMul); 1918 auto Lo = B.buildFPTOUI(S32, Fma); 1919 1920 B.buildMerge(Dst, { Lo, Hi }); 1921 MI.eraseFromParent(); 1922 1923 return true; 1924 } 1925 1926 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1927 MachineInstr &MI) const { 1928 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1929 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1930 1931 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1932 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1933 1934 // With ieee_mode disabled, the instructions have the correct behavior 1935 // already for G_FMINNUM/G_FMAXNUM 1936 if (!MFI->getMode().IEEE) 1937 return !IsIEEEOp; 1938 1939 if (IsIEEEOp) 1940 return true; 1941 1942 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1943 } 1944 1945 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1946 MachineInstr &MI, MachineRegisterInfo &MRI, 1947 MachineIRBuilder &B) const { 1948 // TODO: Should move some of this into LegalizerHelper. 1949 1950 // TODO: Promote dynamic indexing of s16 to s32 1951 1952 // FIXME: Artifact combiner probably should have replaced the truncated 1953 // constant before this, so we shouldn't need 1954 // getConstantVRegValWithLookThrough. 1955 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1956 MI.getOperand(2).getReg(), MRI); 1957 if (!IdxVal) // Dynamic case will be selected to register indexing. 1958 return true; 1959 1960 Register Dst = MI.getOperand(0).getReg(); 1961 Register Vec = MI.getOperand(1).getReg(); 1962 1963 LLT VecTy = MRI.getType(Vec); 1964 LLT EltTy = VecTy.getElementType(); 1965 assert(EltTy == MRI.getType(Dst)); 1966 1967 if (IdxVal->Value < VecTy.getNumElements()) 1968 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1969 else 1970 B.buildUndef(Dst); 1971 1972 MI.eraseFromParent(); 1973 return true; 1974 } 1975 1976 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1977 MachineInstr &MI, MachineRegisterInfo &MRI, 1978 MachineIRBuilder &B) const { 1979 // TODO: Should move some of this into LegalizerHelper. 1980 1981 // TODO: Promote dynamic indexing of s16 to s32 1982 1983 // FIXME: Artifact combiner probably should have replaced the truncated 1984 // constant before this, so we shouldn't need 1985 // getConstantVRegValWithLookThrough. 1986 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1987 MI.getOperand(3).getReg(), MRI); 1988 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1989 return true; 1990 1991 Register Dst = MI.getOperand(0).getReg(); 1992 Register Vec = MI.getOperand(1).getReg(); 1993 Register Ins = MI.getOperand(2).getReg(); 1994 1995 LLT VecTy = MRI.getType(Vec); 1996 LLT EltTy = VecTy.getElementType(); 1997 assert(EltTy == MRI.getType(Ins)); 1998 1999 if (IdxVal->Value < VecTy.getNumElements()) 2000 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2001 else 2002 B.buildUndef(Dst); 2003 2004 MI.eraseFromParent(); 2005 return true; 2006 } 2007 2008 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2009 MachineInstr &MI, MachineRegisterInfo &MRI, 2010 MachineIRBuilder &B) const { 2011 const LLT V2S16 = LLT::vector(2, 16); 2012 2013 Register Dst = MI.getOperand(0).getReg(); 2014 Register Src0 = MI.getOperand(1).getReg(); 2015 LLT DstTy = MRI.getType(Dst); 2016 LLT SrcTy = MRI.getType(Src0); 2017 2018 if (SrcTy == V2S16 && DstTy == V2S16 && 2019 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2020 return true; 2021 2022 MachineIRBuilder HelperBuilder(MI); 2023 GISelObserverWrapper DummyObserver; 2024 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2025 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2026 } 2027 2028 bool AMDGPULegalizerInfo::legalizeSinCos( 2029 MachineInstr &MI, MachineRegisterInfo &MRI, 2030 MachineIRBuilder &B) const { 2031 2032 Register DstReg = MI.getOperand(0).getReg(); 2033 Register SrcReg = MI.getOperand(1).getReg(); 2034 LLT Ty = MRI.getType(DstReg); 2035 unsigned Flags = MI.getFlags(); 2036 2037 Register TrigVal; 2038 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2039 if (ST.hasTrigReducedRange()) { 2040 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2041 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2042 .addUse(MulVal.getReg(0)) 2043 .setMIFlags(Flags).getReg(0); 2044 } else 2045 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2046 2047 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2048 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2049 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2050 .addUse(TrigVal) 2051 .setMIFlags(Flags); 2052 MI.eraseFromParent(); 2053 return true; 2054 } 2055 2056 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2057 MachineIRBuilder &B, 2058 const GlobalValue *GV, 2059 int64_t Offset, 2060 unsigned GAFlags) const { 2061 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2062 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2063 // to the following code sequence: 2064 // 2065 // For constant address space: 2066 // s_getpc_b64 s[0:1] 2067 // s_add_u32 s0, s0, $symbol 2068 // s_addc_u32 s1, s1, 0 2069 // 2070 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2071 // a fixup or relocation is emitted to replace $symbol with a literal 2072 // constant, which is a pc-relative offset from the encoding of the $symbol 2073 // operand to the global variable. 
2074 // 2075 // For global address space: 2076 // s_getpc_b64 s[0:1] 2077 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2078 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2079 // 2080 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2081 // fixups or relocations are emitted to replace $symbol@*@lo and 2082 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2083 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2084 // operand to the global variable. 2085 // 2086 // What we want here is an offset from the value returned by s_getpc 2087 // (which is the address of the s_add_u32 instruction) to the global 2088 // variable, but since the encoding of $symbol starts 4 bytes after the start 2089 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2090 // small. This requires us to add 4 to the global variable offset in order to 2091 // compute the correct address. 2092 2093 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2094 2095 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2096 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2097 2098 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2099 .addDef(PCReg); 2100 2101 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2102 if (GAFlags == SIInstrInfo::MO_NONE) 2103 MIB.addImm(0); 2104 else 2105 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2106 2107 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2108 2109 if (PtrTy.getSizeInBits() == 32) 2110 B.buildExtract(DstReg, PCReg, 0); 2111 return true; 2112 } 2113 2114 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2115 MachineInstr &MI, MachineRegisterInfo &MRI, 2116 MachineIRBuilder &B) const { 2117 Register DstReg = MI.getOperand(0).getReg(); 2118 LLT Ty = MRI.getType(DstReg); 2119 unsigned AS = Ty.getAddressSpace(); 2120 2121 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2122 MachineFunction &MF = B.getMF(); 2123 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2124 2125 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2126 if (!MFI->isEntryFunction()) { 2127 const Function &Fn = MF.getFunction(); 2128 DiagnosticInfoUnsupported BadLDSDecl( 2129 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2130 DS_Warning); 2131 Fn.getContext().diagnose(BadLDSDecl); 2132 2133 // We currently don't have a way to correctly allocate LDS objects that 2134 // aren't directly associated with a kernel. We do force inlining of 2135 // functions that use local objects. However, if these dead functions are 2136 // not eliminated, we don't want a compile time error. Just emit a warning 2137 // and a trap, since there should be no callable path here. 2138 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2139 B.buildUndef(DstReg); 2140 MI.eraseFromParent(); 2141 return true; 2142 } 2143 2144 // TODO: We could emit code to handle the initialization somewhere. 
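// An LDS global without an initializer either keeps the G_GLOBAL_VALUE,
// tagged as an absolute 32-bit symbol, or is folded below into a constant
// offset in the kernel's LDS allocation; one with an initializer is
// diagnosed as unsupported.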
2145 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2146 const SITargetLowering *TLI = ST.getTargetLowering();
2147 if (!TLI->shouldUseLDSConstAddress(GV)) {
2148 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2149 return true; // Leave in place.
2150 }
2151
2152 B.buildConstant(
2153 DstReg,
2154 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2155 MI.eraseFromParent();
2156 return true;
2157 }
2158
2159 const Function &Fn = MF.getFunction();
2160 DiagnosticInfoUnsupported BadInit(
2161 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2162 Fn.getContext().diagnose(BadInit);
2163 return true;
2164 }
2165
2166 const SITargetLowering *TLI = ST.getTargetLowering();
2167
2168 if (TLI->shouldEmitFixup(GV)) {
2169 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2170 MI.eraseFromParent();
2171 return true;
2172 }
2173
2174 if (TLI->shouldEmitPCReloc(GV)) {
2175 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2176 MI.eraseFromParent();
2177 return true;
2178 }
2179
2180 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2181 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2182
2183 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2184 MachinePointerInfo::getGOT(MF),
2185 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2186 MachineMemOperand::MOInvariant,
2187 8 /*Size*/, Align(8));
2188
2189 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2190
2191 if (Ty.getSizeInBits() == 32) {
2192 // Truncate if this is a 32-bit constant address.
2193 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2194 B.buildExtract(DstReg, Load, 0);
2195 } else
2196 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2197
2198 MI.eraseFromParent();
2199 return true;
2200 }
2201
2202 bool AMDGPULegalizerInfo::legalizeLoad(
2203 MachineInstr &MI, MachineRegisterInfo &MRI,
2204 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2205 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2206 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2207 Observer.changingInstr(MI);
2208 MI.getOperand(1).setReg(Cast.getReg(0));
2209 Observer.changedInstr(MI);
2210 return true;
2211 }
2212
2213 bool AMDGPULegalizerInfo::legalizeFMad(
2214 MachineInstr &MI, MachineRegisterInfo &MRI,
2215 MachineIRBuilder &B) const {
2216 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2217 assert(Ty.isScalar());
2218
2219 MachineFunction &MF = B.getMF();
2220 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2221
2222 // TODO: Always legal with future ftz flag.
2223 // FIXME: Do we need just output?
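// G_FMAD is presumably only kept legal when the corresponding denormal mode
// is flushed, since the mad/mac instructions it selects to do not honor
// denormals; otherwise fall back to LegalizerHelper's generic expansion.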
2224 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2225 return true; 2226 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2227 return true; 2228 2229 MachineIRBuilder HelperBuilder(MI); 2230 GISelObserverWrapper DummyObserver; 2231 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2232 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2233 } 2234 2235 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2236 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2237 Register DstReg = MI.getOperand(0).getReg(); 2238 Register PtrReg = MI.getOperand(1).getReg(); 2239 Register CmpVal = MI.getOperand(2).getReg(); 2240 Register NewVal = MI.getOperand(3).getReg(); 2241 2242 assert(SITargetLowering::isFlatGlobalAddrSpace( 2243 MRI.getType(PtrReg).getAddressSpace()) && 2244 "this should not have been custom lowered"); 2245 2246 LLT ValTy = MRI.getType(CmpVal); 2247 LLT VecTy = LLT::vector(2, ValTy); 2248 2249 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2250 2251 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2252 .addDef(DstReg) 2253 .addUse(PtrReg) 2254 .addUse(PackedVal) 2255 .setMemRefs(MI.memoperands()); 2256 2257 MI.eraseFromParent(); 2258 return true; 2259 } 2260 2261 bool AMDGPULegalizerInfo::legalizeFlog( 2262 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2263 Register Dst = MI.getOperand(0).getReg(); 2264 Register Src = MI.getOperand(1).getReg(); 2265 LLT Ty = B.getMRI()->getType(Dst); 2266 unsigned Flags = MI.getFlags(); 2267 2268 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2269 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2270 2271 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2272 MI.eraseFromParent(); 2273 return true; 2274 } 2275 2276 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2277 MachineIRBuilder &B) const { 2278 Register Dst = MI.getOperand(0).getReg(); 2279 Register Src = MI.getOperand(1).getReg(); 2280 unsigned Flags = MI.getFlags(); 2281 LLT Ty = B.getMRI()->getType(Dst); 2282 2283 auto K = B.buildFConstant(Ty, numbers::log2e); 2284 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2285 B.buildFExp2(Dst, Mul, Flags); 2286 MI.eraseFromParent(); 2287 return true; 2288 } 2289 2290 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2291 MachineIRBuilder &B) const { 2292 Register Dst = MI.getOperand(0).getReg(); 2293 Register Src0 = MI.getOperand(1).getReg(); 2294 Register Src1 = MI.getOperand(2).getReg(); 2295 unsigned Flags = MI.getFlags(); 2296 LLT Ty = B.getMRI()->getType(Dst); 2297 const LLT S16 = LLT::scalar(16); 2298 const LLT S32 = LLT::scalar(32); 2299 2300 if (Ty == S32) { 2301 auto Log = B.buildFLog2(S32, Src0, Flags); 2302 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2303 .addUse(Log.getReg(0)) 2304 .addUse(Src1) 2305 .setMIFlags(Flags); 2306 B.buildFExp2(Dst, Mul, Flags); 2307 } else if (Ty == S16) { 2308 // There's no f16 fmul_legacy, so we need to convert for it. 
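// (Both the f32 and f16 paths use the identity pow(x, y) = exp2(y * log2(x)),
// with the multiply performed as fmul_legacy.)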
2309 auto Log = B.buildFLog2(S16, Src0, Flags); 2310 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2311 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2312 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2313 .addUse(Ext0.getReg(0)) 2314 .addUse(Ext1.getReg(0)) 2315 .setMIFlags(Flags); 2316 2317 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2318 } else 2319 return false; 2320 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 // Find a source register, ignoring any possible source modifiers. 2326 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2327 Register ModSrc = OrigSrc; 2328 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2329 ModSrc = SrcFNeg->getOperand(1).getReg(); 2330 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2331 ModSrc = SrcFAbs->getOperand(1).getReg(); 2332 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2333 ModSrc = SrcFAbs->getOperand(1).getReg(); 2334 return ModSrc; 2335 } 2336 2337 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2338 MachineRegisterInfo &MRI, 2339 MachineIRBuilder &B) const { 2340 2341 const LLT S1 = LLT::scalar(1); 2342 const LLT S64 = LLT::scalar(64); 2343 Register Dst = MI.getOperand(0).getReg(); 2344 Register OrigSrc = MI.getOperand(1).getReg(); 2345 unsigned Flags = MI.getFlags(); 2346 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2347 "this should not have been custom lowered"); 2348 2349 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2350 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2351 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2352 // V_FRACT bug is: 2353 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2354 // 2355 // Convert floor(x) to (x - fract(x)) 2356 2357 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2358 .addUse(OrigSrc) 2359 .setMIFlags(Flags); 2360 2361 // Give source modifier matching some assistance before obscuring a foldable 2362 // pattern. 2363 2364 // TODO: We can avoid the neg on the fract? The input sign to fract 2365 // shouldn't matter? 2366 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2367 2368 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2369 2370 Register Min = MRI.createGenericVirtualRegister(S64); 2371 2372 // We don't need to concern ourselves with the snan handling difference, so 2373 // use the one which will directly select. 2374 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2375 if (MFI->getMode().IEEE) 2376 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2377 else 2378 B.buildFMinNum(Min, Fract, Const, Flags); 2379 2380 Register CorrectedFract = Min; 2381 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2382 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2383 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2384 } 2385 2386 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2387 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2388 2389 MI.eraseFromParent(); 2390 return true; 2391 } 2392 2393 // Turn an illegal packed v2s16 build vector into bit operations. 2394 // TODO: This should probably be a bitcast action in LegalizerHelper. 
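// For example (sketch of the transform below):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)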
2395 bool AMDGPULegalizerInfo::legalizeBuildVector( 2396 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2397 Register Dst = MI.getOperand(0).getReg(); 2398 const LLT S32 = LLT::scalar(32); 2399 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2400 2401 Register Src0 = MI.getOperand(1).getReg(); 2402 Register Src1 = MI.getOperand(2).getReg(); 2403 assert(MRI.getType(Src0) == LLT::scalar(16)); 2404 2405 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2406 B.buildBitcast(Dst, Merge); 2407 2408 MI.eraseFromParent(); 2409 return true; 2410 } 2411 2412 // Return the use branch instruction, otherwise null if the usage is invalid. 2413 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2414 MachineRegisterInfo &MRI, 2415 MachineInstr *&Br, 2416 MachineBasicBlock *&UncondBrTarget) { 2417 Register CondDef = MI.getOperand(0).getReg(); 2418 if (!MRI.hasOneNonDBGUse(CondDef)) 2419 return nullptr; 2420 2421 MachineBasicBlock *Parent = MI.getParent(); 2422 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2423 if (UseMI.getParent() != Parent || 2424 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2425 return nullptr; 2426 2427 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2428 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2429 if (Next == Parent->end()) { 2430 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2431 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2432 return nullptr; 2433 UncondBrTarget = &*NextMBB; 2434 } else { 2435 if (Next->getOpcode() != AMDGPU::G_BR) 2436 return nullptr; 2437 Br = &*Next; 2438 UncondBrTarget = Br->getOperand(0).getMBB(); 2439 } 2440 2441 return &UseMI; 2442 } 2443 2444 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2445 MachineRegisterInfo &MRI, 2446 Register LiveIn, 2447 Register PhyReg) const { 2448 assert(PhyReg.isPhysical() && "Physical register expected"); 2449 2450 // Insert the live-in copy, if required, by defining destination virtual 2451 // register. 2452 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
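// The copy from the physical register is materialized once, at the top of the
// entry block; later queries for the same live-in reuse that virtual register.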
2453 if (!MRI.getVRegDef(LiveIn)) {
2454 // FIXME: Should have scoped insert pt
2455 MachineBasicBlock &OrigInsBB = B.getMBB();
2456 auto OrigInsPt = B.getInsertPt();
2457
2458 MachineBasicBlock &EntryMBB = B.getMF().front();
2459 EntryMBB.addLiveIn(PhyReg);
2460 B.setInsertPt(EntryMBB, EntryMBB.begin());
2461 B.buildCopy(LiveIn, PhyReg);
2462
2463 B.setInsertPt(OrigInsBB, OrigInsPt);
2464 }
2465
2466 return LiveIn;
2467 }
2468
2469 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2470 MachineRegisterInfo &MRI,
2471 Register PhyReg, LLT Ty,
2472 bool InsertLiveInCopy) const {
2473 assert(PhyReg.isPhysical() && "Physical register expected");
2474
2475 // Get or create virtual live-in register
2476 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2477 if (!LiveIn) {
2478 LiveIn = MRI.createGenericVirtualRegister(Ty);
2479 MRI.addLiveIn(PhyReg, LiveIn);
2480 }
2481
2482 // When the actual copy required is from a virtual register to a physical
2483 // register (to be inserted later), inserting a live-in copy from the
2484 // physical register to the virtual register is not required.
2485 if (!InsertLiveInCopy)
2486 return LiveIn;
2487
2488 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2489 }
2490
2491 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2492 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2493 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2494 const ArgDescriptor *Arg;
2495 const TargetRegisterClass *RC;
2496 LLT ArgTy;
2497 std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2498 if (!Arg) {
2499 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2500 return nullptr;
2501 }
2502 return Arg;
2503 }
2504
2505 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2506 const ArgDescriptor *Arg) const {
2507 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2508 return false; // TODO: Handle these
2509
2510 Register SrcReg = Arg->getRegister();
2511 assert(SrcReg.isPhysical() && "Physical register expected");
2512 assert(DstReg.isVirtual() && "Virtual register expected");
2513
2514 MachineRegisterInfo &MRI = *B.getMRI();
2515
2516 LLT Ty = MRI.getType(DstReg);
2517 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2518
2519 if (Arg->isMasked()) {
2520 // TODO: Should we try to emit this once in the entry block?
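// A masked argument packs several values into one 32-bit register (e.g.
// packed workitem IDs), so the value is recovered as
//   (Reg >> Shift) & (Mask >> Shift), with Shift = countTrailingZeros(Mask).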
2521 const LLT S32 = LLT::scalar(32); 2522 const unsigned Mask = Arg->getMask(); 2523 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2524 2525 Register AndMaskSrc = LiveIn; 2526 2527 if (Shift != 0) { 2528 auto ShiftAmt = B.buildConstant(S32, Shift); 2529 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2530 } 2531 2532 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2533 } else { 2534 B.buildCopy(DstReg, LiveIn); 2535 } 2536 2537 return true; 2538 } 2539 2540 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2541 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2542 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2543 2544 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2545 if (!Arg) 2546 return false; 2547 2548 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2549 return false; 2550 2551 MI.eraseFromParent(); 2552 return true; 2553 } 2554 2555 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2556 MachineRegisterInfo &MRI, 2557 MachineIRBuilder &B) const { 2558 Register Dst = MI.getOperand(0).getReg(); 2559 LLT DstTy = MRI.getType(Dst); 2560 LLT S16 = LLT::scalar(16); 2561 LLT S32 = LLT::scalar(32); 2562 LLT S64 = LLT::scalar(64); 2563 2564 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2565 return true; 2566 2567 if (DstTy == S16) 2568 return legalizeFDIV16(MI, MRI, B); 2569 if (DstTy == S32) 2570 return legalizeFDIV32(MI, MRI, B); 2571 if (DstTy == S64) 2572 return legalizeFDIV64(MI, MRI, B); 2573 2574 return false; 2575 } 2576 2577 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2578 Register DstReg, 2579 Register X, 2580 Register Y, 2581 bool IsDiv) const { 2582 const LLT S1 = LLT::scalar(1); 2583 const LLT S32 = LLT::scalar(32); 2584 2585 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2586 // algorithm used here. 2587 2588 // Initial estimate of inv(y). 2589 auto FloatY = B.buildUITOFP(S32, Y); 2590 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2591 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2592 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2593 auto Z = B.buildFPTOUI(S32, ScaledY); 2594 2595 // One round of UNR. 2596 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2597 auto NegYZ = B.buildMul(S32, NegY, Z); 2598 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2599 2600 // Quotient/remainder estimate. 2601 auto Q = B.buildUMulH(S32, X, Z); 2602 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2603 2604 // First quotient/remainder refinement. 2605 auto One = B.buildConstant(S32, 1); 2606 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2607 if (IsDiv) 2608 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2609 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2610 2611 // Second quotient/remainder refinement. 
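// (After one Newton-Raphson step on the reciprocal, the quotient estimate is
// believed to be short by at most two, so two conditional corrections are
// enough; see AMDGPUCodeGenPrepare::expandDivRem32 for the full argument.)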
2612 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2613 if (IsDiv)
2614 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2615 else
2616 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2617 }
2618
2619 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2620 MachineRegisterInfo &MRI,
2621 MachineIRBuilder &B) const {
2622 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2623 Register DstReg = MI.getOperand(0).getReg();
2624 Register Num = MI.getOperand(1).getReg();
2625 Register Den = MI.getOperand(2).getReg();
2626 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2627 MI.eraseFromParent();
2628 return true;
2629 }
2630
2631 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2632 //
2633 // Return lo, hi of result
2634 //
2635 // %cvt.lo = G_UITOFP Val.lo
2636 // %cvt.hi = G_UITOFP Val.hi
2637 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2638 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2639 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2640 // %mul2 = G_FMUL %mul1, 2**(-32)
2641 // %trunc = G_INTRINSIC_TRUNC %mul2
2642 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2643 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2644 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2645 Register Val) {
2646 const LLT S32 = LLT::scalar(32);
2647 auto Unmerge = B.buildUnmerge(S32, Val);
2648
2649 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2650 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2651
2652 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2653 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2654
2655 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2656 auto Mul1 =
2657 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2658
2659 // 2**(-32)
2660 auto Mul2 =
2661 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2662 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2663
2664 // -(2**32)
2665 auto Mad2 = B.buildFMAD(S32, Trunc,
2666 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2667
2668 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2669 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2670
2671 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2672 }
2673
2674 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2675 Register DstReg,
2676 Register Numer,
2677 Register Denom,
2678 bool IsDiv) const {
2679 const LLT S32 = LLT::scalar(32);
2680 const LLT S64 = LLT::scalar(64);
2681 const LLT S1 = LLT::scalar(1);
2682 Register RcpLo, RcpHi;
2683
2684 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2685
2686 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2687
2688 auto Zero64 = B.buildConstant(S64, 0);
2689 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2690
2691 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2692 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2693
2694 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2695 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2696 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2697
2698 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2699 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2700 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2701 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2702
2703 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2704 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2705 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2706 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2707 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2708
2709 auto Zero32 =
B.buildConstant(S32, 0); 2710 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2711 auto Add2_HiC = 2712 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2713 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2714 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2715 2716 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2717 Register NumerLo = UnmergeNumer.getReg(0); 2718 Register NumerHi = UnmergeNumer.getReg(1); 2719 2720 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2721 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2722 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2723 Register Mul3_Lo = UnmergeMul3.getReg(0); 2724 Register Mul3_Hi = UnmergeMul3.getReg(1); 2725 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2726 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2727 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2728 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2729 2730 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2731 Register DenomLo = UnmergeDenom.getReg(0); 2732 Register DenomHi = UnmergeDenom.getReg(1); 2733 2734 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2735 auto C1 = B.buildSExt(S32, CmpHi); 2736 2737 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2738 auto C2 = B.buildSExt(S32, CmpLo); 2739 2740 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2741 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2742 2743 // TODO: Here and below portions of the code can be enclosed into if/endif. 2744 // Currently control flow is unconditional and we have 4 selects after 2745 // potential endif to substitute PHIs. 2746 2747 // if C3 != 0 ... 2748 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2749 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2750 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2751 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2752 2753 auto One64 = B.buildConstant(S64, 1); 2754 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2755 2756 auto C4 = 2757 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2758 auto C5 = 2759 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2760 auto C6 = B.buildSelect( 2761 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2762 2763 // if (C6 != 0) 2764 auto Add4 = B.buildAdd(S64, Add3, One64); 2765 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2766 2767 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2768 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2769 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2770 2771 // endif C6 2772 // endif C3 2773 2774 if (IsDiv) { 2775 auto Sel1 = B.buildSelect( 2776 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2777 B.buildSelect(DstReg, 2778 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2779 } else { 2780 auto Sel2 = B.buildSelect( 2781 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2782 B.buildSelect(DstReg, 2783 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2784 } 2785 } 2786 2787 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2788 MachineRegisterInfo &MRI, 2789 MachineIRBuilder &B) const { 2790 const LLT S64 = LLT::scalar(64); 2791 const LLT S32 = LLT::scalar(32); 2792 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2793 Register DstReg = MI.getOperand(0).getReg(); 2794 Register Num 
= MI.getOperand(1).getReg(); 2795 Register Den = MI.getOperand(2).getReg(); 2796 LLT Ty = MRI.getType(DstReg); 2797 2798 if (Ty == S32) 2799 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2800 else if (Ty == S64) 2801 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2802 else 2803 return false; 2804 2805 MI.eraseFromParent(); 2806 return true; 2807 2808 } 2809 2810 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2811 MachineRegisterInfo &MRI, 2812 MachineIRBuilder &B) const { 2813 const LLT S64 = LLT::scalar(64); 2814 const LLT S32 = LLT::scalar(32); 2815 2816 Register DstReg = MI.getOperand(0).getReg(); 2817 const LLT Ty = MRI.getType(DstReg); 2818 if (Ty != S32 && Ty != S64) 2819 return false; 2820 2821 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2822 2823 Register LHS = MI.getOperand(1).getReg(); 2824 Register RHS = MI.getOperand(2).getReg(); 2825 2826 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2827 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2828 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2829 2830 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2831 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2832 2833 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2834 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2835 2836 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2837 if (Ty == S32) 2838 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2839 else 2840 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2841 2842 Register Sign; 2843 if (IsDiv) 2844 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2845 else 2846 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2847 2848 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2849 B.buildSub(DstReg, UDivRem, Sign); 2850 2851 MI.eraseFromParent(); 2852 return true; 2853 } 2854 2855 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2856 MachineRegisterInfo &MRI, 2857 MachineIRBuilder &B) const { 2858 Register Res = MI.getOperand(0).getReg(); 2859 Register LHS = MI.getOperand(1).getReg(); 2860 Register RHS = MI.getOperand(2).getReg(); 2861 2862 uint16_t Flags = MI.getFlags(); 2863 2864 LLT ResTy = MRI.getType(Res); 2865 LLT S32 = LLT::scalar(32); 2866 LLT S64 = LLT::scalar(64); 2867 2868 const MachineFunction &MF = B.getMF(); 2869 bool Unsafe = 2870 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2871 2872 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2873 return false; 2874 2875 if (!Unsafe && ResTy == S32 && 2876 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2877 return false; 2878 2879 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2880 // 1 / x -> RCP(x) 2881 if (CLHS->isExactlyValue(1.0)) { 2882 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2883 .addUse(RHS) 2884 .setMIFlags(Flags); 2885 2886 MI.eraseFromParent(); 2887 return true; 2888 } 2889 2890 // -1 / x -> RCP( FNEG(x) ) 2891 if (CLHS->isExactlyValue(-1.0)) { 2892 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2893 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2894 .addUse(FNeg.getReg(0)) 2895 .setMIFlags(Flags); 2896 2897 MI.eraseFromParent(); 2898 return true; 2899 } 2900 } 2901 2902 // x / y -> x * (1.0 / y) 2903 if (Unsafe) { 2904 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2905 .addUse(RHS) 2906 .setMIFlags(Flags); 2907 B.buildFMul(Res, LHS, RCP, Flags); 2908 2909 MI.eraseFromParent(); 2910 return true; 2911 } 2912 2913 return false; 2914 } 2915 2916 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2917 MachineRegisterInfo &MRI, 2918 MachineIRBuilder &B) const { 2919 Register Res = MI.getOperand(0).getReg(); 2920 Register LHS = MI.getOperand(1).getReg(); 2921 Register RHS = MI.getOperand(2).getReg(); 2922 2923 uint16_t Flags = MI.getFlags(); 2924 2925 LLT S16 = LLT::scalar(16); 2926 LLT S32 = LLT::scalar(32); 2927 2928 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2929 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2930 2931 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2932 .addUse(RHSExt.getReg(0)) 2933 .setMIFlags(Flags); 2934 2935 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2936 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2937 2938 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2939 .addUse(RDst.getReg(0)) 2940 .addUse(RHS) 2941 .addUse(LHS) 2942 .setMIFlags(Flags); 2943 2944 MI.eraseFromParent(); 2945 return true; 2946 } 2947 2948 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2949 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2950 static void toggleSPDenormMode(bool Enable, 2951 MachineIRBuilder &B, 2952 const GCNSubtarget &ST, 2953 AMDGPU::SIModeRegisterDefaults Mode) { 2954 // Set SP denorm mode to this value. 2955 unsigned SPDenormMode = 2956 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2957 2958 if (ST.hasDenormModeInst()) { 2959 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2960 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2961 2962 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2963 B.buildInstr(AMDGPU::S_DENORM_MODE) 2964 .addImm(NewDenormModeValue); 2965 2966 } else { 2967 // Select FP32 bit field in mode register. 2968 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2969 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2970 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2971 2972 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2973 .addImm(SPDenormMode) 2974 .addImm(SPDenormModeBitField); 2975 } 2976 } 2977 2978 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2979 MachineRegisterInfo &MRI, 2980 MachineIRBuilder &B) const { 2981 Register Res = MI.getOperand(0).getReg(); 2982 Register LHS = MI.getOperand(1).getReg(); 2983 Register RHS = MI.getOperand(2).getReg(); 2984 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2985 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2986 2987 uint16_t Flags = MI.getFlags(); 2988 2989 LLT S32 = LLT::scalar(32); 2990 LLT S1 = LLT::scalar(1); 2991 2992 auto One = B.buildFConstant(S32, 1.0f); 2993 2994 auto DenominatorScaled = 2995 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2996 .addUse(LHS) 2997 .addUse(RHS) 2998 .addImm(0) 2999 .setMIFlags(Flags); 3000 auto NumeratorScaled = 3001 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3002 .addUse(LHS) 3003 .addUse(RHS) 3004 .addImm(1) 3005 .setMIFlags(Flags); 3006 3007 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3008 .addUse(DenominatorScaled.getReg(0)) 3009 .setMIFlags(Flags); 3010 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3011 3012 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3013 // aren't modeled as reading it. 
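// The FMA chain below is a standard div_scale/rcp refinement: Fma0 is the
// reciprocal error (1 - d * rcp), Fma1 the refined reciprocal, and
// Fma2..Fma4 refine the scaled quotient before div_fmas/div_fixup undo the
// scaling. FP32 denormals are temporarily enabled (when flushed by default),
// presumably so the intermediate steps don't lose accuracy to flushing.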
3014 if (!Mode.allFP32Denormals()) 3015 toggleSPDenormMode(true, B, ST, Mode); 3016 3017 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3018 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3019 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3020 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3021 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3022 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3023 3024 if (!Mode.allFP32Denormals()) 3025 toggleSPDenormMode(false, B, ST, Mode); 3026 3027 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3028 .addUse(Fma4.getReg(0)) 3029 .addUse(Fma1.getReg(0)) 3030 .addUse(Fma3.getReg(0)) 3031 .addUse(NumeratorScaled.getReg(1)) 3032 .setMIFlags(Flags); 3033 3034 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3035 .addUse(Fmas.getReg(0)) 3036 .addUse(RHS) 3037 .addUse(LHS) 3038 .setMIFlags(Flags); 3039 3040 MI.eraseFromParent(); 3041 return true; 3042 } 3043 3044 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3045 MachineRegisterInfo &MRI, 3046 MachineIRBuilder &B) const { 3047 Register Res = MI.getOperand(0).getReg(); 3048 Register LHS = MI.getOperand(1).getReg(); 3049 Register RHS = MI.getOperand(2).getReg(); 3050 3051 uint16_t Flags = MI.getFlags(); 3052 3053 LLT S64 = LLT::scalar(64); 3054 LLT S1 = LLT::scalar(1); 3055 3056 auto One = B.buildFConstant(S64, 1.0); 3057 3058 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3059 .addUse(LHS) 3060 .addUse(RHS) 3061 .addImm(0) 3062 .setMIFlags(Flags); 3063 3064 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3065 3066 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3067 .addUse(DivScale0.getReg(0)) 3068 .setMIFlags(Flags); 3069 3070 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3071 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3072 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3073 3074 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3075 .addUse(LHS) 3076 .addUse(RHS) 3077 .addImm(1) 3078 .setMIFlags(Flags); 3079 3080 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3081 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3082 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3083 3084 Register Scale; 3085 if (!ST.hasUsableDivScaleConditionOutput()) { 3086 // Workaround a hardware bug on SI where the condition output from div_scale 3087 // is not usable. 
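// The flag is reconstructed below by comparing the high halves of the
// div_scale results against the original operands and xor'ing the two
// comparisons.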
3088 3089 LLT S32 = LLT::scalar(32); 3090 3091 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3092 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3093 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3094 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3095 3096 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3097 Scale1Unmerge.getReg(1)); 3098 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3099 Scale0Unmerge.getReg(1)); 3100 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3101 } else { 3102 Scale = DivScale1.getReg(1); 3103 } 3104 3105 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3106 .addUse(Fma4.getReg(0)) 3107 .addUse(Fma3.getReg(0)) 3108 .addUse(Mul.getReg(0)) 3109 .addUse(Scale) 3110 .setMIFlags(Flags); 3111 3112 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3113 .addUse(Fmas.getReg(0)) 3114 .addUse(RHS) 3115 .addUse(LHS) 3116 .setMIFlags(Flags); 3117 3118 MI.eraseFromParent(); 3119 return true; 3120 } 3121 3122 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3123 MachineRegisterInfo &MRI, 3124 MachineIRBuilder &B) const { 3125 Register Res = MI.getOperand(0).getReg(); 3126 Register LHS = MI.getOperand(2).getReg(); 3127 Register RHS = MI.getOperand(3).getReg(); 3128 uint16_t Flags = MI.getFlags(); 3129 3130 LLT S32 = LLT::scalar(32); 3131 LLT S1 = LLT::scalar(1); 3132 3133 auto Abs = B.buildFAbs(S32, RHS, Flags); 3134 const APFloat C0Val(1.0f); 3135 3136 auto C0 = B.buildConstant(S32, 0x6f800000); 3137 auto C1 = B.buildConstant(S32, 0x2f800000); 3138 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3139 3140 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3141 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3142 3143 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3144 3145 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3146 .addUse(Mul0.getReg(0)) 3147 .setMIFlags(Flags); 3148 3149 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3150 3151 B.buildFMul(Res, Sel, Mul1, Flags); 3152 3153 MI.eraseFromParent(); 3154 return true; 3155 } 3156 3157 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3158 MachineRegisterInfo &MRI, 3159 MachineIRBuilder &B) const { 3160 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3161 uint64_t Offset = 3162 ST.getTargetLowering()->getImplicitParameterOffset( 3163 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3164 LLT DstTy = MRI.getType(DstReg); 3165 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3166 3167 const ArgDescriptor *Arg; 3168 const TargetRegisterClass *RC; 3169 LLT ArgTy; 3170 std::tie(Arg, RC, ArgTy) = 3171 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3172 if (!Arg) 3173 return false; 3174 3175 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3176 if (!loadInputValue(KernargPtrReg, B, Arg)) 3177 return false; 3178 3179 // FIXME: This should be nuw 3180 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3181 return true; 3182 } 3183 3184 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3185 MachineRegisterInfo &MRI, 3186 MachineIRBuilder &B) const { 3187 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3188 if (!MFI->isEntryFunction()) { 3189 return legalizePreloadedArgIntrin(MI, MRI, B, 3190 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3191 } 3192 3193 Register DstReg = MI.getOperand(0).getReg(); 3194 if 
(!getImplicitArgPtr(DstReg, MRI, B)) 3195 return false; 3196 3197 MI.eraseFromParent(); 3198 return true; 3199 } 3200 3201 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3202 MachineRegisterInfo &MRI, 3203 MachineIRBuilder &B, 3204 unsigned AddrSpace) const { 3205 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3206 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3207 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3208 MI.eraseFromParent(); 3209 return true; 3210 } 3211 3212 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3213 // offset (the offset that is included in bounds checking and swizzling, to be 3214 // split between the instruction's voffset and immoffset fields) and soffset 3215 // (the offset that is excluded from bounds checking and swizzling, to go in 3216 // the instruction's soffset field). This function takes the first kind of 3217 // offset and figures out how to split it between voffset and immoffset. 3218 std::tuple<Register, unsigned, unsigned> 3219 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3220 Register OrigOffset) const { 3221 const unsigned MaxImm = 4095; 3222 Register BaseReg; 3223 unsigned TotalConstOffset; 3224 MachineInstr *OffsetDef; 3225 const LLT S32 = LLT::scalar(32); 3226 3227 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3228 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3229 3230 unsigned ImmOffset = TotalConstOffset; 3231 3232 // If the immediate value is too big for the immoffset field, put the value 3233 // and -4096 into the immoffset field so that the value that is copied/added 3234 // for the voffset field is a multiple of 4096, and it stands more chance 3235 // of being CSEd with the copy/add for another similar load/store. 3236 // However, do not do that rounding down to a multiple of 4096 if that is a 3237 // negative number, as it appears to be illegal to have a negative offset 3238 // in the vgpr, even if adding the immediate offset makes it positive. 3239 unsigned Overflow = ImmOffset & ~MaxImm; 3240 ImmOffset -= Overflow; 3241 if ((int32_t)Overflow < 0) { 3242 Overflow += ImmOffset; 3243 ImmOffset = 0; 3244 } 3245 3246 if (Overflow != 0) { 3247 if (!BaseReg) { 3248 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3249 } else { 3250 auto OverflowVal = B.buildConstant(S32, Overflow); 3251 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3252 } 3253 } 3254 3255 if (!BaseReg) 3256 BaseReg = B.buildConstant(S32, 0).getReg(0); 3257 3258 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3259 } 3260 3261 /// Handle register layout difference for f16 images for some subtargets. 
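/// On subtargets with unpacked d16 memory instructions, each 16-bit element
/// occupies a full 32-bit lane in the data registers, so a <N x s16> source is
/// any-extended element-wise to <N x s32> before being used as vdata.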
3262 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3263 MachineRegisterInfo &MRI, 3264 Register Reg) const { 3265 if (!ST.hasUnpackedD16VMem()) 3266 return Reg; 3267 3268 const LLT S16 = LLT::scalar(16); 3269 const LLT S32 = LLT::scalar(32); 3270 LLT StoreVT = MRI.getType(Reg); 3271 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3272 3273 auto Unmerge = B.buildUnmerge(S16, Reg); 3274 3275 SmallVector<Register, 4> WideRegs; 3276 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3277 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3278 3279 int NumElts = StoreVT.getNumElements(); 3280 3281 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3282 } 3283 3284 Register AMDGPULegalizerInfo::fixStoreSourceType( 3285 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3286 MachineRegisterInfo *MRI = B.getMRI(); 3287 LLT Ty = MRI->getType(VData); 3288 3289 const LLT S16 = LLT::scalar(16); 3290 3291 // Fixup illegal register types for i8 stores. 3292 if (Ty == LLT::scalar(8) || Ty == S16) { 3293 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3294 return AnyExt; 3295 } 3296 3297 if (Ty.isVector()) { 3298 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3299 if (IsFormat) 3300 return handleD16VData(B, *MRI, VData); 3301 } 3302 } 3303 3304 return VData; 3305 } 3306 3307 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3308 MachineRegisterInfo &MRI, 3309 MachineIRBuilder &B, 3310 bool IsTyped, 3311 bool IsFormat) const { 3312 Register VData = MI.getOperand(1).getReg(); 3313 LLT Ty = MRI.getType(VData); 3314 LLT EltTy = Ty.getScalarType(); 3315 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3316 const LLT S32 = LLT::scalar(32); 3317 3318 VData = fixStoreSourceType(B, VData, IsFormat); 3319 Register RSrc = MI.getOperand(2).getReg(); 3320 3321 MachineMemOperand *MMO = *MI.memoperands_begin(); 3322 const int MemSize = MMO->getSize(); 3323 3324 unsigned ImmOffset; 3325 unsigned TotalOffset; 3326 3327 // The typed intrinsics add an immediate after the registers. 3328 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3329 3330 // The struct intrinsic variants add one additional operand over raw. 3331 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3332 Register VIndex; 3333 int OpOffset = 0; 3334 if (HasVIndex) { 3335 VIndex = MI.getOperand(3).getReg(); 3336 OpOffset = 1; 3337 } 3338 3339 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3340 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3341 3342 unsigned Format = 0; 3343 if (IsTyped) { 3344 Format = MI.getOperand(5 + OpOffset).getImm(); 3345 ++OpOffset; 3346 } 3347 3348 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3349 3350 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3351 if (TotalOffset != 0) 3352 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3353 3354 unsigned Opc; 3355 if (IsTyped) { 3356 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3357 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3358 } else if (IsFormat) { 3359 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3360 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3361 } else { 3362 switch (MemSize) { 3363 case 1: 3364 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3365 break; 3366 case 2: 3367 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3368 break; 3369 default: 3370 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3371 break; 3372 } 3373 } 3374 3375 if (!VIndex) 3376 VIndex = B.buildConstant(S32, 0).getReg(0); 3377 3378 auto MIB = B.buildInstr(Opc) 3379 .addUse(VData) // vdata 3380 .addUse(RSrc) // rsrc 3381 .addUse(VIndex) // vindex 3382 .addUse(VOffset) // voffset 3383 .addUse(SOffset) // soffset 3384 .addImm(ImmOffset); // offset(imm) 3385 3386 if (IsTyped) 3387 MIB.addImm(Format); 3388 3389 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3390 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3391 .addMemOperand(MMO); 3392 3393 MI.eraseFromParent(); 3394 return true; 3395 } 3396 3397 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3398 MachineRegisterInfo &MRI, 3399 MachineIRBuilder &B, 3400 bool IsFormat, 3401 bool IsTyped) const { 3402 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3403 MachineMemOperand *MMO = *MI.memoperands_begin(); 3404 const int MemSize = MMO->getSize(); 3405 const LLT S32 = LLT::scalar(32); 3406 3407 Register Dst = MI.getOperand(0).getReg(); 3408 Register RSrc = MI.getOperand(2).getReg(); 3409 3410 // The typed intrinsics add an immediate after the registers. 3411 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3412 3413 // The struct intrinsic variants add one additional operand over raw. 3414 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3415 Register VIndex; 3416 int OpOffset = 0; 3417 if (HasVIndex) { 3418 VIndex = MI.getOperand(3).getReg(); 3419 OpOffset = 1; 3420 } 3421 3422 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3423 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3424 3425 unsigned Format = 0; 3426 if (IsTyped) { 3427 Format = MI.getOperand(5 + OpOffset).getImm(); 3428 ++OpOffset; 3429 } 3430 3431 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3432 unsigned ImmOffset; 3433 unsigned TotalOffset; 3434 3435 LLT Ty = MRI.getType(Dst); 3436 LLT EltTy = Ty.getScalarType(); 3437 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3438 const bool Unpacked = ST.hasUnpackedD16VMem(); 3439 3440 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3441 if (TotalOffset != 0) 3442 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3443 3444 unsigned Opc; 3445 3446 if (IsTyped) { 3447 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3448 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3449 } else if (IsFormat) { 3450 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
          AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the extending load; truncate it back down to
    // the original narrow destination.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 3521 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3522 B.buildInstr(Opc) 3523 .addDef(MI.getOperand(0).getReg()) 3524 .addUse(MI.getOperand(2).getReg()) 3525 .addUse(MI.getOperand(3).getReg()) 3526 .cloneMemRefs(MI); 3527 MI.eraseFromParent(); 3528 return true; 3529 } 3530 3531 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3532 switch (IntrID) { 3533 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3534 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3535 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3536 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3537 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3538 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3539 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3540 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3541 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3542 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3543 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3544 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3545 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3546 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3547 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3548 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3549 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3550 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3551 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3552 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3553 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3554 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3555 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3556 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3557 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3558 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3559 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3560 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3561 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3562 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3563 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3564 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3565 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3566 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3567 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3568 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3569 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3570 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3571 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3572 default: 3573 llvm_unreachable("unhandled atomic opcode"); 3574 } 3575 } 3576 3577 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3578 MachineIRBuilder &B, 3579 Intrinsic::ID IID) const { 3580 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3581 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3582 3583 Register Dst = MI.getOperand(0).getReg(); 3584 Register VData = MI.getOperand(2).getReg(); 3585 3586 Register CmpVal; 3587 int OpOffset = 0; 3588 3589 if (IsCmpSwap) { 3590 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3591 ++OpOffset; 3592 } 3593 3594 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3595 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3596 3597 // The struct intrinsic variants add one additional operand over raw. 
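  // For reference, a sketch of the incoming operand layout inferred from the
  // operand indices used below (the compare value is only present for
  // cmpswap):
  //   raw:    dst, intrinsic-id, vdata, [cmp,] rsrc, voffset, soffset, aux
  //   struct: dst, intrinsic-id, vdata, [cmp,] rsrc, vindex, voffset, soffset,
  //           aux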
3598 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3599 Register VIndex; 3600 if (HasVIndex) { 3601 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3602 ++OpOffset; 3603 } 3604 3605 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3606 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3607 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3608 3609 MachineMemOperand *MMO = *MI.memoperands_begin(); 3610 3611 unsigned ImmOffset; 3612 unsigned TotalOffset; 3613 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3614 if (TotalOffset != 0) 3615 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3616 3617 if (!VIndex) 3618 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3619 3620 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3621 .addDef(Dst) 3622 .addUse(VData); // vdata 3623 3624 if (IsCmpSwap) 3625 MIB.addReg(CmpVal); 3626 3627 MIB.addUse(RSrc) // rsrc 3628 .addUse(VIndex) // vindex 3629 .addUse(VOffset) // voffset 3630 .addUse(SOffset) // soffset 3631 .addImm(ImmOffset) // offset(imm) 3632 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3633 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3634 .addMemOperand(MMO); 3635 3636 MI.eraseFromParent(); 3637 return true; 3638 } 3639 3640 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3641 /// vector with s16 typed elements. 3642 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3643 SmallVectorImpl<Register> &PackedAddrs, 3644 int AddrIdx, int DimIdx, int EndIdx, 3645 int NumGradients) { 3646 const LLT S16 = LLT::scalar(16); 3647 const LLT V2S16 = LLT::vector(2, 16); 3648 3649 for (int I = AddrIdx; I < EndIdx; ++I) { 3650 MachineOperand &SrcOp = MI.getOperand(I); 3651 if (!SrcOp.isReg()) 3652 continue; // _L to _LZ may have eliminated this. 3653 3654 Register AddrReg = SrcOp.getReg(); 3655 3656 if (I < DimIdx) { 3657 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3658 PackedAddrs.push_back(AddrReg); 3659 } else { 3660 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3661 // derivatives dx/dh and dx/dv are packed with undef. 3662 if (((I + 1) >= EndIdx) || 3663 ((NumGradients / 2) % 2 == 1 && 3664 (I == DimIdx + (NumGradients / 2) - 1 || 3665 I == DimIdx + NumGradients - 1)) || 3666 // Check for _L to _LZ optimization 3667 !MI.getOperand(I + 1).isReg()) { 3668 PackedAddrs.push_back( 3669 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3670 .getReg(0)); 3671 } else { 3672 PackedAddrs.push_back( 3673 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3674 .getReg(0)); 3675 ++I; 3676 } 3677 } 3678 } 3679 } 3680 3681 /// Convert from separate vaddr components to a single vector address register, 3682 /// and replace the remaining operands with $noreg. 3683 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3684 int DimIdx, int NumVAddrs) { 3685 const LLT S32 = LLT::scalar(32); 3686 3687 SmallVector<Register, 8> AddrRegs; 3688 for (int I = 0; I != NumVAddrs; ++I) { 3689 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3690 if (SrcOp.isReg()) { 3691 AddrRegs.push_back(SrcOp.getReg()); 3692 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3693 } 3694 } 3695 3696 int NumAddrRegs = AddrRegs.size(); 3697 if (NumAddrRegs != 1) { 3698 // Round up to 8 elements for v5-v7 3699 // FIXME: Missing intermediate sized register classes and instructions. 
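    // For example (illustrative): 5, 6, or 7 s32 address components are padded
    // with G_IMPLICIT_DEF values up to 8 and emitted as a single v8s32 vaddr
    // operand below.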
3700 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3701 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3702 auto Undef = B.buildUndef(S32); 3703 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3704 NumAddrRegs = RoundedNumRegs; 3705 } 3706 3707 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3708 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3709 } 3710 3711 for (int I = 1; I != NumVAddrs; ++I) { 3712 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3713 if (SrcOp.isReg()) 3714 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3715 } 3716 } 3717 3718 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3719 /// 3720 /// Depending on the subtarget, load/store with 16-bit element data need to be 3721 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3722 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3723 /// registers. 3724 /// 3725 /// We don't want to directly select image instructions just yet, but also want 3726 /// to exposes all register repacking to the legalizer/combiners. We also don't 3727 /// want a selected instrution entering RegBankSelect. In order to avoid 3728 /// defining a multitude of intermediate image instructions, directly hack on 3729 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3730 /// now unnecessary arguments with $noreg. 3731 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3732 MachineInstr &MI, MachineIRBuilder &B, 3733 GISelChangeObserver &Observer, 3734 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3735 3736 const int NumDefs = MI.getNumExplicitDefs(); 3737 bool IsTFE = NumDefs == 2; 3738 // We are only processing the operands of d16 image operations on subtargets 3739 // that use the unpacked register layout, or need to repack the TFE result. 3740 3741 // TODO: Do we need to guard against already legalized intrinsics? 3742 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3743 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3744 3745 MachineRegisterInfo *MRI = B.getMRI(); 3746 const LLT S32 = LLT::scalar(32); 3747 const LLT S16 = LLT::scalar(16); 3748 const LLT V2S16 = LLT::vector(2, 16); 3749 3750 // Index of first address argument 3751 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3752 3753 int NumVAddrs, NumGradients; 3754 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3755 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3756 getDMaskIdx(BaseOpcode, NumDefs); 3757 unsigned DMask = 0; 3758 3759 // Check for 16 bit addresses and pack if true. 3760 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3761 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3762 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3763 const bool IsG16 = GradTy == S16; 3764 const bool IsA16 = AddrTy == S16; 3765 3766 int DMaskLanes = 0; 3767 if (!BaseOpcode->Atomic) { 3768 DMask = MI.getOperand(DMaskIdx).getImm(); 3769 if (BaseOpcode->Gather4) { 3770 DMaskLanes = 4; 3771 } else if (DMask != 0) { 3772 DMaskLanes = countPopulation(DMask); 3773 } else if (!IsTFE && !BaseOpcode->Store) { 3774 // If dmask is 0, this is a no-op load. This can be eliminated. 3775 B.buildUndef(MI.getOperand(0)); 3776 MI.eraseFromParent(); 3777 return true; 3778 } 3779 } 3780 3781 Observer.changingInstr(MI); 3782 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3783 3784 unsigned NewOpcode = NumDefs == 0 ? 
3785 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3786 3787 // Track that we legalized this 3788 MI.setDesc(B.getTII().get(NewOpcode)); 3789 3790 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3791 // dmask to be at least 1 otherwise the instruction will fail 3792 if (IsTFE && DMask == 0) { 3793 DMask = 0x1; 3794 DMaskLanes = 1; 3795 MI.getOperand(DMaskIdx).setImm(DMask); 3796 } 3797 3798 if (BaseOpcode->Atomic) { 3799 Register VData0 = MI.getOperand(2).getReg(); 3800 LLT Ty = MRI->getType(VData0); 3801 3802 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3803 if (Ty.isVector()) 3804 return false; 3805 3806 if (BaseOpcode->AtomicX2) { 3807 Register VData1 = MI.getOperand(3).getReg(); 3808 // The two values are packed in one register. 3809 LLT PackedTy = LLT::vector(2, Ty); 3810 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3811 MI.getOperand(2).setReg(Concat.getReg(0)); 3812 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3813 } 3814 } 3815 3816 int CorrectedNumVAddrs = NumVAddrs; 3817 3818 // Optimize _L to _LZ when _L is zero 3819 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3820 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3821 const ConstantFP *ConstantLod; 3822 const int LodIdx = AddrIdx + NumVAddrs - 1; 3823 3824 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3825 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3826 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3827 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3828 LZMappingInfo->LZ, ImageDimIntr->Dim); 3829 3830 // The starting indexes should remain in the same place. 3831 --NumVAddrs; 3832 --CorrectedNumVAddrs; 3833 3834 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3835 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3836 MI.RemoveOperand(LodIdx); 3837 } 3838 } 3839 } 3840 3841 // Optimize _mip away, when 'lod' is zero 3842 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3843 int64_t ConstantLod; 3844 const int LodIdx = AddrIdx + NumVAddrs - 1; 3845 3846 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3847 if (ConstantLod == 0) { 3848 // TODO: Change intrinsic opcode and remove operand instead or replacing 3849 // it with 0, as the _L to _LZ handling is done above. 3850 MI.getOperand(LodIdx).ChangeToImmediate(0); 3851 --CorrectedNumVAddrs; 3852 } 3853 } 3854 } 3855 3856 // Rewrite the addressing register layout before doing anything else. 3857 if (IsA16 || IsG16) { 3858 if (IsA16) { 3859 // Target must support the feature and gradients need to be 16 bit too 3860 if (!ST.hasA16() || !IsG16) 3861 return false; 3862 } else if (!ST.hasG16()) 3863 return false; 3864 3865 if (NumVAddrs > 1) { 3866 SmallVector<Register, 4> PackedRegs; 3867 // Don't compress addresses for G16 3868 const int PackEndIdx = 3869 IsA16 ? 
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3870 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3871 PackEndIdx, NumGradients); 3872 3873 if (!IsA16) { 3874 // Add uncompressed address 3875 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3876 int AddrReg = MI.getOperand(I).getReg(); 3877 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3878 PackedRegs.push_back(AddrReg); 3879 } 3880 } 3881 3882 // See also below in the non-a16 branch 3883 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3884 3885 if (!UseNSA && PackedRegs.size() > 1) { 3886 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3887 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3888 PackedRegs[0] = Concat.getReg(0); 3889 PackedRegs.resize(1); 3890 } 3891 3892 const int NumPacked = PackedRegs.size(); 3893 for (int I = 0; I != NumVAddrs; ++I) { 3894 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3895 if (!SrcOp.isReg()) { 3896 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3897 continue; 3898 } 3899 3900 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3901 3902 if (I < NumPacked) 3903 SrcOp.setReg(PackedRegs[I]); 3904 else 3905 SrcOp.setReg(AMDGPU::NoRegister); 3906 } 3907 } 3908 } else { 3909 // If the register allocator cannot place the address registers contiguously 3910 // without introducing moves, then using the non-sequential address encoding 3911 // is always preferable, since it saves VALU instructions and is usually a 3912 // wash in terms of code size or even better. 3913 // 3914 // However, we currently have no way of hinting to the register allocator 3915 // that MIMG addresses should be placed contiguously when it is possible to 3916 // do so, so force non-NSA for the common 2-address case as a heuristic. 3917 // 3918 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3919 // allocation when possible. 3920 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3921 3922 if (!UseNSA && NumVAddrs > 1) 3923 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3924 } 3925 3926 int Flags = 0; 3927 if (IsA16) 3928 Flags |= 1; 3929 if (IsG16) 3930 Flags |= 2; 3931 MI.addOperand(MachineOperand::CreateImm(Flags)); 3932 3933 if (BaseOpcode->Store) { // No TFE for stores? 3934 // TODO: Handle dmask trim 3935 Register VData = MI.getOperand(1).getReg(); 3936 LLT Ty = MRI->getType(VData); 3937 if (!Ty.isVector() || Ty.getElementType() != S16) 3938 return true; 3939 3940 Register RepackedReg = handleD16VData(B, *MRI, VData); 3941 if (RepackedReg != VData) { 3942 MI.getOperand(1).setReg(RepackedReg); 3943 } 3944 3945 return true; 3946 } 3947 3948 Register DstReg = MI.getOperand(0).getReg(); 3949 LLT Ty = MRI->getType(DstReg); 3950 const LLT EltTy = Ty.getScalarType(); 3951 const bool IsD16 = Ty.getScalarType() == S16; 3952 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3953 3954 // Confirm that the return type is large enough for the dmask specified 3955 if (NumElts < DMaskLanes) 3956 return false; 3957 3958 if (NumElts > 4 || DMaskLanes > 4) 3959 return false; 3960 3961 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3962 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3963 3964 // The raw dword aligned data component of the load. 
  // The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
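  // (This arises, for instance, for a TFE d16 load whose data part fits in a
  // single v2s16 register on a packed subtarget; a plain bitcast into the
  // destination is enough.)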
4056 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4057 B.buildBitcast(DstReg, ResultRegs[0]); 4058 return true; 4059 } 4060 4061 assert(Ty.isVector()); 4062 4063 if (IsD16) { 4064 // For packed D16 results with TFE enabled, all the data components are 4065 // S32. Cast back to the expected type. 4066 // 4067 // TODO: We don't really need to use load s32 elements. We would only need one 4068 // cast for the TFE result if a multiple of v2s16 was used. 4069 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4070 for (Register &Reg : ResultRegs) 4071 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4072 } else if (ST.hasUnpackedD16VMem()) { 4073 for (Register &Reg : ResultRegs) 4074 Reg = B.buildTrunc(S16, Reg).getReg(0); 4075 } 4076 } 4077 4078 auto padWithUndef = [&](LLT Ty, int NumElts) { 4079 if (NumElts == 0) 4080 return; 4081 Register Undef = B.buildUndef(Ty).getReg(0); 4082 for (int I = 0; I != NumElts; ++I) 4083 ResultRegs.push_back(Undef); 4084 }; 4085 4086 // Pad out any elements eliminated due to the dmask. 4087 LLT ResTy = MRI->getType(ResultRegs[0]); 4088 if (!ResTy.isVector()) { 4089 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4090 B.buildBuildVector(DstReg, ResultRegs); 4091 return true; 4092 } 4093 4094 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4095 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4096 4097 // Deal with the one annoying legal case. 4098 const LLT V3S16 = LLT::vector(3, 16); 4099 if (Ty == V3S16) { 4100 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4101 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4102 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4103 return true; 4104 } 4105 4106 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4107 B.buildConcatVectors(DstReg, ResultRegs); 4108 return true; 4109 } 4110 4111 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4112 MachineInstr &MI, MachineIRBuilder &B, 4113 GISelChangeObserver &Observer) const { 4114 Register Dst = MI.getOperand(0).getReg(); 4115 LLT Ty = B.getMRI()->getType(Dst); 4116 unsigned Size = Ty.getSizeInBits(); 4117 MachineFunction &MF = B.getMF(); 4118 4119 Observer.changingInstr(MI); 4120 4121 // FIXME: We don't really need this intermediate instruction. The intrinsic 4122 // should be fixed to have a memory operand. Since it's readnone, we're not 4123 // allowed to add one. 4124 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4125 MI.RemoveOperand(1); // Remove intrinsic ID 4126 4127 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4128 // TODO: Should this use datalayout alignment? 4129 const unsigned MemSize = (Size + 7) / 8; 4130 const Align MemAlign(4); 4131 MachineMemOperand *MMO = MF.getMachineMemOperand( 4132 MachinePointerInfo(), 4133 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4134 MachineMemOperand::MOInvariant, 4135 MemSize, MemAlign); 4136 MI.addMemOperand(MF, MMO); 4137 4138 // There are no 96-bit result scalar loads, but widening to 128-bit should 4139 // always be legal. We may need to restore this to a 96-bit result if it turns 4140 // out this needs to be converted to a vector load during RegBankSelect. 
4141 if (!isPowerOf2_32(Size)) { 4142 LegalizerHelper Helper(MF, *this, Observer, B); 4143 4144 if (Ty.isVector()) 4145 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4146 else 4147 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4148 } 4149 4150 Observer.changedInstr(MI); 4151 return true; 4152 } 4153 4154 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4155 MachineRegisterInfo &MRI, 4156 MachineIRBuilder &B) const { 4157 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4158 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4159 !ST.isTrapHandlerEnabled()) { 4160 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4161 } else { 4162 // Pass queue pointer to trap handler as input, and insert trap instruction 4163 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4164 const ArgDescriptor *Arg = 4165 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4166 if (!Arg) 4167 return false; 4168 MachineRegisterInfo &MRI = *B.getMRI(); 4169 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4170 Register LiveIn = getLiveInRegister( 4171 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4172 /*InsertLiveInCopy=*/false); 4173 if (!loadInputValue(LiveIn, B, Arg)) 4174 return false; 4175 B.buildCopy(SGPR01, LiveIn); 4176 B.buildInstr(AMDGPU::S_TRAP) 4177 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4178 .addReg(SGPR01, RegState::Implicit); 4179 } 4180 4181 MI.eraseFromParent(); 4182 return true; 4183 } 4184 4185 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4186 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4187 // Is non-HSA path or trap-handler disabled? then, report a warning 4188 // accordingly 4189 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4190 !ST.isTrapHandlerEnabled()) { 4191 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4192 "debugtrap handler not supported", 4193 MI.getDebugLoc(), DS_Warning); 4194 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4195 Ctx.diagnose(NoTrap); 4196 } else { 4197 // Insert debug-trap instruction 4198 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4199 } 4200 4201 MI.eraseFromParent(); 4202 return true; 4203 } 4204 4205 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4206 MachineInstr &MI) const { 4207 MachineIRBuilder &B = Helper.MIRBuilder; 4208 MachineRegisterInfo &MRI = *B.getMRI(); 4209 4210 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
4211 auto IntrID = MI.getIntrinsicID(); 4212 switch (IntrID) { 4213 case Intrinsic::amdgcn_if: 4214 case Intrinsic::amdgcn_else: { 4215 MachineInstr *Br = nullptr; 4216 MachineBasicBlock *UncondBrTarget = nullptr; 4217 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4218 const SIRegisterInfo *TRI 4219 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4220 4221 Register Def = MI.getOperand(1).getReg(); 4222 Register Use = MI.getOperand(3).getReg(); 4223 4224 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4225 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4226 if (IntrID == Intrinsic::amdgcn_if) { 4227 B.buildInstr(AMDGPU::SI_IF) 4228 .addDef(Def) 4229 .addUse(Use) 4230 .addMBB(UncondBrTarget); 4231 } else { 4232 B.buildInstr(AMDGPU::SI_ELSE) 4233 .addDef(Def) 4234 .addUse(Use) 4235 .addMBB(UncondBrTarget) 4236 .addImm(0); 4237 } 4238 4239 if (Br) { 4240 Br->getOperand(0).setMBB(CondBrTarget); 4241 } else { 4242 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4243 // since we're swapping branch targets it needs to be reinserted. 4244 // FIXME: IRTranslator should probably not do this 4245 B.buildBr(*CondBrTarget); 4246 } 4247 4248 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4249 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4250 MI.eraseFromParent(); 4251 BrCond->eraseFromParent(); 4252 return true; 4253 } 4254 4255 return false; 4256 } 4257 case Intrinsic::amdgcn_loop: { 4258 MachineInstr *Br = nullptr; 4259 MachineBasicBlock *UncondBrTarget = nullptr; 4260 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4261 const SIRegisterInfo *TRI 4262 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4263 4264 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4265 Register Reg = MI.getOperand(2).getReg(); 4266 4267 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4268 B.buildInstr(AMDGPU::SI_LOOP) 4269 .addUse(Reg) 4270 .addMBB(UncondBrTarget); 4271 4272 if (Br) 4273 Br->getOperand(0).setMBB(CondBrTarget); 4274 else 4275 B.buildBr(*CondBrTarget); 4276 4277 MI.eraseFromParent(); 4278 BrCond->eraseFromParent(); 4279 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4280 return true; 4281 } 4282 4283 return false; 4284 } 4285 case Intrinsic::amdgcn_kernarg_segment_ptr: 4286 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4287 // This only makes sense to call in a kernel, so just lower to null. 
4288 B.buildConstant(MI.getOperand(0).getReg(), 0); 4289 MI.eraseFromParent(); 4290 return true; 4291 } 4292 4293 return legalizePreloadedArgIntrin( 4294 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4295 case Intrinsic::amdgcn_implicitarg_ptr: 4296 return legalizeImplicitArgPtr(MI, MRI, B); 4297 case Intrinsic::amdgcn_workitem_id_x: 4298 return legalizePreloadedArgIntrin(MI, MRI, B, 4299 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4300 case Intrinsic::amdgcn_workitem_id_y: 4301 return legalizePreloadedArgIntrin(MI, MRI, B, 4302 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4303 case Intrinsic::amdgcn_workitem_id_z: 4304 return legalizePreloadedArgIntrin(MI, MRI, B, 4305 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4306 case Intrinsic::amdgcn_workgroup_id_x: 4307 return legalizePreloadedArgIntrin(MI, MRI, B, 4308 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4309 case Intrinsic::amdgcn_workgroup_id_y: 4310 return legalizePreloadedArgIntrin(MI, MRI, B, 4311 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4312 case Intrinsic::amdgcn_workgroup_id_z: 4313 return legalizePreloadedArgIntrin(MI, MRI, B, 4314 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4315 case Intrinsic::amdgcn_dispatch_ptr: 4316 return legalizePreloadedArgIntrin(MI, MRI, B, 4317 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4318 case Intrinsic::amdgcn_queue_ptr: 4319 return legalizePreloadedArgIntrin(MI, MRI, B, 4320 AMDGPUFunctionArgInfo::QUEUE_PTR); 4321 case Intrinsic::amdgcn_implicit_buffer_ptr: 4322 return legalizePreloadedArgIntrin( 4323 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4324 case Intrinsic::amdgcn_dispatch_id: 4325 return legalizePreloadedArgIntrin(MI, MRI, B, 4326 AMDGPUFunctionArgInfo::DISPATCH_ID); 4327 case Intrinsic::amdgcn_fdiv_fast: 4328 return legalizeFDIVFastIntrin(MI, MRI, B); 4329 case Intrinsic::amdgcn_is_shared: 4330 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4331 case Intrinsic::amdgcn_is_private: 4332 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4333 case Intrinsic::amdgcn_wavefrontsize: { 4334 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4335 MI.eraseFromParent(); 4336 return true; 4337 } 4338 case Intrinsic::amdgcn_s_buffer_load: 4339 return legalizeSBufferLoad(MI, B, Helper.Observer); 4340 case Intrinsic::amdgcn_raw_buffer_store: 4341 case Intrinsic::amdgcn_struct_buffer_store: 4342 return legalizeBufferStore(MI, MRI, B, false, false); 4343 case Intrinsic::amdgcn_raw_buffer_store_format: 4344 case Intrinsic::amdgcn_struct_buffer_store_format: 4345 return legalizeBufferStore(MI, MRI, B, false, true); 4346 case Intrinsic::amdgcn_raw_tbuffer_store: 4347 case Intrinsic::amdgcn_struct_tbuffer_store: 4348 return legalizeBufferStore(MI, MRI, B, true, true); 4349 case Intrinsic::amdgcn_raw_buffer_load: 4350 case Intrinsic::amdgcn_struct_buffer_load: 4351 return legalizeBufferLoad(MI, MRI, B, false, false); 4352 case Intrinsic::amdgcn_raw_buffer_load_format: 4353 case Intrinsic::amdgcn_struct_buffer_load_format: 4354 return legalizeBufferLoad(MI, MRI, B, true, false); 4355 case Intrinsic::amdgcn_raw_tbuffer_load: 4356 case Intrinsic::amdgcn_struct_tbuffer_load: 4357 return legalizeBufferLoad(MI, MRI, B, true, true); 4358 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4359 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4360 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4361 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4362 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4363 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4364 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4365 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4366 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4367 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4368 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4369 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4370 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4371 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4372 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4373 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4374 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4375 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4376 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4377 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4378 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4379 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4380 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4381 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4382 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4383 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4384 return legalizeBufferAtomic(MI, B, IntrID); 4385 case Intrinsic::amdgcn_atomic_inc: 4386 return legalizeAtomicIncDec(MI, B, true); 4387 case Intrinsic::amdgcn_atomic_dec: 4388 return legalizeAtomicIncDec(MI, B, false); 4389 case Intrinsic::trap: 4390 return legalizeTrapIntrinsic(MI, MRI, B); 4391 case Intrinsic::debugtrap: 4392 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4393 default: { 4394 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4395 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4396 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4397 return true; 4398 } 4399 } 4400 4401 return true; 4402 } 4403
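// Illustrative end-to-end example (a sketch with assumed operand values, not a
// test): legalizing a raw buffer load such as
//
//   %val:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.raw.buffer.load),
//                 %rsrc:_(<4 x s32>), %voffset:_(s32), %soffset:_(s32), 0
//
// through legalizeBufferLoad above produces roughly
//
//   %zero:_(s32) = G_CONSTANT i32 0
//   %val:_(s32) = G_AMDGPU_BUFFER_LOAD %rsrc, %zero, %voffset, %soffset, 0, 0, 0
//
// i.e. the missing vindex of the raw form is materialized as a zero constant
// and the idxen flag is 0. splitBufferOffsets folds any constant part of the
// offset into the immediate field: a combined offset of 5000 becomes a voffset
// increment of 4096 with 904 left in the 12-bit immediate.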