//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx,
                          LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 return isRegisterType(Query.Types[TypeIdx]); 187 }; 188 } 189 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 191 return [=](const LegalityQuery &Query) { 192 const LLT QueryTy = Query.Types[TypeIdx]; 193 if (!QueryTy.isVector()) 194 return false; 195 const LLT EltTy = QueryTy.getElementType(); 196 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 197 }; 198 } 199 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 201 return [=](const LegalityQuery &Query) { 202 const LLT Ty = Query.Types[TypeIdx]; 203 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 204 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 205 }; 206 } 207 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 209 // handle some operations by just promoting the register during 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 212 bool IsLoad) { 213 switch (AS) { 214 case AMDGPUAS::PRIVATE_ADDRESS: 215 // FIXME: Private element size. 216 return 32; 217 case AMDGPUAS::LOCAL_ADDRESS: 218 return ST.useDS128() ? 128 : 64; 219 case AMDGPUAS::GLOBAL_ADDRESS: 220 case AMDGPUAS::CONSTANT_ADDRESS: 221 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 222 // Treat constant and global as identical. SMRD loads are sometimes usable for 223 // global loads (ideally constant address space should be eliminated) 224 // depending on the context. Legality cannot be context dependent, but 225 // RegBankSelect can split the load as necessary depending on the pointer 226 // register bank/uniformity and if the memory is invariant or not written in a 227 // kernel. 228 return IsLoad ? 512 : 128; 229 default: 230 // Flat addresses may contextually need to be split to 32-bit parts if they 231 // may alias scratch depending on the subtarget. 232 return 128; 233 } 234 } 235 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 237 const LegalityQuery &Query, 238 unsigned Opcode) { 239 const LLT Ty = Query.Types[0]; 240 241 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 242 const bool IsLoad = Opcode != AMDGPU::G_STORE; 243 244 unsigned RegSize = Ty.getSizeInBits(); 245 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 246 unsigned Align = Query.MMODescrs[0].AlignInBits; 247 unsigned AS = Query.Types[1].getAddressSpace(); 248 249 // All of these need to be custom lowered to cast the pointer operand. 250 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 251 return false; 252 253 // TODO: We should be able to widen loads if the alignment is high enough, but 254 // we also need to modify the memory access size. 255 #if 0 256 // Accept widening loads based on alignment. 257 if (IsLoad && MemSize < Size) 258 MemSize = std::max(MemSize, Align); 259 #endif 260 261 // Only 1-byte and 2-byte to 32-bit extloads are valid. 262 if (MemSize != RegSize && RegSize != 32) 263 return false; 264 265 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 266 return false; 267 268 switch (MemSize) { 269 case 8: 270 case 16: 271 case 32: 272 case 64: 273 case 128: 274 break; 275 case 96: 276 if (!ST.hasDwordx3LoadStores()) 277 return false; 278 break; 279 case 256: 280 case 512: 281 // These may contextually need to be broken down. 
282 break; 283 default: 284 return false; 285 } 286 287 assert(RegSize >= MemSize); 288 289 if (Align < MemSize) { 290 const SITargetLowering *TLI = ST.getTargetLowering(); 291 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) 292 return false; 293 } 294 295 return true; 296 } 297 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 299 // workaround this. Eventually it should ignore the type for loads and only care 300 // about the size. Return true in cases where we will workaround this for now by 301 // bitcasting. 302 static bool loadStoreBitcastWorkaround(const LLT Ty) { 303 if (EnableNewLegality) 304 return false; 305 306 const unsigned Size = Ty.getSizeInBits(); 307 if (Size <= 64) 308 return false; 309 if (!Ty.isVector()) 310 return true; 311 unsigned EltSize = Ty.getElementType().getSizeInBits(); 312 return EltSize != 32 && EltSize != 64; 313 } 314 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 316 unsigned Opcode) { 317 const LLT Ty = Query.Types[0]; 318 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 319 !loadStoreBitcastWorkaround(Ty); 320 } 321 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 323 const GCNTargetMachine &TM) 324 : ST(ST_) { 325 using namespace TargetOpcode; 326 327 auto GetAddrSpacePtr = [&TM](unsigned AS) { 328 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 329 }; 330 331 const LLT S1 = LLT::scalar(1); 332 const LLT S16 = LLT::scalar(16); 333 const LLT S32 = LLT::scalar(32); 334 const LLT S64 = LLT::scalar(64); 335 const LLT S128 = LLT::scalar(128); 336 const LLT S256 = LLT::scalar(256); 337 const LLT S512 = LLT::scalar(512); 338 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 339 340 const LLT V2S16 = LLT::vector(2, 16); 341 const LLT V4S16 = LLT::vector(4, 16); 342 343 const LLT V2S32 = LLT::vector(2, 32); 344 const LLT V3S32 = LLT::vector(3, 32); 345 const LLT V4S32 = LLT::vector(4, 32); 346 const LLT V5S32 = LLT::vector(5, 32); 347 const LLT V6S32 = LLT::vector(6, 32); 348 const LLT V7S32 = LLT::vector(7, 32); 349 const LLT V8S32 = LLT::vector(8, 32); 350 const LLT V9S32 = LLT::vector(9, 32); 351 const LLT V10S32 = LLT::vector(10, 32); 352 const LLT V11S32 = LLT::vector(11, 32); 353 const LLT V12S32 = LLT::vector(12, 32); 354 const LLT V13S32 = LLT::vector(13, 32); 355 const LLT V14S32 = LLT::vector(14, 32); 356 const LLT V15S32 = LLT::vector(15, 32); 357 const LLT V16S32 = LLT::vector(16, 32); 358 const LLT V32S32 = LLT::vector(32, 32); 359 360 const LLT V2S64 = LLT::vector(2, 64); 361 const LLT V3S64 = LLT::vector(3, 64); 362 const LLT V4S64 = LLT::vector(4, 64); 363 const LLT V5S64 = LLT::vector(5, 64); 364 const LLT V6S64 = LLT::vector(6, 64); 365 const LLT V7S64 = LLT::vector(7, 64); 366 const LLT V8S64 = LLT::vector(8, 64); 367 const LLT V16S64 = LLT::vector(16, 64); 368 369 std::initializer_list<LLT> AllS32Vectors = 370 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 371 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 372 std::initializer_list<LLT> AllS64Vectors = 373 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 374 375 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 376 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 377 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 378 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 379 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 
380 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 381 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 382 383 const LLT CodePtr = FlatPtr; 384 385 const std::initializer_list<LLT> AddrSpaces64 = { 386 GlobalPtr, ConstantPtr, FlatPtr 387 }; 388 389 const std::initializer_list<LLT> AddrSpaces32 = { 390 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 391 }; 392 393 const std::initializer_list<LLT> FPTypesBase = { 394 S32, S64 395 }; 396 397 const std::initializer_list<LLT> FPTypes16 = { 398 S32, S64, S16 399 }; 400 401 const std::initializer_list<LLT> FPTypesPK16 = { 402 S32, S64, S16, V2S16 403 }; 404 405 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 406 407 setAction({G_BRCOND, S1}, Legal); // VCC branches 408 setAction({G_BRCOND, S32}, Legal); // SCC branches 409 410 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 411 // elements for v3s16 412 getActionDefinitionsBuilder(G_PHI) 413 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 414 .legalFor(AllS32Vectors) 415 .legalFor(AllS64Vectors) 416 .legalFor(AddrSpaces64) 417 .legalFor(AddrSpaces32) 418 .legalIf(isPointer(0)) 419 .clampScalar(0, S32, S256) 420 .widenScalarToNextPow2(0, 32) 421 .clampMaxNumElements(0, S32, 16) 422 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 423 .scalarize(0); 424 425 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 426 // Full set of gfx9 features. 427 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 428 .legalFor({S32, S16, V2S16}) 429 .clampScalar(0, S16, S32) 430 .clampMaxNumElements(0, S16, 2) 431 .scalarize(0) 432 .widenScalarToNextPow2(0, 32); 433 434 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 435 .legalFor({S32, S16, V2S16}) // Clamp modifier 436 .minScalar(0, S16) 437 .clampMaxNumElements(0, S16, 2) 438 .scalarize(0) 439 .widenScalarToNextPow2(0, 32) 440 .lower(); 441 } else if (ST.has16BitInsts()) { 442 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 443 .legalFor({S32, S16}) 444 .clampScalar(0, S16, S32) 445 .scalarize(0) 446 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 447 448 // Technically the saturating operations require clamp bit support, but this 449 // was introduced at the same time as 16-bit operations. 450 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 451 .legalFor({S32, S16}) // Clamp modifier 452 .minScalar(0, S16) 453 .scalarize(0) 454 .widenScalarToNextPow2(0, 16) 455 .lower(); 456 457 // We're just lowering this, but it helps get a better result to try to 458 // coerce to the desired type first. 459 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 460 .minScalar(0, S16) 461 .scalarize(0) 462 .lower(); 463 } else { 464 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 465 .legalFor({S32}) 466 .clampScalar(0, S32, S32) 467 .scalarize(0); 468 469 if (ST.hasIntClamp()) { 470 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 471 .legalFor({S32}) // Clamp modifier. 472 .scalarize(0) 473 .minScalarOrElt(0, S32) 474 .lower(); 475 } else { 476 // Clamp bit support was added in VI, along with 16-bit operations. 477 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 478 .minScalar(0, S32) 479 .scalarize(0) 480 .lower(); 481 } 482 483 // FIXME: DAG expansion gets better results. The widening uses the smaller 484 // range values and goes for the min/max lowering directly. 
485 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 486 .minScalar(0, S32) 487 .scalarize(0) 488 .lower(); 489 } 490 491 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 492 .customFor({S32, S64}) 493 .clampScalar(0, S32, S64) 494 .widenScalarToNextPow2(0, 32) 495 .scalarize(0); 496 497 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 498 .legalFor({S32}) 499 .clampScalar(0, S32, S32) 500 .scalarize(0); 501 502 // Report legal for any types we can handle anywhere. For the cases only legal 503 // on the SALU, RegBankSelect will be able to re-legalize. 504 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 505 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 506 .clampScalar(0, S32, S64) 507 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 508 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 509 .widenScalarToNextPow2(0) 510 .scalarize(0); 511 512 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 513 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 514 .legalFor({{S32, S1}, {S32, S32}}) 515 .minScalar(0, S32) 516 // TODO: .scalarize(0) 517 .lower(); 518 519 getActionDefinitionsBuilder(G_BITCAST) 520 // Don't worry about the size constraint. 521 .legalIf(all(isRegisterType(0), isRegisterType(1))) 522 .lower(); 523 524 525 getActionDefinitionsBuilder(G_CONSTANT) 526 .legalFor({S1, S32, S64, S16, GlobalPtr, 527 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 528 .legalIf(isPointer(0)) 529 .clampScalar(0, S32, S64) 530 .widenScalarToNextPow2(0); 531 532 getActionDefinitionsBuilder(G_FCONSTANT) 533 .legalFor({S32, S64, S16}) 534 .clampScalar(0, S16, S64); 535 536 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 537 .legalIf(isRegisterType(0)) 538 // s1 and s16 are special cases because they have legal operations on 539 // them, but don't really occupy registers in the normal way. 540 .legalFor({S1, S16}) 541 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 542 .clampScalarOrElt(0, S32, MaxScalar) 543 .widenScalarToNextPow2(0, 32) 544 .clampMaxNumElements(0, S32, 16); 545 546 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 547 548 // If the amount is divergent, we have to do a wave reduction to get the 549 // maximum value, so this is expanded during RegBankSelect. 
550 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 551 .legalFor({{PrivatePtr, S32}}); 552 553 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 554 .customIf(typeIsNot(0, PrivatePtr)); 555 556 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 557 558 auto &FPOpActions = getActionDefinitionsBuilder( 559 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 560 .legalFor({S32, S64}); 561 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 562 .customFor({S32, S64}); 563 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 564 .customFor({S32, S64}); 565 566 if (ST.has16BitInsts()) { 567 if (ST.hasVOP3PInsts()) 568 FPOpActions.legalFor({S16, V2S16}); 569 else 570 FPOpActions.legalFor({S16}); 571 572 TrigActions.customFor({S16}); 573 FDIVActions.customFor({S16}); 574 } 575 576 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 577 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 578 579 if (ST.hasVOP3PInsts()) { 580 MinNumMaxNum.customFor(FPTypesPK16) 581 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 582 .clampMaxNumElements(0, S16, 2) 583 .clampScalar(0, S16, S64) 584 .scalarize(0); 585 } else if (ST.has16BitInsts()) { 586 MinNumMaxNum.customFor(FPTypes16) 587 .clampScalar(0, S16, S64) 588 .scalarize(0); 589 } else { 590 MinNumMaxNum.customFor(FPTypesBase) 591 .clampScalar(0, S32, S64) 592 .scalarize(0); 593 } 594 595 if (ST.hasVOP3PInsts()) 596 FPOpActions.clampMaxNumElements(0, S16, 2); 597 598 FPOpActions 599 .scalarize(0) 600 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 601 602 TrigActions 603 .scalarize(0) 604 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 605 606 FDIVActions 607 .scalarize(0) 608 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 609 610 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 611 .legalFor(FPTypesPK16) 612 .clampMaxNumElements(0, S16, 2) 613 .scalarize(0) 614 .clampScalar(0, S16, S64); 615 616 if (ST.has16BitInsts()) { 617 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 618 .legalFor({S32, S64, S16}) 619 .scalarize(0) 620 .clampScalar(0, S16, S64); 621 } else { 622 getActionDefinitionsBuilder(G_FSQRT) 623 .legalFor({S32, S64}) 624 .scalarize(0) 625 .clampScalar(0, S32, S64); 626 627 if (ST.hasFractBug()) { 628 getActionDefinitionsBuilder(G_FFLOOR) 629 .customFor({S64}) 630 .legalFor({S32, S64}) 631 .scalarize(0) 632 .clampScalar(0, S32, S64); 633 } else { 634 getActionDefinitionsBuilder(G_FFLOOR) 635 .legalFor({S32, S64}) 636 .scalarize(0) 637 .clampScalar(0, S32, S64); 638 } 639 } 640 641 getActionDefinitionsBuilder(G_FPTRUNC) 642 .legalFor({{S32, S64}, {S16, S32}}) 643 .scalarize(0) 644 .lower(); 645 646 getActionDefinitionsBuilder(G_FPEXT) 647 .legalFor({{S64, S32}, {S32, S16}}) 648 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 649 .scalarize(0); 650 651 getActionDefinitionsBuilder(G_FSUB) 652 // Use actual fsub instruction 653 .legalFor({S32}) 654 // Must use fadd + fneg 655 .lowerFor({S64, S16, V2S16}) 656 .scalarize(0) 657 .clampScalar(0, S32, S64); 658 659 // Whether this is legal depends on the floating point mode for the function. 660 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 661 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 662 FMad.customFor({S32, S16}); 663 else if (ST.hasMadMacF32Insts()) 664 FMad.customFor({S32}); 665 else if (ST.hasMadF16()) 666 FMad.customFor({S16}); 667 FMad.scalarize(0) 668 .lower(); 669 670 // TODO: Do we need to clamp maximum bitwidth? 
671 getActionDefinitionsBuilder(G_TRUNC) 672 .legalIf(isScalar(0)) 673 .legalFor({{V2S16, V2S32}}) 674 .clampMaxNumElements(0, S16, 2) 675 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 676 // situations (like an invalid implicit use), we don't want to infinite loop 677 // in the legalizer. 678 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 679 .alwaysLegal(); 680 681 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 682 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 683 {S32, S1}, {S64, S1}, {S16, S1}}) 684 .scalarize(0) 685 .clampScalar(0, S32, S64) 686 .widenScalarToNextPow2(1, 32); 687 688 // TODO: Split s1->s64 during regbankselect for VALU. 689 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 690 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 691 .lowerFor({{S32, S64}}) 692 .lowerIf(typeIs(1, S1)) 693 .customFor({{S64, S64}}); 694 if (ST.has16BitInsts()) 695 IToFP.legalFor({{S16, S16}}); 696 IToFP.clampScalar(1, S32, S64) 697 .minScalar(0, S32) 698 .scalarize(0) 699 .widenScalarToNextPow2(1); 700 701 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 702 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 703 .customFor({{S64, S64}}) 704 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 705 if (ST.has16BitInsts()) 706 FPToI.legalFor({{S16, S16}}); 707 else 708 FPToI.minScalar(1, S32); 709 710 FPToI.minScalar(0, S32) 711 .scalarize(0) 712 .lower(); 713 714 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 715 .scalarize(0) 716 .lower(); 717 718 if (ST.has16BitInsts()) { 719 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 720 .legalFor({S16, S32, S64}) 721 .clampScalar(0, S16, S64) 722 .scalarize(0); 723 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 724 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 725 .legalFor({S32, S64}) 726 .clampScalar(0, S32, S64) 727 .scalarize(0); 728 } else { 729 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 730 .legalFor({S32}) 731 .customFor({S64}) 732 .clampScalar(0, S32, S64) 733 .scalarize(0); 734 } 735 736 getActionDefinitionsBuilder(G_PTR_ADD) 737 .legalIf(all(isPointer(0), sameSize(0, 1))) 738 .scalarize(0) 739 .scalarSameSizeAs(1, 0); 740 741 getActionDefinitionsBuilder(G_PTRMASK) 742 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 743 .scalarSameSizeAs(1, 0) 744 .scalarize(0); 745 746 auto &CmpBuilder = 747 getActionDefinitionsBuilder(G_ICMP) 748 // The compare output type differs based on the register bank of the output, 749 // so make both s1 and s32 legal. 750 // 751 // Scalar compares producing output in scc will be promoted to s32, as that 752 // is the allocatable register type that will be needed for the copy from 753 // scc. This will be promoted during RegBankSelect, and we assume something 754 // before that won't try to use s32 result types. 755 // 756 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 757 // bank. 
758 .legalForCartesianProduct( 759 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 760 .legalForCartesianProduct( 761 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 762 if (ST.has16BitInsts()) { 763 CmpBuilder.legalFor({{S1, S16}}); 764 } 765 766 CmpBuilder 767 .widenScalarToNextPow2(1) 768 .clampScalar(1, S32, S64) 769 .scalarize(0) 770 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 771 772 getActionDefinitionsBuilder(G_FCMP) 773 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 774 .widenScalarToNextPow2(1) 775 .clampScalar(1, S32, S64) 776 .scalarize(0); 777 778 // FIXME: fpow has a selection pattern that should move to custom lowering. 779 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 780 if (ST.has16BitInsts()) 781 Exp2Ops.legalFor({S32, S16}); 782 else 783 Exp2Ops.legalFor({S32}); 784 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 785 Exp2Ops.scalarize(0); 786 787 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 788 if (ST.has16BitInsts()) 789 ExpOps.customFor({{S32}, {S16}}); 790 else 791 ExpOps.customFor({S32}); 792 ExpOps.clampScalar(0, MinScalarFPTy, S32) 793 .scalarize(0); 794 795 getActionDefinitionsBuilder(G_FPOWI) 796 .clampScalar(0, MinScalarFPTy, S32) 797 .lower(); 798 799 // The 64-bit versions produce 32-bit results, but only on the SALU. 800 getActionDefinitionsBuilder(G_CTPOP) 801 .legalFor({{S32, S32}, {S32, S64}}) 802 .clampScalar(0, S32, S32) 803 .clampScalar(1, S32, S64) 804 .scalarize(0) 805 .widenScalarToNextPow2(0, 32) 806 .widenScalarToNextPow2(1, 32); 807 808 // The hardware instructions return a different result on 0 than the generic 809 // instructions expect. The hardware produces -1, but these produce the 810 // bitwidth. 811 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 812 .scalarize(0) 813 .clampScalar(0, S32, S32) 814 .clampScalar(1, S32, S64) 815 .widenScalarToNextPow2(0, 32) 816 .widenScalarToNextPow2(1, 32) 817 .lower(); 818 819 // The 64-bit versions produce 32-bit results, but only on the SALU. 820 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 821 .legalFor({{S32, S32}, {S32, S64}}) 822 .clampScalar(0, S32, S32) 823 .clampScalar(1, S32, S64) 824 .scalarize(0) 825 .widenScalarToNextPow2(0, 32) 826 .widenScalarToNextPow2(1, 32); 827 828 getActionDefinitionsBuilder(G_BITREVERSE) 829 .legalFor({S32}) 830 .clampScalar(0, S32, S32) 831 .scalarize(0); 832 833 if (ST.has16BitInsts()) { 834 getActionDefinitionsBuilder(G_BSWAP) 835 .legalFor({S16, S32, V2S16}) 836 .clampMaxNumElements(0, S16, 2) 837 // FIXME: Fixing non-power-of-2 before clamp is workaround for 838 // narrowScalar limitation. 
839 .widenScalarToNextPow2(0) 840 .clampScalar(0, S16, S32) 841 .scalarize(0); 842 843 if (ST.hasVOP3PInsts()) { 844 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 845 .legalFor({S32, S16, V2S16}) 846 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 847 .clampMaxNumElements(0, S16, 2) 848 .minScalar(0, S16) 849 .widenScalarToNextPow2(0) 850 .scalarize(0) 851 .lower(); 852 } else { 853 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 854 .legalFor({S32, S16}) 855 .widenScalarToNextPow2(0) 856 .minScalar(0, S16) 857 .scalarize(0) 858 .lower(); 859 } 860 } else { 861 // TODO: Should have same legality without v_perm_b32 862 getActionDefinitionsBuilder(G_BSWAP) 863 .legalFor({S32}) 864 .lowerIf(scalarNarrowerThan(0, 32)) 865 // FIXME: Fixing non-power-of-2 before clamp is workaround for 866 // narrowScalar limitation. 867 .widenScalarToNextPow2(0) 868 .maxScalar(0, S32) 869 .scalarize(0) 870 .lower(); 871 872 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 873 .legalFor({S32}) 874 .minScalar(0, S32) 875 .widenScalarToNextPow2(0) 876 .scalarize(0) 877 .lower(); 878 } 879 880 getActionDefinitionsBuilder(G_INTTOPTR) 881 // List the common cases 882 .legalForCartesianProduct(AddrSpaces64, {S64}) 883 .legalForCartesianProduct(AddrSpaces32, {S32}) 884 .scalarize(0) 885 // Accept any address space as long as the size matches 886 .legalIf(sameSize(0, 1)) 887 .widenScalarIf(smallerThan(1, 0), 888 [](const LegalityQuery &Query) { 889 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 890 }) 891 .narrowScalarIf(largerThan(1, 0), 892 [](const LegalityQuery &Query) { 893 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 894 }); 895 896 getActionDefinitionsBuilder(G_PTRTOINT) 897 // List the common cases 898 .legalForCartesianProduct(AddrSpaces64, {S64}) 899 .legalForCartesianProduct(AddrSpaces32, {S32}) 900 .scalarize(0) 901 // Accept any address space as long as the size matches 902 .legalIf(sameSize(0, 1)) 903 .widenScalarIf(smallerThan(0, 1), 904 [](const LegalityQuery &Query) { 905 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 906 }) 907 .narrowScalarIf( 908 largerThan(0, 1), 909 [](const LegalityQuery &Query) { 910 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 911 }); 912 913 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 914 .scalarize(0) 915 .custom(); 916 917 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 918 bool IsLoad) -> bool { 919 const LLT DstTy = Query.Types[0]; 920 921 // Split vector extloads. 922 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 923 unsigned Align = Query.MMODescrs[0].AlignInBits; 924 925 if (MemSize < DstTy.getSizeInBits()) 926 MemSize = std::max(MemSize, Align); 927 928 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 929 return true; 930 931 const LLT PtrTy = Query.Types[1]; 932 unsigned AS = PtrTy.getAddressSpace(); 933 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 934 return true; 935 936 // Catch weird sized loads that don't evenly divide into the access sizes 937 // TODO: May be able to widen depending on alignment etc. 938 unsigned NumRegs = (MemSize + 31) / 32; 939 if (NumRegs == 3) { 940 if (!ST.hasDwordx3LoadStores()) 941 return true; 942 } else { 943 // If the alignment allows, these should have been widened. 
944 if (!isPowerOf2_32(NumRegs)) 945 return true; 946 } 947 948 if (Align < MemSize) { 949 const SITargetLowering *TLI = ST.getTargetLowering(); 950 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 951 } 952 953 return false; 954 }; 955 956 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, 957 unsigned Opc) -> bool { 958 unsigned Size = Query.Types[0].getSizeInBits(); 959 if (isPowerOf2_32(Size)) 960 return false; 961 962 if (Size == 96 && ST.hasDwordx3LoadStores()) 963 return false; 964 965 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 966 if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) 967 return false; 968 969 unsigned Align = Query.MMODescrs[0].AlignInBits; 970 unsigned RoundedSize = NextPowerOf2(Size); 971 return (Align >= RoundedSize); 972 }; 973 974 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 975 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 976 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 977 978 // TODO: Refine based on subtargets which support unaligned access or 128-bit 979 // LDS 980 // TODO: Unsupported flat for SI. 981 982 for (unsigned Op : {G_LOAD, G_STORE}) { 983 const bool IsStore = Op == G_STORE; 984 985 auto &Actions = getActionDefinitionsBuilder(Op); 986 // Explicitly list some common cases. 987 // TODO: Does this help compile time at all? 988 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 989 {V2S32, GlobalPtr, 64, GlobalAlign32}, 990 {V4S32, GlobalPtr, 128, GlobalAlign32}, 991 {S64, GlobalPtr, 64, GlobalAlign32}, 992 {V2S64, GlobalPtr, 128, GlobalAlign32}, 993 {V2S16, GlobalPtr, 32, GlobalAlign32}, 994 {S32, GlobalPtr, 8, GlobalAlign8}, 995 {S32, GlobalPtr, 16, GlobalAlign16}, 996 997 {S32, LocalPtr, 32, 32}, 998 {S64, LocalPtr, 64, 32}, 999 {V2S32, LocalPtr, 64, 32}, 1000 {S32, LocalPtr, 8, 8}, 1001 {S32, LocalPtr, 16, 16}, 1002 {V2S16, LocalPtr, 32, 32}, 1003 1004 {S32, PrivatePtr, 32, 32}, 1005 {S32, PrivatePtr, 8, 8}, 1006 {S32, PrivatePtr, 16, 16}, 1007 {V2S16, PrivatePtr, 32, 32}, 1008 1009 {S32, ConstantPtr, 32, GlobalAlign32}, 1010 {V2S32, ConstantPtr, 64, GlobalAlign32}, 1011 {V4S32, ConstantPtr, 128, GlobalAlign32}, 1012 {S64, ConstantPtr, 64, GlobalAlign32}, 1013 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 1014 Actions.legalIf( 1015 [=](const LegalityQuery &Query) -> bool { 1016 return isLoadStoreLegal(ST, Query, Op); 1017 }); 1018 1019 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1020 // 64-bits. 1021 // 1022 // TODO: Should generalize bitcast action into coerce, which will also cover 1023 // inserting addrspacecasts. 1024 Actions.customIf(typeIs(1, Constant32Ptr)); 1025 1026 // Turn any illegal element vectors into something easier to deal 1027 // with. These will ultimately produce 32-bit scalar shifts to extract the 1028 // parts anyway. 1029 // 1030 // For odd 16-bit element vectors, prefer to split those into pieces with 1031 // 16-bit vector parts. 
1032 Actions.bitcastIf( 1033 [=](const LegalityQuery &Query) -> bool { 1034 const LLT Ty = Query.Types[0]; 1035 const unsigned Size = Ty.getSizeInBits(); 1036 1037 if (Size != Query.MMODescrs[0].SizeInBits) 1038 return Size <= 32 && Ty.isVector(); 1039 1040 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 1041 return true; 1042 return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 1043 !isRegisterVectorElementType(Ty.getElementType()); 1044 }, bitcastToRegisterType(0)); 1045 1046 Actions 1047 .customIf(typeIs(1, Constant32Ptr)) 1048 // Widen suitably aligned loads by loading extra elements. 1049 .moreElementsIf([=](const LegalityQuery &Query) { 1050 const LLT Ty = Query.Types[0]; 1051 return Op == G_LOAD && Ty.isVector() && 1052 shouldWidenLoadResult(Query, Op); 1053 }, moreElementsToNextPow2(0)) 1054 .widenScalarIf([=](const LegalityQuery &Query) { 1055 const LLT Ty = Query.Types[0]; 1056 return Op == G_LOAD && !Ty.isVector() && 1057 shouldWidenLoadResult(Query, Op); 1058 }, widenScalarOrEltToNextPow2(0)) 1059 .narrowScalarIf( 1060 [=](const LegalityQuery &Query) -> bool { 1061 return !Query.Types[0].isVector() && 1062 needToSplitMemOp(Query, Op == G_LOAD); 1063 }, 1064 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1065 const LLT DstTy = Query.Types[0]; 1066 const LLT PtrTy = Query.Types[1]; 1067 1068 const unsigned DstSize = DstTy.getSizeInBits(); 1069 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1070 1071 // Split extloads. 1072 if (DstSize > MemSize) 1073 return std::make_pair(0, LLT::scalar(MemSize)); 1074 1075 if (!isPowerOf2_32(DstSize)) { 1076 // We're probably decomposing an odd sized store. Try to split 1077 // to the widest type. TODO: Account for alignment. As-is it 1078 // should be OK, since the new parts will be further legalized. 1079 unsigned FloorSize = PowerOf2Floor(DstSize); 1080 return std::make_pair(0, LLT::scalar(FloorSize)); 1081 } 1082 1083 if (DstSize > 32 && (DstSize % 32 != 0)) { 1084 // FIXME: Need a way to specify non-extload of larger size if 1085 // suitably aligned. 1086 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1087 } 1088 1089 unsigned MaxSize = maxSizeForAddrSpace(ST, 1090 PtrTy.getAddressSpace(), 1091 Op == G_LOAD); 1092 if (MemSize > MaxSize) 1093 return std::make_pair(0, LLT::scalar(MaxSize)); 1094 1095 unsigned Align = Query.MMODescrs[0].AlignInBits; 1096 return std::make_pair(0, LLT::scalar(Align)); 1097 }) 1098 .fewerElementsIf( 1099 [=](const LegalityQuery &Query) -> bool { 1100 return Query.Types[0].isVector() && 1101 needToSplitMemOp(Query, Op == G_LOAD); 1102 }, 1103 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1104 const LLT DstTy = Query.Types[0]; 1105 const LLT PtrTy = Query.Types[1]; 1106 1107 LLT EltTy = DstTy.getElementType(); 1108 unsigned MaxSize = maxSizeForAddrSpace(ST, 1109 PtrTy.getAddressSpace(), 1110 Op == G_LOAD); 1111 1112 // FIXME: Handle widened to power of 2 results better. This ends 1113 // up scalarizing. 1114 // FIXME: 3 element stores scalarized on SI 1115 1116 // Split if it's too large for the address space. 
1117 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 1118 unsigned NumElts = DstTy.getNumElements(); 1119 unsigned EltSize = EltTy.getSizeInBits(); 1120 1121 if (MaxSize % EltSize == 0) { 1122 return std::make_pair( 1123 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1124 } 1125 1126 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 1127 1128 // FIXME: Refine when odd breakdowns handled 1129 // The scalars will need to be re-legalized. 1130 if (NumPieces == 1 || NumPieces >= NumElts || 1131 NumElts % NumPieces != 0) 1132 return std::make_pair(0, EltTy); 1133 1134 return std::make_pair(0, 1135 LLT::vector(NumElts / NumPieces, EltTy)); 1136 } 1137 1138 // FIXME: We could probably handle weird extending loads better. 1139 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1140 if (DstTy.getSizeInBits() > MemSize) 1141 return std::make_pair(0, EltTy); 1142 1143 unsigned EltSize = EltTy.getSizeInBits(); 1144 unsigned DstSize = DstTy.getSizeInBits(); 1145 if (!isPowerOf2_32(DstSize)) { 1146 // We're probably decomposing an odd sized store. Try to split 1147 // to the widest type. TODO: Account for alignment. As-is it 1148 // should be OK, since the new parts will be further legalized. 1149 unsigned FloorSize = PowerOf2Floor(DstSize); 1150 return std::make_pair( 1151 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1152 } 1153 1154 // Need to split because of alignment. 1155 unsigned Align = Query.MMODescrs[0].AlignInBits; 1156 if (EltSize > Align && 1157 (EltSize / Align < DstTy.getNumElements())) { 1158 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 1159 } 1160 1161 // May need relegalization for the scalars. 1162 return std::make_pair(0, EltTy); 1163 }) 1164 .minScalar(0, S32); 1165 1166 if (IsStore) 1167 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 1168 1169 // TODO: Need a bitcast lower option? 
1170 Actions 1171 .widenScalarToNextPow2(0) 1172 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1173 } 1174 1175 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1176 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1177 {S32, GlobalPtr, 16, 2 * 8}, 1178 {S32, LocalPtr, 8, 8}, 1179 {S32, LocalPtr, 16, 16}, 1180 {S32, PrivatePtr, 8, 8}, 1181 {S32, PrivatePtr, 16, 16}, 1182 {S32, ConstantPtr, 8, 8}, 1183 {S32, ConstantPtr, 16, 2 * 8}}); 1184 if (ST.hasFlatAddressSpace()) { 1185 ExtLoads.legalForTypesWithMemDesc( 1186 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1187 } 1188 1189 ExtLoads.clampScalar(0, S32, S32) 1190 .widenScalarToNextPow2(0) 1191 .unsupportedIfMemSizeNotPow2() 1192 .lower(); 1193 1194 auto &Atomics = getActionDefinitionsBuilder( 1195 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1196 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1197 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1198 G_ATOMICRMW_UMIN}) 1199 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1200 {S64, GlobalPtr}, {S64, LocalPtr}, 1201 {S32, RegionPtr}, {S64, RegionPtr}}); 1202 if (ST.hasFlatAddressSpace()) { 1203 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1204 } 1205 1206 if (ST.hasLDSFPAtomics()) { 1207 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1208 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1209 } 1210 1211 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1212 // demarshalling 1213 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1214 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1215 {S32, FlatPtr}, {S64, FlatPtr}}) 1216 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1217 {S32, RegionPtr}, {S64, RegionPtr}}); 1218 // TODO: Pointer types, any 32-bit or 64-bit vector 1219 1220 // Condition should be s32 for scalar, s1 for vector. 1221 getActionDefinitionsBuilder(G_SELECT) 1222 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1223 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1224 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1225 .clampScalar(0, S16, S64) 1226 .scalarize(1) 1227 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1228 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1229 .clampMaxNumElements(0, S32, 2) 1230 .clampMaxNumElements(0, LocalPtr, 2) 1231 .clampMaxNumElements(0, PrivatePtr, 2) 1232 .scalarize(0) 1233 .widenScalarToNextPow2(0) 1234 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1235 1236 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1237 // be more flexible with the shift amount type. 1238 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1239 .legalFor({{S32, S32}, {S64, S32}}); 1240 if (ST.has16BitInsts()) { 1241 if (ST.hasVOP3PInsts()) { 1242 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1243 .clampMaxNumElements(0, S16, 2); 1244 } else 1245 Shifts.legalFor({{S16, S16}}); 1246 1247 // TODO: Support 16-bit shift amounts for all types 1248 Shifts.widenScalarIf( 1249 [=](const LegalityQuery &Query) { 1250 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1251 // 32-bit amount. 
1252 const LLT ValTy = Query.Types[0]; 1253 const LLT AmountTy = Query.Types[1]; 1254 return ValTy.getSizeInBits() <= 16 && 1255 AmountTy.getSizeInBits() < 16; 1256 }, changeTo(1, S16)); 1257 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1258 Shifts.clampScalar(1, S32, S32); 1259 Shifts.clampScalar(0, S16, S64); 1260 Shifts.widenScalarToNextPow2(0, 16); 1261 } else { 1262 // Make sure we legalize the shift amount type first, as the general 1263 // expansion for the shifted type will produce much worse code if it hasn't 1264 // been truncated already. 1265 Shifts.clampScalar(1, S32, S32); 1266 Shifts.clampScalar(0, S32, S64); 1267 Shifts.widenScalarToNextPow2(0, 32); 1268 } 1269 Shifts.scalarize(0); 1270 1271 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1272 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1273 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1274 unsigned IdxTypeIdx = 2; 1275 1276 getActionDefinitionsBuilder(Op) 1277 .customIf([=](const LegalityQuery &Query) { 1278 const LLT EltTy = Query.Types[EltTypeIdx]; 1279 const LLT VecTy = Query.Types[VecTypeIdx]; 1280 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1281 return (EltTy.getSizeInBits() == 16 || 1282 EltTy.getSizeInBits() % 32 == 0) && 1283 VecTy.getSizeInBits() % 32 == 0 && 1284 VecTy.getSizeInBits() <= MaxRegisterSize && 1285 IdxTy.getSizeInBits() == 32; 1286 }) 1287 .clampScalar(EltTypeIdx, S32, S64) 1288 .clampScalar(VecTypeIdx, S32, S64) 1289 .clampScalar(IdxTypeIdx, S32, S32); 1290 } 1291 1292 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1293 .unsupportedIf([=](const LegalityQuery &Query) { 1294 const LLT &EltTy = Query.Types[1].getElementType(); 1295 return Query.Types[0] != EltTy; 1296 }); 1297 1298 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1299 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1300 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1301 1302 // FIXME: Doesn't handle extract of illegal sizes. 1303 getActionDefinitionsBuilder(Op) 1304 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1305 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];
        return (BigTy.getSizeInBits() % 32 == 0) &&
               (LitTy.getSizeInBits() % 16 == 0);
      })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        return BigTy.getSizeInBits() == 32;
      })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= MaxRegisterSize;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1478 SextInReg.lowerFor({{S32}, {S64}}); 1479 } 1480 1481 SextInReg 1482 .scalarize(0) 1483 .clampScalar(0, S32, S64) 1484 .lower(); 1485 1486 getActionDefinitionsBuilder(G_FSHR) 1487 .legalFor({{S32, S32}}) 1488 .scalarize(0) 1489 .lower(); 1490 1491 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1492 .legalFor({S64}); 1493 1494 getActionDefinitionsBuilder(G_FENCE) 1495 .alwaysLegal(); 1496 1497 getActionDefinitionsBuilder({ 1498 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1499 G_FCOPYSIGN, 1500 1501 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1502 G_ATOMICRMW_NAND, 1503 G_ATOMICRMW_FSUB, 1504 G_READ_REGISTER, 1505 G_WRITE_REGISTER, 1506 1507 G_SADDO, G_SSUBO, 1508 1509 // TODO: Implement 1510 G_FMINIMUM, G_FMAXIMUM, 1511 G_FSHL 1512 }).lower(); 1513 1514 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1515 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1516 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1517 .unsupported(); 1518 1519 computeTables(); 1520 verify(*ST.getInstrInfo()); 1521 } 1522 1523 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1524 MachineInstr &MI) const { 1525 MachineIRBuilder &B = Helper.MIRBuilder; 1526 MachineRegisterInfo &MRI = *B.getMRI(); 1527 GISelChangeObserver &Observer = Helper.Observer; 1528 1529 switch (MI.getOpcode()) { 1530 case TargetOpcode::G_ADDRSPACE_CAST: 1531 return legalizeAddrSpaceCast(MI, MRI, B); 1532 case TargetOpcode::G_FRINT: 1533 return legalizeFrint(MI, MRI, B); 1534 case TargetOpcode::G_FCEIL: 1535 return legalizeFceil(MI, MRI, B); 1536 case TargetOpcode::G_INTRINSIC_TRUNC: 1537 return legalizeIntrinsicTrunc(MI, MRI, B); 1538 case TargetOpcode::G_SITOFP: 1539 return legalizeITOFP(MI, MRI, B, true); 1540 case TargetOpcode::G_UITOFP: 1541 return legalizeITOFP(MI, MRI, B, false); 1542 case TargetOpcode::G_FPTOSI: 1543 return legalizeFPTOI(MI, MRI, B, true); 1544 case TargetOpcode::G_FPTOUI: 1545 return legalizeFPTOI(MI, MRI, B, false); 1546 case TargetOpcode::G_FMINNUM: 1547 case TargetOpcode::G_FMAXNUM: 1548 case TargetOpcode::G_FMINNUM_IEEE: 1549 case TargetOpcode::G_FMAXNUM_IEEE: 1550 return legalizeMinNumMaxNum(Helper, MI); 1551 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1552 return legalizeExtractVectorElt(MI, MRI, B); 1553 case TargetOpcode::G_INSERT_VECTOR_ELT: 1554 return legalizeInsertVectorElt(MI, MRI, B); 1555 case TargetOpcode::G_SHUFFLE_VECTOR: 1556 return legalizeShuffleVector(MI, MRI, B); 1557 case TargetOpcode::G_FSIN: 1558 case TargetOpcode::G_FCOS: 1559 return legalizeSinCos(MI, MRI, B); 1560 case TargetOpcode::G_GLOBAL_VALUE: 1561 return legalizeGlobalValue(MI, MRI, B); 1562 case TargetOpcode::G_LOAD: 1563 return legalizeLoad(MI, MRI, B, Observer); 1564 case TargetOpcode::G_FMAD: 1565 return legalizeFMad(MI, MRI, B); 1566 case TargetOpcode::G_FDIV: 1567 return legalizeFDIV(MI, MRI, B); 1568 case TargetOpcode::G_UDIV: 1569 case TargetOpcode::G_UREM: 1570 return legalizeUDIV_UREM(MI, MRI, B); 1571 case TargetOpcode::G_SDIV: 1572 case TargetOpcode::G_SREM: 1573 return legalizeSDIV_SREM(MI, MRI, B); 1574 case TargetOpcode::G_ATOMIC_CMPXCHG: 1575 return legalizeAtomicCmpXChg(MI, MRI, B); 1576 case TargetOpcode::G_FLOG: 1577 return legalizeFlog(MI, B, numbers::ln2f); 1578 case TargetOpcode::G_FLOG10: 1579 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1580 case TargetOpcode::G_FEXP: 1581 return legalizeFExp(MI, B); 1582 case TargetOpcode::G_FPOW: 1583 return legalizeFPow(MI, B); 1584 case TargetOpcode::G_FFLOOR: 1585 return legalizeFFloor(MI, MRI, B); 1586 case TargetOpcode::G_BUILD_VECTOR: 
1587 return legalizeBuildVector(MI, MRI, B); 1588 default: 1589 return false; 1590 } 1591 1592 llvm_unreachable("expected switch to return"); 1593 } 1594 1595 Register AMDGPULegalizerInfo::getSegmentAperture( 1596 unsigned AS, 1597 MachineRegisterInfo &MRI, 1598 MachineIRBuilder &B) const { 1599 MachineFunction &MF = B.getMF(); 1600 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1601 const LLT S32 = LLT::scalar(32); 1602 1603 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1604 1605 if (ST.hasApertureRegs()) { 1606 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1607 // getreg. 1608 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1609 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1610 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1611 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1612 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1613 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1614 unsigned Encoding = 1615 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1616 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1617 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1618 1619 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1620 1621 B.buildInstr(AMDGPU::S_GETREG_B32) 1622 .addDef(GetReg) 1623 .addImm(Encoding); 1624 MRI.setType(GetReg, S32); 1625 1626 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1627 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1628 } 1629 1630 Register QueuePtr = MRI.createGenericVirtualRegister( 1631 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1632 1633 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1634 return Register(); 1635 1636 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1637 // private_segment_aperture_base_hi. 1638 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1639 1640 // TODO: can we be smarter about machine pointer info? 1641 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1642 MachineMemOperand *MMO = MF.getMachineMemOperand( 1643 PtrInfo, 1644 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1645 MachineMemOperand::MOInvariant, 1646 4, commonAlignment(Align(64), StructOffset)); 1647 1648 Register LoadAddr; 1649 1650 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1651 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1652 } 1653 1654 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1655 MachineInstr &MI, MachineRegisterInfo &MRI, 1656 MachineIRBuilder &B) const { 1657 MachineFunction &MF = B.getMF(); 1658 1659 const LLT S32 = LLT::scalar(32); 1660 Register Dst = MI.getOperand(0).getReg(); 1661 Register Src = MI.getOperand(1).getReg(); 1662 1663 LLT DstTy = MRI.getType(Dst); 1664 LLT SrcTy = MRI.getType(Src); 1665 unsigned DestAS = DstTy.getAddressSpace(); 1666 unsigned SrcAS = SrcTy.getAddressSpace(); 1667 1668 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1669 // vector element. 1670 assert(!DstTy.isVector()); 1671 1672 const AMDGPUTargetMachine &TM 1673 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1674 1675 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1676 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1677 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1678 return true; 1679 } 1680 1681 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1682 // Truncate. 
1683 B.buildExtract(Dst, Src, 0); 1684 MI.eraseFromParent(); 1685 return true; 1686 } 1687 1688 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1689 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1690 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1691 1692 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1693 // another. Merge operands are required to be the same type, but creating an 1694 // extra ptrtoint would be kind of pointless. 1695 auto HighAddr = B.buildConstant( 1696 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1697 B.buildMerge(Dst, {Src, HighAddr}); 1698 MI.eraseFromParent(); 1699 return true; 1700 } 1701 1702 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1703 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1704 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1705 unsigned NullVal = TM.getNullPointerValue(DestAS); 1706 1707 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1708 auto FlatNull = B.buildConstant(SrcTy, 0); 1709 1710 // Extract low 32-bits of the pointer. 1711 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1712 1713 auto CmpRes = 1714 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1715 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1716 1717 MI.eraseFromParent(); 1718 return true; 1719 } 1720 1721 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1722 return false; 1723 1724 if (!ST.hasFlatAddressSpace()) 1725 return false; 1726 1727 auto SegmentNull = 1728 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1729 auto FlatNull = 1730 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1731 1732 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1733 if (!ApertureReg.isValid()) 1734 return false; 1735 1736 auto CmpRes = 1737 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1738 1739 // Coerce the type of the low half of the result so we can use merge_values. 1740 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1741 1742 // TODO: Should we allow mismatched types but matching sizes in merges to 1743 // avoid the ptrtoint? 1744 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1745 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1746 1747 MI.eraseFromParent(); 1748 return true; 1749 } 1750 1751 bool AMDGPULegalizerInfo::legalizeFrint( 1752 MachineInstr &MI, MachineRegisterInfo &MRI, 1753 MachineIRBuilder &B) const { 1754 Register Src = MI.getOperand(1).getReg(); 1755 LLT Ty = MRI.getType(Src); 1756 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1757 1758 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1759 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1760 1761 auto C1 = B.buildFConstant(Ty, C1Val); 1762 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1763 1764 // TODO: Should this propagate fast-math-flags? 
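  // Round-to-nearest via the 2^52 trick: for |src| < 2^52, adding and then
  // subtracting copysign(2^52, src) rounds the fraction bits away in double
  // precision, producing the nearest integer. The final select keeps the
  // original value whenever |src| > C2 (2^52 - 0.5), since such values are
  // already integral.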
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
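  // Placing the sign bit in the high word yields a signed zero as a 64-bit
  // pattern. Shifting the fraction mask right by the unbiased exponent then
  // gives the sub-integer bits to clear from the source: an exponent below
  // zero selects the signed zero, and an exponent above 51 means the value is
  // already integral and is returned unchanged.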
1851 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1852 1853 auto Shr = B.buildAShr(S64, FractMask, Exp); 1854 auto Not = B.buildNot(S64, Shr); 1855 auto Tmp0 = B.buildAnd(S64, Src, Not); 1856 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1857 1858 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1859 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1860 1861 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1862 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1863 MI.eraseFromParent(); 1864 return true; 1865 } 1866 1867 bool AMDGPULegalizerInfo::legalizeITOFP( 1868 MachineInstr &MI, MachineRegisterInfo &MRI, 1869 MachineIRBuilder &B, bool Signed) const { 1870 1871 Register Dst = MI.getOperand(0).getReg(); 1872 Register Src = MI.getOperand(1).getReg(); 1873 1874 const LLT S64 = LLT::scalar(64); 1875 const LLT S32 = LLT::scalar(32); 1876 1877 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1878 1879 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1880 1881 auto CvtHi = Signed ? 1882 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1883 B.buildUITOFP(S64, Unmerge.getReg(1)); 1884 1885 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1886 1887 auto ThirtyTwo = B.buildConstant(S32, 32); 1888 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1889 .addUse(CvtHi.getReg(0)) 1890 .addUse(ThirtyTwo.getReg(0)); 1891 1892 // TODO: Should this propagate fast-math-flags? 1893 B.buildFAdd(Dst, LdExp, CvtLo); 1894 MI.eraseFromParent(); 1895 return true; 1896 } 1897 1898 // TODO: Copied from DAG implementation. Verify logic and document how this 1899 // actually works. 1900 bool AMDGPULegalizerInfo::legalizeFPTOI( 1901 MachineInstr &MI, MachineRegisterInfo &MRI, 1902 MachineIRBuilder &B, bool Signed) const { 1903 1904 Register Dst = MI.getOperand(0).getReg(); 1905 Register Src = MI.getOperand(1).getReg(); 1906 1907 const LLT S64 = LLT::scalar(64); 1908 const LLT S32 = LLT::scalar(32); 1909 1910 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1911 1912 unsigned Flags = MI.getFlags(); 1913 1914 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1915 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1916 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1917 1918 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1919 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1920 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1921 1922 auto Hi = Signed ? 
1923 B.buildFPTOSI(S32, FloorMul) : 1924 B.buildFPTOUI(S32, FloorMul); 1925 auto Lo = B.buildFPTOUI(S32, Fma); 1926 1927 B.buildMerge(Dst, { Lo, Hi }); 1928 MI.eraseFromParent(); 1929 1930 return true; 1931 } 1932 1933 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1934 MachineInstr &MI) const { 1935 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1936 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1937 1938 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1939 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1940 1941 // With ieee_mode disabled, the instructions have the correct behavior 1942 // already for G_FMINNUM/G_FMAXNUM 1943 if (!MFI->getMode().IEEE) 1944 return !IsIEEEOp; 1945 1946 if (IsIEEEOp) 1947 return true; 1948 1949 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1950 } 1951 1952 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1953 MachineInstr &MI, MachineRegisterInfo &MRI, 1954 MachineIRBuilder &B) const { 1955 // TODO: Should move some of this into LegalizerHelper. 1956 1957 // TODO: Promote dynamic indexing of s16 to s32 1958 1959 // FIXME: Artifact combiner probably should have replaced the truncated 1960 // constant before this, so we shouldn't need 1961 // getConstantVRegValWithLookThrough. 1962 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1963 MI.getOperand(2).getReg(), MRI); 1964 if (!IdxVal) // Dynamic case will be selected to register indexing. 1965 return true; 1966 1967 Register Dst = MI.getOperand(0).getReg(); 1968 Register Vec = MI.getOperand(1).getReg(); 1969 1970 LLT VecTy = MRI.getType(Vec); 1971 LLT EltTy = VecTy.getElementType(); 1972 assert(EltTy == MRI.getType(Dst)); 1973 1974 if (IdxVal->Value < VecTy.getNumElements()) 1975 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1976 else 1977 B.buildUndef(Dst); 1978 1979 MI.eraseFromParent(); 1980 return true; 1981 } 1982 1983 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1984 MachineInstr &MI, MachineRegisterInfo &MRI, 1985 MachineIRBuilder &B) const { 1986 // TODO: Should move some of this into LegalizerHelper. 1987 1988 // TODO: Promote dynamic indexing of s16 to s32 1989 1990 // FIXME: Artifact combiner probably should have replaced the truncated 1991 // constant before this, so we shouldn't need 1992 // getConstantVRegValWithLookThrough. 1993 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1994 MI.getOperand(3).getReg(), MRI); 1995 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1996 return true; 1997 1998 Register Dst = MI.getOperand(0).getReg(); 1999 Register Vec = MI.getOperand(1).getReg(); 2000 Register Ins = MI.getOperand(2).getReg(); 2001 2002 LLT VecTy = MRI.getType(Vec); 2003 LLT EltTy = VecTy.getElementType(); 2004 assert(EltTy == MRI.getType(Ins)); 2005 2006 if (IdxVal->Value < VecTy.getNumElements()) 2007 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 2008 else 2009 B.buildUndef(Dst); 2010 2011 MI.eraseFromParent(); 2012 return true; 2013 } 2014 2015 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2016 MachineInstr &MI, MachineRegisterInfo &MRI, 2017 MachineIRBuilder &B) const { 2018 const LLT V2S16 = LLT::vector(2, 16); 2019 2020 Register Dst = MI.getOperand(0).getReg(); 2021 Register Src0 = MI.getOperand(1).getReg(); 2022 LLT DstTy = MRI.getType(Dst); 2023 LLT SrcTy = MRI.getType(Src0); 2024 2025 if (SrcTy == V2S16 && DstTy == V2S16 && 2026 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2027 return true; 2028 2029 MachineIRBuilder HelperBuilder(MI); 2030 GISelObserverWrapper DummyObserver; 2031 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2032 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2033 } 2034 2035 bool AMDGPULegalizerInfo::legalizeSinCos( 2036 MachineInstr &MI, MachineRegisterInfo &MRI, 2037 MachineIRBuilder &B) const { 2038 2039 Register DstReg = MI.getOperand(0).getReg(); 2040 Register SrcReg = MI.getOperand(1).getReg(); 2041 LLT Ty = MRI.getType(DstReg); 2042 unsigned Flags = MI.getFlags(); 2043 2044 Register TrigVal; 2045 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2046 if (ST.hasTrigReducedRange()) { 2047 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2048 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2049 .addUse(MulVal.getReg(0)) 2050 .setMIFlags(Flags).getReg(0); 2051 } else 2052 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2053 2054 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2055 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2056 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2057 .addUse(TrigVal) 2058 .setMIFlags(Flags); 2059 MI.eraseFromParent(); 2060 return true; 2061 } 2062 2063 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2064 MachineIRBuilder &B, 2065 const GlobalValue *GV, 2066 int64_t Offset, 2067 unsigned GAFlags) const { 2068 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2069 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2070 // to the following code sequence: 2071 // 2072 // For constant address space: 2073 // s_getpc_b64 s[0:1] 2074 // s_add_u32 s0, s0, $symbol 2075 // s_addc_u32 s1, s1, 0 2076 // 2077 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2078 // a fixup or relocation is emitted to replace $symbol with a literal 2079 // constant, which is a pc-relative offset from the encoding of the $symbol 2080 // operand to the global variable. 
2081 // 2082 // For global address space: 2083 // s_getpc_b64 s[0:1] 2084 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2085 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2086 // 2087 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2088 // fixups or relocations are emitted to replace $symbol@*@lo and 2089 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2090 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2091 // operand to the global variable. 2092 // 2093 // What we want here is an offset from the value returned by s_getpc 2094 // (which is the address of the s_add_u32 instruction) to the global 2095 // variable, but since the encoding of $symbol starts 4 bytes after the start 2096 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2097 // small. This requires us to add 4 to the global variable offset in order to 2098 // compute the correct address. 2099 2100 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2101 2102 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2103 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2104 2105 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2106 .addDef(PCReg); 2107 2108 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2109 if (GAFlags == SIInstrInfo::MO_NONE) 2110 MIB.addImm(0); 2111 else 2112 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2113 2114 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2115 2116 if (PtrTy.getSizeInBits() == 32) 2117 B.buildExtract(DstReg, PCReg, 0); 2118 return true; 2119 } 2120 2121 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2122 MachineInstr &MI, MachineRegisterInfo &MRI, 2123 MachineIRBuilder &B) const { 2124 Register DstReg = MI.getOperand(0).getReg(); 2125 LLT Ty = MRI.getType(DstReg); 2126 unsigned AS = Ty.getAddressSpace(); 2127 2128 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2129 MachineFunction &MF = B.getMF(); 2130 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2131 2132 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2133 if (!MFI->isEntryFunction()) { 2134 const Function &Fn = MF.getFunction(); 2135 DiagnosticInfoUnsupported BadLDSDecl( 2136 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2137 DS_Warning); 2138 Fn.getContext().diagnose(BadLDSDecl); 2139 2140 // We currently don't have a way to correctly allocate LDS objects that 2141 // aren't directly associated with a kernel. We do force inlining of 2142 // functions that use local objects. However, if these dead functions are 2143 // not eliminated, we don't want a compile time error. Just emit a warning 2144 // and a trap, since there should be no callable path here. 2145 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2146 B.buildUndef(DstReg); 2147 MI.eraseFromParent(); 2148 return true; 2149 } 2150 2151 // TODO: We could emit code to handle the initialization somewhere. 
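    // Only LDS globals without a defined (non-undef) initializer are handled
    // here: they are assigned a static offset in the LDS block via
    // allocateLDSGlobal below. Anything else is diagnosed as unsupported,
    // since LDS memory cannot be pre-initialized from the host.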
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
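  // G_FMAD is only kept when denormals are flushed for the result type, since
  // the mad/mac instructions it ultimately selects to do not respect denormal
  // values; otherwise defer to the generic FMAD expansion.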
2231 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2232 return true; 2233 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2234 return true; 2235 2236 MachineIRBuilder HelperBuilder(MI); 2237 GISelObserverWrapper DummyObserver; 2238 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2239 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2240 } 2241 2242 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2243 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2244 Register DstReg = MI.getOperand(0).getReg(); 2245 Register PtrReg = MI.getOperand(1).getReg(); 2246 Register CmpVal = MI.getOperand(2).getReg(); 2247 Register NewVal = MI.getOperand(3).getReg(); 2248 2249 assert(SITargetLowering::isFlatGlobalAddrSpace( 2250 MRI.getType(PtrReg).getAddressSpace()) && 2251 "this should not have been custom lowered"); 2252 2253 LLT ValTy = MRI.getType(CmpVal); 2254 LLT VecTy = LLT::vector(2, ValTy); 2255 2256 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2257 2258 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2259 .addDef(DstReg) 2260 .addUse(PtrReg) 2261 .addUse(PackedVal) 2262 .setMemRefs(MI.memoperands()); 2263 2264 MI.eraseFromParent(); 2265 return true; 2266 } 2267 2268 bool AMDGPULegalizerInfo::legalizeFlog( 2269 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2270 Register Dst = MI.getOperand(0).getReg(); 2271 Register Src = MI.getOperand(1).getReg(); 2272 LLT Ty = B.getMRI()->getType(Dst); 2273 unsigned Flags = MI.getFlags(); 2274 2275 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2276 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2277 2278 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2279 MI.eraseFromParent(); 2280 return true; 2281 } 2282 2283 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2284 MachineIRBuilder &B) const { 2285 Register Dst = MI.getOperand(0).getReg(); 2286 Register Src = MI.getOperand(1).getReg(); 2287 unsigned Flags = MI.getFlags(); 2288 LLT Ty = B.getMRI()->getType(Dst); 2289 2290 auto K = B.buildFConstant(Ty, numbers::log2e); 2291 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2292 B.buildFExp2(Dst, Mul, Flags); 2293 MI.eraseFromParent(); 2294 return true; 2295 } 2296 2297 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2298 MachineIRBuilder &B) const { 2299 Register Dst = MI.getOperand(0).getReg(); 2300 Register Src0 = MI.getOperand(1).getReg(); 2301 Register Src1 = MI.getOperand(2).getReg(); 2302 unsigned Flags = MI.getFlags(); 2303 LLT Ty = B.getMRI()->getType(Dst); 2304 const LLT S16 = LLT::scalar(16); 2305 const LLT S32 = LLT::scalar(32); 2306 2307 if (Ty == S32) { 2308 auto Log = B.buildFLog2(S32, Src0, Flags); 2309 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2310 .addUse(Log.getReg(0)) 2311 .addUse(Src1) 2312 .setMIFlags(Flags); 2313 B.buildFExp2(Dst, Mul, Flags); 2314 } else if (Ty == S16) { 2315 // There's no f16 fmul_legacy, so we need to convert for it. 
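    // pow(x, y) is expanded as exp2(y * log2(x)), with the multiply using the
    // legacy (0 * anything = 0) semantics. For f16 the multiply is performed
    // in f32 and truncated back, roughly:
    //   %l   = G_FLOG2 %x:s16
    //   %m   = llvm.amdgcn.fmul.legacy(fpext %l, fpext %y)
    //   %res = G_FEXP2 (G_FPTRUNC %m to s16)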
2316 auto Log = B.buildFLog2(S16, Src0, Flags); 2317 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2318 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2319 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2320 .addUse(Ext0.getReg(0)) 2321 .addUse(Ext1.getReg(0)) 2322 .setMIFlags(Flags); 2323 2324 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2325 } else 2326 return false; 2327 2328 MI.eraseFromParent(); 2329 return true; 2330 } 2331 2332 // Find a source register, ignoring any possible source modifiers. 2333 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2334 Register ModSrc = OrigSrc; 2335 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2336 ModSrc = SrcFNeg->getOperand(1).getReg(); 2337 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2338 ModSrc = SrcFAbs->getOperand(1).getReg(); 2339 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2340 ModSrc = SrcFAbs->getOperand(1).getReg(); 2341 return ModSrc; 2342 } 2343 2344 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2345 MachineRegisterInfo &MRI, 2346 MachineIRBuilder &B) const { 2347 2348 const LLT S1 = LLT::scalar(1); 2349 const LLT S64 = LLT::scalar(64); 2350 Register Dst = MI.getOperand(0).getReg(); 2351 Register OrigSrc = MI.getOperand(1).getReg(); 2352 unsigned Flags = MI.getFlags(); 2353 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2354 "this should not have been custom lowered"); 2355 2356 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2357 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2358 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2359 // V_FRACT bug is: 2360 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2361 // 2362 // Convert floor(x) to (x - fract(x)) 2363 2364 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2365 .addUse(OrigSrc) 2366 .setMIFlags(Flags); 2367 2368 // Give source modifier matching some assistance before obscuring a foldable 2369 // pattern. 2370 2371 // TODO: We can avoid the neg on the fract? The input sign to fract 2372 // shouldn't matter? 2373 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2374 2375 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2376 2377 Register Min = MRI.createGenericVirtualRegister(S64); 2378 2379 // We don't need to concern ourselves with the snan handling difference, so 2380 // use the one which will directly select. 2381 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2382 if (MFI->getMode().IEEE) 2383 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2384 else 2385 B.buildFMinNum(Min, Fract, Const, Flags); 2386 2387 Register CorrectedFract = Min; 2388 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2389 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2390 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2391 } 2392 2393 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2394 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2395 2396 MI.eraseFromParent(); 2397 return true; 2398 } 2399 2400 // Turn an illegal packed v2s16 build vector into bit operations. 2401 // TODO: This should probably be a bitcast action in LegalizerHelper. 
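// Illustrative expansion:
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo:_(s16), %hi:_(s16)
// becomes
//   %m:_(s32)       = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)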
2402 bool AMDGPULegalizerInfo::legalizeBuildVector( 2403 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2404 Register Dst = MI.getOperand(0).getReg(); 2405 const LLT S32 = LLT::scalar(32); 2406 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2407 2408 Register Src0 = MI.getOperand(1).getReg(); 2409 Register Src1 = MI.getOperand(2).getReg(); 2410 assert(MRI.getType(Src0) == LLT::scalar(16)); 2411 2412 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2413 B.buildBitcast(Dst, Merge); 2414 2415 MI.eraseFromParent(); 2416 return true; 2417 } 2418 2419 // Return the use branch instruction, otherwise null if the usage is invalid. 2420 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2421 MachineRegisterInfo &MRI, 2422 MachineInstr *&Br, 2423 MachineBasicBlock *&UncondBrTarget) { 2424 Register CondDef = MI.getOperand(0).getReg(); 2425 if (!MRI.hasOneNonDBGUse(CondDef)) 2426 return nullptr; 2427 2428 MachineBasicBlock *Parent = MI.getParent(); 2429 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2430 if (UseMI.getParent() != Parent || 2431 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2432 return nullptr; 2433 2434 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2435 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2436 if (Next == Parent->end()) { 2437 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2438 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2439 return nullptr; 2440 UncondBrTarget = &*NextMBB; 2441 } else { 2442 if (Next->getOpcode() != AMDGPU::G_BR) 2443 return nullptr; 2444 Br = &*Next; 2445 UncondBrTarget = Br->getOperand(0).getMBB(); 2446 } 2447 2448 return &UseMI; 2449 } 2450 2451 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2452 MachineRegisterInfo &MRI, 2453 Register LiveIn, 2454 Register PhyReg) const { 2455 assert(PhyReg.isPhysical() && "Physical register expected"); 2456 2457 // Insert the live-in copy, if required, by defining destination virtual 2458 // register. 2459 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), the live-in copy from the physical
  // register to the virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
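    // Masked arguments (e.g. the packed workitem IDs) share a single input
    // register; the field described by the mask is recovered with a logical
    // shift right by the mask's trailing zero count followed by an AND with
    // the shifted-down mask.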
2511 const LLT S32 = LLT::scalar(32); 2512 const unsigned Mask = Arg->getMask(); 2513 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2514 2515 Register AndMaskSrc = LiveIn; 2516 2517 if (Shift != 0) { 2518 auto ShiftAmt = B.buildConstant(S32, Shift); 2519 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2520 } 2521 2522 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2523 } else { 2524 B.buildCopy(DstReg, LiveIn); 2525 } 2526 2527 return true; 2528 } 2529 2530 bool AMDGPULegalizerInfo::loadInputValue( 2531 Register DstReg, MachineIRBuilder &B, 2532 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2533 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2534 const ArgDescriptor *Arg; 2535 const TargetRegisterClass *ArgRC; 2536 LLT ArgTy; 2537 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2538 2539 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2540 return false; // TODO: Handle these 2541 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2542 } 2543 2544 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2545 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2546 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2547 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2548 return false; 2549 2550 MI.eraseFromParent(); 2551 return true; 2552 } 2553 2554 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2555 MachineRegisterInfo &MRI, 2556 MachineIRBuilder &B) const { 2557 Register Dst = MI.getOperand(0).getReg(); 2558 LLT DstTy = MRI.getType(Dst); 2559 LLT S16 = LLT::scalar(16); 2560 LLT S32 = LLT::scalar(32); 2561 LLT S64 = LLT::scalar(64); 2562 2563 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2564 return true; 2565 2566 if (DstTy == S16) 2567 return legalizeFDIV16(MI, MRI, B); 2568 if (DstTy == S32) 2569 return legalizeFDIV32(MI, MRI, B); 2570 if (DstTy == S64) 2571 return legalizeFDIV64(MI, MRI, B); 2572 2573 return false; 2574 } 2575 2576 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2577 Register DstReg, 2578 Register X, 2579 Register Y, 2580 bool IsDiv) const { 2581 const LLT S1 = LLT::scalar(1); 2582 const LLT S32 = LLT::scalar(32); 2583 2584 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2585 // algorithm used here. 2586 2587 // Initial estimate of inv(y). 2588 auto FloatY = B.buildUITOFP(S32, Y); 2589 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2590 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2591 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2592 auto Z = B.buildFPTOUI(S32, ScaledY); 2593 2594 // One round of UNR. 2595 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2596 auto NegYZ = B.buildMul(S32, NegY, Z); 2597 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2598 2599 // Quotient/remainder estimate. 2600 auto Q = B.buildUMulH(S32, X, Z); 2601 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2602 2603 // First quotient/remainder refinement. 2604 auto One = B.buildConstant(S32, 1); 2605 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2606 if (IsDiv) 2607 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2608 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2609 2610 // Second quotient/remainder refinement. 
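  // After the single Newton-Raphson step above, the quotient estimate may
  // still be slightly low, which is why two conditional corrections (add one
  // to the quotient / subtract Y from the remainder) are applied.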
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
    B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
    B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
    B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
    B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 =
B.buildConstant(S32, 0); 2709 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2710 auto Add2_HiC = 2711 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2712 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2713 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2714 2715 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2716 Register NumerLo = UnmergeNumer.getReg(0); 2717 Register NumerHi = UnmergeNumer.getReg(1); 2718 2719 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2720 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2721 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2722 Register Mul3_Lo = UnmergeMul3.getReg(0); 2723 Register Mul3_Hi = UnmergeMul3.getReg(1); 2724 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2725 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2726 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2727 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2728 2729 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2730 Register DenomLo = UnmergeDenom.getReg(0); 2731 Register DenomHi = UnmergeDenom.getReg(1); 2732 2733 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2734 auto C1 = B.buildSExt(S32, CmpHi); 2735 2736 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2737 auto C2 = B.buildSExt(S32, CmpLo); 2738 2739 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2740 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2741 2742 // TODO: Here and below portions of the code can be enclosed into if/endif. 2743 // Currently control flow is unconditional and we have 4 selects after 2744 // potential endif to substitute PHIs. 2745 2746 // if C3 != 0 ... 2747 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2748 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2749 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2750 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2751 2752 auto One64 = B.buildConstant(S64, 1); 2753 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2754 2755 auto C4 = 2756 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2757 auto C5 = 2758 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2759 auto C6 = B.buildSelect( 2760 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2761 2762 // if (C6 != 0) 2763 auto Add4 = B.buildAdd(S64, Add3, One64); 2764 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2765 2766 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2767 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2768 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2769 2770 // endif C6 2771 // endif C3 2772 2773 if (IsDiv) { 2774 auto Sel1 = B.buildSelect( 2775 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2776 B.buildSelect(DstReg, 2777 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2778 } else { 2779 auto Sel2 = B.buildSelect( 2780 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2781 B.buildSelect(DstReg, 2782 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2783 } 2784 } 2785 2786 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2787 MachineRegisterInfo &MRI, 2788 MachineIRBuilder &B) const { 2789 const LLT S64 = LLT::scalar(64); 2790 const LLT S32 = LLT::scalar(32); 2791 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2792 Register DstReg = MI.getOperand(0).getReg(); 2793 Register Num 
= MI.getOperand(1).getReg(); 2794 Register Den = MI.getOperand(2).getReg(); 2795 LLT Ty = MRI.getType(DstReg); 2796 2797 if (Ty == S32) 2798 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2799 else if (Ty == S64) 2800 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2801 else 2802 return false; 2803 2804 MI.eraseFromParent(); 2805 return true; 2806 2807 } 2808 2809 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2810 MachineRegisterInfo &MRI, 2811 MachineIRBuilder &B) const { 2812 const LLT S64 = LLT::scalar(64); 2813 const LLT S32 = LLT::scalar(32); 2814 2815 Register DstReg = MI.getOperand(0).getReg(); 2816 const LLT Ty = MRI.getType(DstReg); 2817 if (Ty != S32 && Ty != S64) 2818 return false; 2819 2820 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2821 2822 Register LHS = MI.getOperand(1).getReg(); 2823 Register RHS = MI.getOperand(2).getReg(); 2824 2825 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2826 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2827 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2828 2829 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2830 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2831 2832 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2833 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2834 2835 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2836 if (Ty == S32) 2837 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2838 else 2839 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2840 2841 Register Sign; 2842 if (IsDiv) 2843 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2844 else 2845 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2846 2847 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2848 B.buildSub(DstReg, UDivRem, Sign); 2849 2850 MI.eraseFromParent(); 2851 return true; 2852 } 2853 2854 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2855 MachineRegisterInfo &MRI, 2856 MachineIRBuilder &B) const { 2857 Register Res = MI.getOperand(0).getReg(); 2858 Register LHS = MI.getOperand(1).getReg(); 2859 Register RHS = MI.getOperand(2).getReg(); 2860 2861 uint16_t Flags = MI.getFlags(); 2862 2863 LLT ResTy = MRI.getType(Res); 2864 LLT S32 = LLT::scalar(32); 2865 LLT S64 = LLT::scalar(64); 2866 2867 const MachineFunction &MF = B.getMF(); 2868 bool Unsafe = 2869 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2870 2871 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2872 return false; 2873 2874 if (!Unsafe && ResTy == S32 && 2875 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2876 return false; 2877 2878 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2879 // 1 / x -> RCP(x) 2880 if (CLHS->isExactlyValue(1.0)) { 2881 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2882 .addUse(RHS) 2883 .setMIFlags(Flags); 2884 2885 MI.eraseFromParent(); 2886 return true; 2887 } 2888 2889 // -1 / x -> RCP( FNEG(x) ) 2890 if (CLHS->isExactlyValue(-1.0)) { 2891 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2892 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2893 .addUse(FNeg.getReg(0)) 2894 .setMIFlags(Flags); 2895 2896 MI.eraseFromParent(); 2897 return true; 2898 } 2899 } 2900 2901 // x / y -> x * (1.0 / y) 2902 if (Unsafe) { 2903 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2904 .addUse(RHS) 2905 .setMIFlags(Flags); 2906 B.buildFMul(Res, LHS, RCP, Flags); 2907 2908 MI.eraseFromParent(); 2909 return true; 2910 } 2911 2912 return false; 2913 } 2914 2915 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2916 MachineRegisterInfo &MRI, 2917 MachineIRBuilder &B) const { 2918 Register Res = MI.getOperand(0).getReg(); 2919 Register LHS = MI.getOperand(1).getReg(); 2920 Register RHS = MI.getOperand(2).getReg(); 2921 2922 uint16_t Flags = MI.getFlags(); 2923 2924 LLT S16 = LLT::scalar(16); 2925 LLT S32 = LLT::scalar(32); 2926 2927 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2928 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2929 2930 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2931 .addUse(RHSExt.getReg(0)) 2932 .setMIFlags(Flags); 2933 2934 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2935 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2936 2937 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2938 .addUse(RDst.getReg(0)) 2939 .addUse(RHS) 2940 .addUse(LHS) 2941 .setMIFlags(Flags); 2942 2943 MI.eraseFromParent(); 2944 return true; 2945 } 2946 2947 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2948 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2949 static void toggleSPDenormMode(bool Enable, 2950 MachineIRBuilder &B, 2951 const GCNSubtarget &ST, 2952 AMDGPU::SIModeRegisterDefaults Mode) { 2953 // Set SP denorm mode to this value. 2954 unsigned SPDenormMode = 2955 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2956 2957 if (ST.hasDenormModeInst()) { 2958 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2959 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2960 2961 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2962 B.buildInstr(AMDGPU::S_DENORM_MODE) 2963 .addImm(NewDenormModeValue); 2964 2965 } else { 2966 // Select FP32 bit field in mode register. 2967 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2968 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2969 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2970 2971 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2972 .addImm(SPDenormMode) 2973 .addImm(SPDenormModeBitField); 2974 } 2975 } 2976 2977 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2978 MachineRegisterInfo &MRI, 2979 MachineIRBuilder &B) const { 2980 Register Res = MI.getOperand(0).getReg(); 2981 Register LHS = MI.getOperand(1).getReg(); 2982 Register RHS = MI.getOperand(2).getReg(); 2983 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2984 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2985 2986 uint16_t Flags = MI.getFlags(); 2987 2988 LLT S32 = LLT::scalar(32); 2989 LLT S1 = LLT::scalar(1); 2990 2991 auto One = B.buildFConstant(S32, 1.0f); 2992 2993 auto DenominatorScaled = 2994 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2995 .addUse(LHS) 2996 .addUse(RHS) 2997 .addImm(0) 2998 .setMIFlags(Flags); 2999 auto NumeratorScaled = 3000 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3001 .addUse(LHS) 3002 .addUse(RHS) 3003 .addImm(1) 3004 .setMIFlags(Flags); 3005 3006 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3007 .addUse(DenominatorScaled.getReg(0)) 3008 .setMIFlags(Flags); 3009 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3010 3011 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3012 // aren't modeled as reading it. 
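  // The FMA refinement sequence below needs FP32 denormal values to stay
  // accurate for the scaled operands, so if the function's default mode
  // flushes FP32 denormals they are temporarily enabled around it and then
  // restored.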
3013 if (!Mode.allFP32Denormals()) 3014 toggleSPDenormMode(true, B, ST, Mode); 3015 3016 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3017 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3018 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3019 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3020 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3021 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3022 3023 if (!Mode.allFP32Denormals()) 3024 toggleSPDenormMode(false, B, ST, Mode); 3025 3026 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3027 .addUse(Fma4.getReg(0)) 3028 .addUse(Fma1.getReg(0)) 3029 .addUse(Fma3.getReg(0)) 3030 .addUse(NumeratorScaled.getReg(1)) 3031 .setMIFlags(Flags); 3032 3033 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3034 .addUse(Fmas.getReg(0)) 3035 .addUse(RHS) 3036 .addUse(LHS) 3037 .setMIFlags(Flags); 3038 3039 MI.eraseFromParent(); 3040 return true; 3041 } 3042 3043 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3044 MachineRegisterInfo &MRI, 3045 MachineIRBuilder &B) const { 3046 Register Res = MI.getOperand(0).getReg(); 3047 Register LHS = MI.getOperand(1).getReg(); 3048 Register RHS = MI.getOperand(2).getReg(); 3049 3050 uint16_t Flags = MI.getFlags(); 3051 3052 LLT S64 = LLT::scalar(64); 3053 LLT S1 = LLT::scalar(1); 3054 3055 auto One = B.buildFConstant(S64, 1.0); 3056 3057 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3058 .addUse(LHS) 3059 .addUse(RHS) 3060 .addImm(0) 3061 .setMIFlags(Flags); 3062 3063 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3064 3065 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3066 .addUse(DivScale0.getReg(0)) 3067 .setMIFlags(Flags); 3068 3069 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3070 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3071 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3072 3073 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3074 .addUse(LHS) 3075 .addUse(RHS) 3076 .addImm(1) 3077 .setMIFlags(Flags); 3078 3079 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3080 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3081 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3082 3083 Register Scale; 3084 if (!ST.hasUsableDivScaleConditionOutput()) { 3085 // Workaround a hardware bug on SI where the condition output from div_scale 3086 // is not usable. 
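    // Recompute the condition div_scale should have produced: compare the
    // high halves of the two scaled values against the original operands to
    // see which input was actually scaled, then combine the comparisons with
    // an xor.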
3087 3088 LLT S32 = LLT::scalar(32); 3089 3090 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3091 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3092 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3093 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3094 3095 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3096 Scale1Unmerge.getReg(1)); 3097 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3098 Scale0Unmerge.getReg(1)); 3099 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3100 } else { 3101 Scale = DivScale1.getReg(1); 3102 } 3103 3104 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3105 .addUse(Fma4.getReg(0)) 3106 .addUse(Fma3.getReg(0)) 3107 .addUse(Mul.getReg(0)) 3108 .addUse(Scale) 3109 .setMIFlags(Flags); 3110 3111 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3112 .addUse(Fmas.getReg(0)) 3113 .addUse(RHS) 3114 .addUse(LHS) 3115 .setMIFlags(Flags); 3116 3117 MI.eraseFromParent(); 3118 return true; 3119 } 3120 3121 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3122 MachineRegisterInfo &MRI, 3123 MachineIRBuilder &B) const { 3124 Register Res = MI.getOperand(0).getReg(); 3125 Register LHS = MI.getOperand(2).getReg(); 3126 Register RHS = MI.getOperand(3).getReg(); 3127 uint16_t Flags = MI.getFlags(); 3128 3129 LLT S32 = LLT::scalar(32); 3130 LLT S1 = LLT::scalar(1); 3131 3132 auto Abs = B.buildFAbs(S32, RHS, Flags); 3133 const APFloat C0Val(1.0f); 3134 3135 auto C0 = B.buildConstant(S32, 0x6f800000); 3136 auto C1 = B.buildConstant(S32, 0x2f800000); 3137 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3138 3139 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3140 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3141 3142 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3143 3144 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3145 .addUse(Mul0.getReg(0)) 3146 .setMIFlags(Flags); 3147 3148 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3149 3150 B.buildFMul(Res, Sel, Mul1, Flags); 3151 3152 MI.eraseFromParent(); 3153 return true; 3154 } 3155 3156 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3157 MachineRegisterInfo &MRI, 3158 MachineIRBuilder &B) const { 3159 uint64_t Offset = 3160 ST.getTargetLowering()->getImplicitParameterOffset( 3161 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3162 LLT DstTy = MRI.getType(DstReg); 3163 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3164 3165 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3166 if (!loadInputValue(KernargPtrReg, B, 3167 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3168 return false; 3169 3170 // FIXME: This should be nuw 3171 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3172 return true; 3173 } 3174 3175 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3176 MachineRegisterInfo &MRI, 3177 MachineIRBuilder &B) const { 3178 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3179 if (!MFI->isEntryFunction()) { 3180 return legalizePreloadedArgIntrin(MI, MRI, B, 3181 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3182 } 3183 3184 Register DstReg = MI.getOperand(0).getReg(); 3185 if (!getImplicitArgPtr(DstReg, MRI, B)) 3186 return false; 3187 3188 MI.eraseFromParent(); 3189 return true; 3190 } 3191 3192 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3193 MachineRegisterInfo &MRI, 3194 MachineIRBuilder &B, 3195 unsigned AddrSpace) const { 3196 
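  // A flat pointer lies in the queried segment exactly when its high 32 bits
  // equal that segment's aperture base, so compare the extracted high half of
  // the pointer against the aperture value.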
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3197 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3198 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3199 MI.eraseFromParent(); 3200 return true; 3201 } 3202 3203 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3204 // offset (the offset that is included in bounds checking and swizzling, to be 3205 // split between the instruction's voffset and immoffset fields) and soffset 3206 // (the offset that is excluded from bounds checking and swizzling, to go in 3207 // the instruction's soffset field). This function takes the first kind of 3208 // offset and figures out how to split it between voffset and immoffset. 3209 std::tuple<Register, unsigned, unsigned> 3210 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3211 Register OrigOffset) const { 3212 const unsigned MaxImm = 4095; 3213 Register BaseReg; 3214 unsigned TotalConstOffset; 3215 MachineInstr *OffsetDef; 3216 const LLT S32 = LLT::scalar(32); 3217 3218 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3219 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3220 3221 unsigned ImmOffset = TotalConstOffset; 3222 3223 // If the immediate value is too big for the immoffset field, put the value 3224 // and -4096 into the immoffset field so that the value that is copied/added 3225 // for the voffset field is a multiple of 4096, and it stands more chance 3226 // of being CSEd with the copy/add for another similar load/store. 3227 // However, do not do that rounding down to a multiple of 4096 if that is a 3228 // negative number, as it appears to be illegal to have a negative offset 3229 // in the vgpr, even if adding the immediate offset makes it positive. 3230 unsigned Overflow = ImmOffset & ~MaxImm; 3231 ImmOffset -= Overflow; 3232 if ((int32_t)Overflow < 0) { 3233 Overflow += ImmOffset; 3234 ImmOffset = 0; 3235 } 3236 3237 if (Overflow != 0) { 3238 if (!BaseReg) { 3239 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3240 } else { 3241 auto OverflowVal = B.buildConstant(S32, Overflow); 3242 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3243 } 3244 } 3245 3246 if (!BaseReg) 3247 BaseReg = B.buildConstant(S32, 0).getReg(0); 3248 3249 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3250 } 3251 3252 /// Handle register layout difference for f16 images for some subtargets. 3253 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3254 MachineRegisterInfo &MRI, 3255 Register Reg) const { 3256 if (!ST.hasUnpackedD16VMem()) 3257 return Reg; 3258 3259 const LLT S16 = LLT::scalar(16); 3260 const LLT S32 = LLT::scalar(32); 3261 LLT StoreVT = MRI.getType(Reg); 3262 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3263 3264 auto Unmerge = B.buildUnmerge(S16, Reg); 3265 3266 SmallVector<Register, 4> WideRegs; 3267 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3268 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3269 3270 int NumElts = StoreVT.getNumElements(); 3271 3272 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3273 } 3274 3275 Register AMDGPULegalizerInfo::fixStoreSourceType( 3276 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3277 MachineRegisterInfo *MRI = B.getMRI(); 3278 LLT Ty = MRI->getType(VData); 3279 3280 const LLT S16 = LLT::scalar(16); 3281 3282 // Fixup illegal register types for i8 stores. 
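  // Sub-dword scalar sources are any-extended to s32, since the byte/short
  // buffer store variants only consume the low bits of the source register.
  // Small s16 vectors may instead need the unpacked D16 register layout,
  // which handleD16VData produces for the format intrinsics.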
3283 if (Ty == LLT::scalar(8) || Ty == S16) { 3284 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3285 return AnyExt; 3286 } 3287 3288 if (Ty.isVector()) { 3289 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3290 if (IsFormat) 3291 return handleD16VData(B, *MRI, VData); 3292 } 3293 } 3294 3295 return VData; 3296 } 3297 3298 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3299 MachineRegisterInfo &MRI, 3300 MachineIRBuilder &B, 3301 bool IsTyped, 3302 bool IsFormat) const { 3303 Register VData = MI.getOperand(1).getReg(); 3304 LLT Ty = MRI.getType(VData); 3305 LLT EltTy = Ty.getScalarType(); 3306 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3307 const LLT S32 = LLT::scalar(32); 3308 3309 VData = fixStoreSourceType(B, VData, IsFormat); 3310 Register RSrc = MI.getOperand(2).getReg(); 3311 3312 MachineMemOperand *MMO = *MI.memoperands_begin(); 3313 const int MemSize = MMO->getSize(); 3314 3315 unsigned ImmOffset; 3316 unsigned TotalOffset; 3317 3318 // The typed intrinsics add an immediate after the registers. 3319 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3320 3321 // The struct intrinsic variants add one additional operand over raw. 3322 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3323 Register VIndex; 3324 int OpOffset = 0; 3325 if (HasVIndex) { 3326 VIndex = MI.getOperand(3).getReg(); 3327 OpOffset = 1; 3328 } 3329 3330 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3331 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3332 3333 unsigned Format = 0; 3334 if (IsTyped) { 3335 Format = MI.getOperand(5 + OpOffset).getImm(); 3336 ++OpOffset; 3337 } 3338 3339 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3340 3341 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3342 if (TotalOffset != 0) 3343 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3344 3345 unsigned Opc; 3346 if (IsTyped) { 3347 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3348 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3349 } else if (IsFormat) { 3350 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3351 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3352 } else { 3353 switch (MemSize) { 3354 case 1: 3355 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3356 break; 3357 case 2: 3358 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3359 break; 3360 default: 3361 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3362 break; 3363 } 3364 } 3365 3366 if (!VIndex) 3367 VIndex = B.buildConstant(S32, 0).getReg(0); 3368 3369 auto MIB = B.buildInstr(Opc) 3370 .addUse(VData) // vdata 3371 .addUse(RSrc) // rsrc 3372 .addUse(VIndex) // vindex 3373 .addUse(VOffset) // voffset 3374 .addUse(SOffset) // soffset 3375 .addImm(ImmOffset); // offset(imm) 3376 3377 if (IsTyped) 3378 MIB.addImm(Format); 3379 3380 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3381 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3382 .addMemOperand(MMO); 3383 3384 MI.eraseFromParent(); 3385 return true; 3386 } 3387 3388 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3389 MachineRegisterInfo &MRI, 3390 MachineIRBuilder &B, 3391 bool IsFormat, 3392 bool IsTyped) const { 3393 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
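  // Operand layout handled below (raw forms): dst, intrinsic ID, rsrc, voffset,
  // soffset, aux. The struct variants insert a vindex operand after rsrc, and
  // the typed variants add a format immediate before aux.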
3394 MachineMemOperand *MMO = *MI.memoperands_begin(); 3395 const int MemSize = MMO->getSize(); 3396 const LLT S32 = LLT::scalar(32); 3397 3398 Register Dst = MI.getOperand(0).getReg(); 3399 Register RSrc = MI.getOperand(2).getReg(); 3400 3401 // The typed intrinsics add an immediate after the registers. 3402 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3403 3404 // The struct intrinsic variants add one additional operand over raw. 3405 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3406 Register VIndex; 3407 int OpOffset = 0; 3408 if (HasVIndex) { 3409 VIndex = MI.getOperand(3).getReg(); 3410 OpOffset = 1; 3411 } 3412 3413 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3414 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3415 3416 unsigned Format = 0; 3417 if (IsTyped) { 3418 Format = MI.getOperand(5 + OpOffset).getImm(); 3419 ++OpOffset; 3420 } 3421 3422 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3423 unsigned ImmOffset; 3424 unsigned TotalOffset; 3425 3426 LLT Ty = MRI.getType(Dst); 3427 LLT EltTy = Ty.getScalarType(); 3428 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3429 const bool Unpacked = ST.hasUnpackedD16VMem(); 3430 3431 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3432 if (TotalOffset != 0) 3433 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3434 3435 unsigned Opc; 3436 3437 if (IsTyped) { 3438 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3439 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3440 } else if (IsFormat) { 3441 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3442 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3443 } else { 3444 switch (MemSize) { 3445 case 1: 3446 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3447 break; 3448 case 2: 3449 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3450 break; 3451 default: 3452 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3453 break; 3454 } 3455 } 3456 3457 Register LoadDstReg; 3458 3459 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3460 LLT UnpackedTy = Ty.changeElementSize(32); 3461 3462 if (IsExtLoad) 3463 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3464 else if (Unpacked && IsD16 && Ty.isVector()) 3465 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3466 else 3467 LoadDstReg = Dst; 3468 3469 if (!VIndex) 3470 VIndex = B.buildConstant(S32, 0).getReg(0); 3471 3472 auto MIB = B.buildInstr(Opc) 3473 .addDef(LoadDstReg) // vdata 3474 .addUse(RSrc) // rsrc 3475 .addUse(VIndex) // vindex 3476 .addUse(VOffset) // voffset 3477 .addUse(SOffset) // soffset 3478 .addImm(ImmOffset); // offset(imm) 3479 3480 if (IsTyped) 3481 MIB.addImm(Format); 3482 3483 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3484 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3485 .addMemOperand(MMO); 3486 3487 if (LoadDstReg != Dst) { 3488 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3489 3490 // Widen result for extending loads was widened. 
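    // (The load was emitted into a wider temporary register above; it is
    // truncated or repacked into the original result type here.)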
3491 if (IsExtLoad) 3492 B.buildTrunc(Dst, LoadDstReg); 3493 else { 3494 // Repack to original 16-bit vector result 3495 // FIXME: G_TRUNC should work, but legalization currently fails 3496 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3497 SmallVector<Register, 4> Repack; 3498 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3499 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3500 B.buildMerge(Dst, Repack); 3501 } 3502 } 3503 3504 MI.eraseFromParent(); 3505 return true; 3506 } 3507 3508 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3509 MachineIRBuilder &B, 3510 bool IsInc) const { 3511 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3512 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3513 B.buildInstr(Opc) 3514 .addDef(MI.getOperand(0).getReg()) 3515 .addUse(MI.getOperand(2).getReg()) 3516 .addUse(MI.getOperand(3).getReg()) 3517 .cloneMemRefs(MI); 3518 MI.eraseFromParent(); 3519 return true; 3520 } 3521 3522 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3523 switch (IntrID) { 3524 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3525 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3526 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3527 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3528 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3529 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3530 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3531 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3532 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3533 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3534 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3535 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3536 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3537 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3538 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3539 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3540 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3541 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3542 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3543 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3544 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3545 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3546 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3547 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3548 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3549 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3550 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3551 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3552 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3553 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3554 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3555 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3556 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3557 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3558 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3559 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3560 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3561 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3562 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3563 default: 3564 llvm_unreachable("unhandled atomic opcode"); 3565 } 3566 } 3567 3568 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3569 MachineIRBuilder &B, 3570 Intrinsic::ID IID) const { 3571 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3572 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3573 3574 Register Dst = MI.getOperand(0).getReg(); 3575 Register VData = MI.getOperand(2).getReg(); 3576 3577 Register CmpVal; 
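  // The cmpswap variants carry the compare value as an extra operand right
  // after vdata, so the remaining operand indices are adjusted through
  // OpOffset, as they are for the optional struct vindex.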
3578 int OpOffset = 0; 3579 3580 if (IsCmpSwap) { 3581 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3582 ++OpOffset; 3583 } 3584 3585 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3586 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3587 3588 // The struct intrinsic variants add one additional operand over raw. 3589 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3590 Register VIndex; 3591 if (HasVIndex) { 3592 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3593 ++OpOffset; 3594 } 3595 3596 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3597 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3598 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3599 3600 MachineMemOperand *MMO = *MI.memoperands_begin(); 3601 3602 unsigned ImmOffset; 3603 unsigned TotalOffset; 3604 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3605 if (TotalOffset != 0) 3606 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3607 3608 if (!VIndex) 3609 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3610 3611 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3612 .addDef(Dst) 3613 .addUse(VData); // vdata 3614 3615 if (IsCmpSwap) 3616 MIB.addReg(CmpVal); 3617 3618 MIB.addUse(RSrc) // rsrc 3619 .addUse(VIndex) // vindex 3620 .addUse(VOffset) // voffset 3621 .addUse(SOffset) // soffset 3622 .addImm(ImmOffset) // offset(imm) 3623 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3624 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3625 .addMemOperand(MMO); 3626 3627 MI.eraseFromParent(); 3628 return true; 3629 } 3630 3631 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3632 /// vector with s16 typed elements. 3633 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3634 SmallVectorImpl<Register> &PackedAddrs, 3635 int AddrIdx, int DimIdx, int EndIdx, 3636 int NumGradients) { 3637 const LLT S16 = LLT::scalar(16); 3638 const LLT V2S16 = LLT::vector(2, 16); 3639 3640 for (int I = AddrIdx; I < EndIdx; ++I) { 3641 MachineOperand &SrcOp = MI.getOperand(I); 3642 if (!SrcOp.isReg()) 3643 continue; // _L to _LZ may have eliminated this. 3644 3645 Register AddrReg = SrcOp.getReg(); 3646 3647 if (I < DimIdx) { 3648 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3649 PackedAddrs.push_back(AddrReg); 3650 } else { 3651 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3652 // derivatives dx/dh and dx/dv are packed with undef. 3653 if (((I + 1) >= EndIdx) || 3654 ((NumGradients / 2) % 2 == 1 && 3655 (I == DimIdx + (NumGradients / 2) - 1 || 3656 I == DimIdx + NumGradients - 1)) || 3657 // Check for _L to _LZ optimization 3658 !MI.getOperand(I + 1).isReg()) { 3659 PackedAddrs.push_back( 3660 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3661 .getReg(0)); 3662 } else { 3663 PackedAddrs.push_back( 3664 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3665 .getReg(0)); 3666 ++I; 3667 } 3668 } 3669 } 3670 } 3671 3672 /// Convert from separate vaddr components to a single vector address register, 3673 /// and replace the remaining operands with $noreg. 
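/// For example, four s32 address operands become a single <4 x s32>
/// build_vector in the first vaddr slot; 5 to 7 operands are first padded with
/// undef up to the next power of two.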
3674 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3675 int DimIdx, int NumVAddrs) {
3676 const LLT S32 = LLT::scalar(32);
3677
3678 SmallVector<Register, 8> AddrRegs;
3679 for (int I = 0; I != NumVAddrs; ++I) {
3680 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3681 if (SrcOp.isReg()) {
3682 AddrRegs.push_back(SrcOp.getReg());
3683 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3684 }
3685 }
3686
3687 int NumAddrRegs = AddrRegs.size();
3688 if (NumAddrRegs != 1) {
3689 // Round up to 8 elements for v5-v7
3690 // FIXME: Missing intermediate sized register classes and instructions.
3691 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3692 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3693 auto Undef = B.buildUndef(S32);
3694 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3695 NumAddrRegs = RoundedNumRegs;
3696 }
3697
3698 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3699 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3700 }
3701
3702 for (int I = 1; I != NumVAddrs; ++I) {
3703 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3704 if (SrcOp.isReg())
3705 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3706 }
3707 }
3708
3709 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3710 ///
3711 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
3712 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3713 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3714 /// registers.
3715 ///
3716 /// We don't want to directly select image instructions just yet, but also want
3717 /// to expose all register repacking to the legalizer/combiners. We also don't
3718 /// want a selected instruction entering RegBankSelect. In order to avoid
3719 /// defining a multitude of intermediate image instructions, directly hack on
3720 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3721 /// now-unnecessary arguments with $noreg.
3722 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3723 MachineInstr &MI, MachineIRBuilder &B,
3724 GISelChangeObserver &Observer,
3725 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3726
3727 const int NumDefs = MI.getNumExplicitDefs();
3728 bool IsTFE = NumDefs == 2;
3729 // We are only processing the operands of d16 image operations on subtargets
3730 // that use the unpacked register layout, or need to repack the TFE result.
3731
3732 // TODO: Do we need to guard against already legalized intrinsics?
3733 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3734 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3735
3736 MachineRegisterInfo *MRI = B.getMRI();
3737 const LLT S32 = LLT::scalar(32);
3738 const LLT S16 = LLT::scalar(16);
3739 const LLT V2S16 = LLT::vector(2, 16);
3740
3741 // Index of first address argument
3742 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3743
3744 int NumVAddrs, NumGradients;
3745 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3746 const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3747 getDMaskIdx(BaseOpcode, NumDefs);
3748 unsigned DMask = 0;
3749
3750 // Check for 16-bit addresses and pack them if so.
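  // A16 means every address component is 16 bits and can be packed into v2s16
  // pairs; G16 means only the gradient components are. Both are inferred below
  // from the register types of the first gradient and first address operand.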
3751 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3752 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3753 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3754 const bool IsG16 = GradTy == S16;
3755 const bool IsA16 = AddrTy == S16;
3756
3757 int DMaskLanes = 0;
3758 if (!BaseOpcode->Atomic) {
3759 DMask = MI.getOperand(DMaskIdx).getImm();
3760 if (BaseOpcode->Gather4) {
3761 DMaskLanes = 4;
3762 } else if (DMask != 0) {
3763 DMaskLanes = countPopulation(DMask);
3764 } else if (!IsTFE && !BaseOpcode->Store) {
3765 // If dmask is 0, this is a no-op load. This can be eliminated.
3766 B.buildUndef(MI.getOperand(0));
3767 MI.eraseFromParent();
3768 return true;
3769 }
3770 }
3771
3772 Observer.changingInstr(MI);
3773 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3774
3775 unsigned NewOpcode = NumDefs == 0 ?
3776 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3777
3778 // Track that we legalized this
3779 MI.setDesc(B.getTII().get(NewOpcode));
3780
3781 // Expecting to get an error flag since TFE is on and dmask is 0. Force
3782 // dmask to be at least 1, otherwise the instruction will fail.
3783 if (IsTFE && DMask == 0) {
3784 DMask = 0x1;
3785 DMaskLanes = 1;
3786 MI.getOperand(DMaskIdx).setImm(DMask);
3787 }
3788
3789 if (BaseOpcode->Atomic) {
3790 Register VData0 = MI.getOperand(2).getReg();
3791 LLT Ty = MRI->getType(VData0);
3792
3793 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3794 if (Ty.isVector())
3795 return false;
3796
3797 if (BaseOpcode->AtomicX2) {
3798 Register VData1 = MI.getOperand(3).getReg();
3799 // The two values are packed in one register.
3800 LLT PackedTy = LLT::vector(2, Ty);
3801 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3802 MI.getOperand(2).setReg(Concat.getReg(0));
3803 MI.getOperand(3).setReg(AMDGPU::NoRegister);
3804 }
3805 }
3806
3807 int CorrectedNumVAddrs = NumVAddrs;
3808
3809 // Optimize _L to _LZ when _L is zero
3810 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3811 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3812 const ConstantFP *ConstantLod;
3813 const int LodIdx = AddrIdx + NumVAddrs - 1;
3814
3815 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3816 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3817 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3818 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3819 LZMappingInfo->LZ, ImageDimIntr->Dim);
3820
3821 // The starting indexes should remain in the same place.
3822 --NumVAddrs;
3823 --CorrectedNumVAddrs;
3824
3825 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3826 static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3827 MI.RemoveOperand(LodIdx);
3828 }
3829 }
3830 }
3831
3832 // Optimize _mip away when 'lod' is zero
3833 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3834 int64_t ConstantLod;
3835 const int LodIdx = AddrIdx + NumVAddrs - 1;
3836
3837 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3838 if (ConstantLod == 0) {
3839 // TODO: Change intrinsic opcode and remove operand instead of replacing
3840 // it with 0, as the _L to _LZ handling is done above.
3841 MI.getOperand(LodIdx).ChangeToImmediate(0);
3842 --CorrectedNumVAddrs;
3843 }
3844 }
3845 }
3846
3847 // Rewrite the addressing register layout before doing anything else.
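  // For example, a 2D sample with A16 turns the separate u and v s16 operands
  // into one <2 x s16> vaddr operand, and any operand slots freed up by the
  // packing are cleared to $noreg.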
3848 if (IsA16 || IsG16) { 3849 if (IsA16) { 3850 // Target must support the feature and gradients need to be 16 bit too 3851 if (!ST.hasA16() || !IsG16) 3852 return false; 3853 } else if (!ST.hasG16()) 3854 return false; 3855 3856 if (NumVAddrs > 1) { 3857 SmallVector<Register, 4> PackedRegs; 3858 // Don't compress addresses for G16 3859 const int PackEndIdx = 3860 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3861 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3862 PackEndIdx, NumGradients); 3863 3864 if (!IsA16) { 3865 // Add uncompressed address 3866 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3867 int AddrReg = MI.getOperand(I).getReg(); 3868 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3869 PackedRegs.push_back(AddrReg); 3870 } 3871 } 3872 3873 // See also below in the non-a16 branch 3874 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3875 3876 if (!UseNSA && PackedRegs.size() > 1) { 3877 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3878 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3879 PackedRegs[0] = Concat.getReg(0); 3880 PackedRegs.resize(1); 3881 } 3882 3883 const int NumPacked = PackedRegs.size(); 3884 for (int I = 0; I != NumVAddrs; ++I) { 3885 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3886 if (!SrcOp.isReg()) { 3887 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3888 continue; 3889 } 3890 3891 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3892 3893 if (I < NumPacked) 3894 SrcOp.setReg(PackedRegs[I]); 3895 else 3896 SrcOp.setReg(AMDGPU::NoRegister); 3897 } 3898 } 3899 } else { 3900 // If the register allocator cannot place the address registers contiguously 3901 // without introducing moves, then using the non-sequential address encoding 3902 // is always preferable, since it saves VALU instructions and is usually a 3903 // wash in terms of code size or even better. 3904 // 3905 // However, we currently have no way of hinting to the register allocator 3906 // that MIMG addresses should be placed contiguously when it is possible to 3907 // do so, so force non-NSA for the common 2-address case as a heuristic. 3908 // 3909 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3910 // allocation when possible. 3911 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3912 3913 if (!UseNSA && NumVAddrs > 1) 3914 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3915 } 3916 3917 int Flags = 0; 3918 if (IsA16) 3919 Flags |= 1; 3920 if (IsG16) 3921 Flags |= 2; 3922 MI.addOperand(MachineOperand::CreateImm(Flags)); 3923 3924 if (BaseOpcode->Store) { // No TFE for stores? 3925 // TODO: Handle dmask trim 3926 Register VData = MI.getOperand(1).getReg(); 3927 LLT Ty = MRI->getType(VData); 3928 if (!Ty.isVector() || Ty.getElementType() != S16) 3929 return true; 3930 3931 Register RepackedReg = handleD16VData(B, *MRI, VData); 3932 if (RepackedReg != VData) { 3933 MI.getOperand(1).setReg(RepackedReg); 3934 } 3935 3936 return true; 3937 } 3938 3939 Register DstReg = MI.getOperand(0).getReg(); 3940 LLT Ty = MRI->getType(DstReg); 3941 const LLT EltTy = Ty.getScalarType(); 3942 const bool IsD16 = Ty.getScalarType() == S16; 3943 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
3944
3945 // Confirm that the return type is large enough for the dmask specified
3946 if (NumElts < DMaskLanes)
3947 return false;
3948
3949 if (NumElts > 4 || DMaskLanes > 4)
3950 return false;
3951
3952 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3953 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3954
3955 // The raw dword aligned data component of the load. The only legal cases
3956 // where this matters should be when using the packed D16 format, for
3957 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3958 LLT RoundedTy;
3959
3960 // S32 vector to cover all data, plus the TFE result element.
3961 LLT TFETy;
3962
3963 // Register type to use for each loaded component. Will be S32 or V2S16.
3964 LLT RegTy;
3965
3966 if (IsD16 && ST.hasUnpackedD16VMem()) {
3967 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3968 TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3969 RegTy = S32;
3970 } else {
3971 unsigned EltSize = EltTy.getSizeInBits();
3972 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3973 unsigned RoundedSize = 32 * RoundedElts;
3974 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3975 TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3976 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3977 }
3978
3979 // The return type does not need adjustment.
3980 // TODO: Should we change s16 case to s32 or <2 x s16>?
3981 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3982 return true;
3983
3984 Register Dst1Reg;
3985
3986 // Insert after the instruction.
3987 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3988
3989 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3990 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3991 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3992 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3993
3994 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3995
3996 MI.getOperand(0).setReg(NewResultReg);
3997
3998 // In the IR, TFE is supposed to be used with a 2 element struct return
3999 // type. The instruction really returns these two values in one contiguous
4000 // register, with one additional dword beyond the loaded data. Rewrite the
4001 // return type to use a single register result.
4002
4003 if (IsTFE) {
4004 Dst1Reg = MI.getOperand(1).getReg();
4005 if (MRI->getType(Dst1Reg) != S32)
4006 return false;
4007
4008 // TODO: Make sure the TFE operand bit is set.
4009 MI.RemoveOperand(1);
4010
4011 // Handle the easy case that requires no repack instructions.
4012 if (Ty == S32) {
4013 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4014 return true;
4015 }
4016 }
4017
4018 // Now figure out how to copy the new result register back into the old
4019 // result.
4020 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4021
4022 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4023
4024 if (ResultNumRegs == 1) {
4025 assert(!IsTFE);
4026 ResultRegs[0] = NewResultReg;
4027 } else {
4028 // We have to repack into a new vector of some kind.
4029 for (int I = 0; I != NumDataRegs; ++I)
4030 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4031 B.buildUnmerge(ResultRegs, NewResultReg);
4032
4033 // Drop the final TFE element to get the data part. The TFE result is
4034 // directly written to the right place already.
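    // (ResultRegs was pre-filled with Dst1Reg, so with TFE the unmerge above
    // already wrote the trailing status dword straight into Dst1Reg.)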
4035 if (IsTFE) 4036 ResultRegs.resize(NumDataRegs); 4037 } 4038 4039 // For an s16 scalar result, we form an s32 result with a truncate regardless 4040 // of packed vs. unpacked. 4041 if (IsD16 && !Ty.isVector()) { 4042 B.buildTrunc(DstReg, ResultRegs[0]); 4043 return true; 4044 } 4045 4046 // Avoid a build/concat_vector of 1 entry. 4047 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4048 B.buildBitcast(DstReg, ResultRegs[0]); 4049 return true; 4050 } 4051 4052 assert(Ty.isVector()); 4053 4054 if (IsD16) { 4055 // For packed D16 results with TFE enabled, all the data components are 4056 // S32. Cast back to the expected type. 4057 // 4058 // TODO: We don't really need to use load s32 elements. We would only need one 4059 // cast for the TFE result if a multiple of v2s16 was used. 4060 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4061 for (Register &Reg : ResultRegs) 4062 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4063 } else if (ST.hasUnpackedD16VMem()) { 4064 for (Register &Reg : ResultRegs) 4065 Reg = B.buildTrunc(S16, Reg).getReg(0); 4066 } 4067 } 4068 4069 auto padWithUndef = [&](LLT Ty, int NumElts) { 4070 if (NumElts == 0) 4071 return; 4072 Register Undef = B.buildUndef(Ty).getReg(0); 4073 for (int I = 0; I != NumElts; ++I) 4074 ResultRegs.push_back(Undef); 4075 }; 4076 4077 // Pad out any elements eliminated due to the dmask. 4078 LLT ResTy = MRI->getType(ResultRegs[0]); 4079 if (!ResTy.isVector()) { 4080 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4081 B.buildBuildVector(DstReg, ResultRegs); 4082 return true; 4083 } 4084 4085 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4086 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4087 4088 // Deal with the one annoying legal case. 4089 const LLT V3S16 = LLT::vector(3, 16); 4090 if (Ty == V3S16) { 4091 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4092 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4093 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4094 return true; 4095 } 4096 4097 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4098 B.buildConcatVectors(DstReg, ResultRegs); 4099 return true; 4100 } 4101 4102 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4103 MachineInstr &MI, MachineIRBuilder &B, 4104 GISelChangeObserver &Observer) const { 4105 Register Dst = MI.getOperand(0).getReg(); 4106 LLT Ty = B.getMRI()->getType(Dst); 4107 unsigned Size = Ty.getSizeInBits(); 4108 MachineFunction &MF = B.getMF(); 4109 4110 Observer.changingInstr(MI); 4111 4112 // FIXME: We don't really need this intermediate instruction. The intrinsic 4113 // should be fixed to have a memory operand. Since it's readnone, we're not 4114 // allowed to add one. 4115 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4116 MI.RemoveOperand(1); // Remove intrinsic ID 4117 4118 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4119 // TODO: Should this use datalayout alignment? 4120 const unsigned MemSize = (Size + 7) / 8; 4121 const Align MemAlign(4); 4122 MachineMemOperand *MMO = MF.getMachineMemOperand( 4123 MachinePointerInfo(), 4124 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4125 MachineMemOperand::MOInvariant, 4126 MemSize, MemAlign); 4127 MI.addMemOperand(MF, MMO); 4128 4129 // There are no 96-bit result scalar loads, but widening to 128-bit should 4130 // always be legal. 
We may need to restore this to a 96-bit result if it turns 4131 // out this needs to be converted to a vector load during RegBankSelect. 4132 if (!isPowerOf2_32(Size)) { 4133 LegalizerHelper Helper(MF, *this, Observer, B); 4134 4135 if (Ty.isVector()) 4136 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4137 else 4138 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4139 } 4140 4141 Observer.changedInstr(MI); 4142 return true; 4143 } 4144 4145 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4146 MachineRegisterInfo &MRI, 4147 MachineIRBuilder &B) const { 4148 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4149 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4150 !ST.isTrapHandlerEnabled()) { 4151 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4152 } else { 4153 // Pass queue pointer to trap handler as input, and insert trap instruction 4154 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4155 MachineRegisterInfo &MRI = *B.getMRI(); 4156 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4157 Register LiveIn = getLiveInRegister( 4158 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4159 /*InsertLiveInCopy=*/false); 4160 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4161 return false; 4162 B.buildCopy(SGPR01, LiveIn); 4163 B.buildInstr(AMDGPU::S_TRAP) 4164 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4165 .addReg(SGPR01, RegState::Implicit); 4166 } 4167 4168 MI.eraseFromParent(); 4169 return true; 4170 } 4171 4172 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4173 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4174 // Is non-HSA path or trap-handler disabled? then, report a warning 4175 // accordingly 4176 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4177 !ST.isTrapHandlerEnabled()) { 4178 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4179 "debugtrap handler not supported", 4180 MI.getDebugLoc(), DS_Warning); 4181 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4182 Ctx.diagnose(NoTrap); 4183 } else { 4184 // Insert debug-trap instruction 4185 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4186 } 4187 4188 MI.eraseFromParent(); 4189 return true; 4190 } 4191 4192 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4193 MachineInstr &MI) const { 4194 MachineIRBuilder &B = Helper.MIRBuilder; 4195 MachineRegisterInfo &MRI = *B.getMRI(); 4196 4197 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
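  // This applies to the amdgcn.if / amdgcn.else / amdgcn.loop cases below:
  // verifyCFIntrinsic locates the G_BRCOND fed by the intrinsic's boolean
  // result so it can be rewritten into SI_IF / SI_ELSE / SI_LOOP with explicit
  // branch targets.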
4198 auto IntrID = MI.getIntrinsicID(); 4199 switch (IntrID) { 4200 case Intrinsic::amdgcn_if: 4201 case Intrinsic::amdgcn_else: { 4202 MachineInstr *Br = nullptr; 4203 MachineBasicBlock *UncondBrTarget = nullptr; 4204 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4205 const SIRegisterInfo *TRI 4206 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4207 4208 Register Def = MI.getOperand(1).getReg(); 4209 Register Use = MI.getOperand(3).getReg(); 4210 4211 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4212 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4213 if (IntrID == Intrinsic::amdgcn_if) { 4214 B.buildInstr(AMDGPU::SI_IF) 4215 .addDef(Def) 4216 .addUse(Use) 4217 .addMBB(UncondBrTarget); 4218 } else { 4219 B.buildInstr(AMDGPU::SI_ELSE) 4220 .addDef(Def) 4221 .addUse(Use) 4222 .addMBB(UncondBrTarget) 4223 .addImm(0); 4224 } 4225 4226 if (Br) { 4227 Br->getOperand(0).setMBB(CondBrTarget); 4228 } else { 4229 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4230 // since we're swapping branch targets it needs to be reinserted. 4231 // FIXME: IRTranslator should probably not do this 4232 B.buildBr(*CondBrTarget); 4233 } 4234 4235 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4236 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4237 MI.eraseFromParent(); 4238 BrCond->eraseFromParent(); 4239 return true; 4240 } 4241 4242 return false; 4243 } 4244 case Intrinsic::amdgcn_loop: { 4245 MachineInstr *Br = nullptr; 4246 MachineBasicBlock *UncondBrTarget = nullptr; 4247 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4248 const SIRegisterInfo *TRI 4249 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4250 4251 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4252 Register Reg = MI.getOperand(2).getReg(); 4253 4254 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4255 B.buildInstr(AMDGPU::SI_LOOP) 4256 .addUse(Reg) 4257 .addMBB(UncondBrTarget); 4258 4259 if (Br) 4260 Br->getOperand(0).setMBB(CondBrTarget); 4261 else 4262 B.buildBr(*CondBrTarget); 4263 4264 MI.eraseFromParent(); 4265 BrCond->eraseFromParent(); 4266 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4267 return true; 4268 } 4269 4270 return false; 4271 } 4272 case Intrinsic::amdgcn_kernarg_segment_ptr: 4273 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4274 // This only makes sense to call in a kernel, so just lower to null. 
4275 B.buildConstant(MI.getOperand(0).getReg(), 0); 4276 MI.eraseFromParent(); 4277 return true; 4278 } 4279 4280 return legalizePreloadedArgIntrin( 4281 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4282 case Intrinsic::amdgcn_implicitarg_ptr: 4283 return legalizeImplicitArgPtr(MI, MRI, B); 4284 case Intrinsic::amdgcn_workitem_id_x: 4285 return legalizePreloadedArgIntrin(MI, MRI, B, 4286 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4287 case Intrinsic::amdgcn_workitem_id_y: 4288 return legalizePreloadedArgIntrin(MI, MRI, B, 4289 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4290 case Intrinsic::amdgcn_workitem_id_z: 4291 return legalizePreloadedArgIntrin(MI, MRI, B, 4292 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4293 case Intrinsic::amdgcn_workgroup_id_x: 4294 return legalizePreloadedArgIntrin(MI, MRI, B, 4295 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4296 case Intrinsic::amdgcn_workgroup_id_y: 4297 return legalizePreloadedArgIntrin(MI, MRI, B, 4298 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4299 case Intrinsic::amdgcn_workgroup_id_z: 4300 return legalizePreloadedArgIntrin(MI, MRI, B, 4301 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4302 case Intrinsic::amdgcn_dispatch_ptr: 4303 return legalizePreloadedArgIntrin(MI, MRI, B, 4304 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4305 case Intrinsic::amdgcn_queue_ptr: 4306 return legalizePreloadedArgIntrin(MI, MRI, B, 4307 AMDGPUFunctionArgInfo::QUEUE_PTR); 4308 case Intrinsic::amdgcn_implicit_buffer_ptr: 4309 return legalizePreloadedArgIntrin( 4310 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4311 case Intrinsic::amdgcn_dispatch_id: 4312 return legalizePreloadedArgIntrin(MI, MRI, B, 4313 AMDGPUFunctionArgInfo::DISPATCH_ID); 4314 case Intrinsic::amdgcn_fdiv_fast: 4315 return legalizeFDIVFastIntrin(MI, MRI, B); 4316 case Intrinsic::amdgcn_is_shared: 4317 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4318 case Intrinsic::amdgcn_is_private: 4319 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4320 case Intrinsic::amdgcn_wavefrontsize: { 4321 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4322 MI.eraseFromParent(); 4323 return true; 4324 } 4325 case Intrinsic::amdgcn_s_buffer_load: 4326 return legalizeSBufferLoad(MI, B, Helper.Observer); 4327 case Intrinsic::amdgcn_raw_buffer_store: 4328 case Intrinsic::amdgcn_struct_buffer_store: 4329 return legalizeBufferStore(MI, MRI, B, false, false); 4330 case Intrinsic::amdgcn_raw_buffer_store_format: 4331 case Intrinsic::amdgcn_struct_buffer_store_format: 4332 return legalizeBufferStore(MI, MRI, B, false, true); 4333 case Intrinsic::amdgcn_raw_tbuffer_store: 4334 case Intrinsic::amdgcn_struct_tbuffer_store: 4335 return legalizeBufferStore(MI, MRI, B, true, true); 4336 case Intrinsic::amdgcn_raw_buffer_load: 4337 case Intrinsic::amdgcn_struct_buffer_load: 4338 return legalizeBufferLoad(MI, MRI, B, false, false); 4339 case Intrinsic::amdgcn_raw_buffer_load_format: 4340 case Intrinsic::amdgcn_struct_buffer_load_format: 4341 return legalizeBufferLoad(MI, MRI, B, true, false); 4342 case Intrinsic::amdgcn_raw_tbuffer_load: 4343 case Intrinsic::amdgcn_struct_tbuffer_load: 4344 return legalizeBufferLoad(MI, MRI, B, true, true); 4345 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4346 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4347 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4348 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4349 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4350 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4351 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4352 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4353 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4354 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4355 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4356 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4357 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4358 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4359 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4360 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4361 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4362 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4363 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4364 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4365 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4366 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4367 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4368 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4369 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4370 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4371 return legalizeBufferAtomic(MI, B, IntrID); 4372 case Intrinsic::amdgcn_atomic_inc: 4373 return legalizeAtomicIncDec(MI, B, true); 4374 case Intrinsic::amdgcn_atomic_dec: 4375 return legalizeAtomicIncDec(MI, B, false); 4376 case Intrinsic::trap: 4377 return legalizeTrapIntrinsic(MI, MRI, B); 4378 case Intrinsic::debugtrap: 4379 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4380 default: { 4381 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4382 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4383 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4384 return true; 4385 } 4386 } 4387 4388 return true; 4389 } 4390