//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size < 32) {
      // <2 x s8> -> s16
      assert(Size == 16);
      CoercedTy = LLT::scalar(16);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= 1024;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if
    // they may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
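    // (RegBankSelect splits these accesses when the pointer turns out to be
    // divergent and a scalar load cannot be used.)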
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
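  // Only the base-2 forms map to hardware instructions (v_exp_*/v_log_*);
  // the natural-base and base-10 variants below are expanded in custom
  // lowering as a base-2 operation scaled by a constant.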
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        LLT Ty = Query.Types[0];
        return Ty.isVector() &&
               isRegisterSize(Ty.getSizeInBits()) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
      .customIf(typeIs(1, Constant32Ptr))
      // Widen suitably aligned loads by loading extra elements.
      .moreElementsIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, moreElementsToNextPow2(0))
      .widenScalarIf([=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[0];
          return Op == G_LOAD && !Ty.isVector() &&
                 shouldWidenLoadResult(Query, Op);
        }, widenScalarOrEltToNextPow2(0))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
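            // (e.g. an s96 store is first narrowed to s64 here and the
            // remaining s32 piece is handled on a later iteration.)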
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(0, LLT::scalar(FloorSize));
          }

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                 PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // FIXME: Handle widened to power of 2 results better. This ends
          // up scalarizing.
          // FIXME: 3 element stores scalarized on SI

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // FIXME: We could probably handle weird extending loads better.
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (DstTy.getSizeInBits() > MemSize)
            return std::make_pair(0, EltTy);

          unsigned EltSize = EltTy.getSizeInBits();
          unsigned DstSize = DstTy.getSizeInBits();
          if (!isPowerOf2_32(DstSize)) {
            // We're probably decomposing an odd sized store. Try to split
            // to the widest type. TODO: Account for alignment. As-is it
            // should be OK, since the new parts will be further legalized.
            unsigned FloorSize = PowerOf2Floor(DstSize);
            return std::make_pair(
              0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
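    // Catch-all cleanup: round leftover scalars up to a power of two and pad
    // sub-dword vectors out to a full 32-bit register.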
    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
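        // In other words, only keep an s16 amount when the shifted value
        // itself is at most 16 bits wide.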
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it
    // hasn't been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
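      // Accept any extract/insert where the wide type is a multiple of 32
      // bits and the narrow type a multiple of 16 bits; other sizes are
      // widened by the mutations below.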
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
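    // Casting to the 32-bit constant address space keeps only the low 32 bits
    // of the source pointer.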
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
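  // Adding and then subtracting 2^52 (with the sign of the input) pushes the
  // fractional bits out of the mantissa, so Tmp2 below is Src rounded to an
  // integer in the current rounding mode; the final select keeps Src itself
  // once |Src| is already too large to have a fraction.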
1684 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1685 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1686 1687 auto C2 = B.buildFConstant(Ty, C2Val); 1688 auto Fabs = B.buildFAbs(Ty, Src); 1689 1690 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1691 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); MI.eraseFromParent(); 1692 return true; 1693 } 1694 1695 bool AMDGPULegalizerInfo::legalizeFceil( 1696 MachineInstr &MI, MachineRegisterInfo &MRI, 1697 MachineIRBuilder &B) const { 1698 B.setInstr(MI); 1699 1700 const LLT S1 = LLT::scalar(1); 1701 const LLT S64 = LLT::scalar(64); 1702 1703 Register Src = MI.getOperand(1).getReg(); 1704 assert(MRI.getType(Src) == S64); 1705 1706 // result = trunc(src) 1707 // if (src > 0.0 && src != result) 1708 // result += 1.0 1709 1710 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1711 1712 const auto Zero = B.buildFConstant(S64, 0.0); 1713 const auto One = B.buildFConstant(S64, 1.0); 1714 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1715 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1716 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1717 auto Add = B.buildSelect(S64, And, One, Zero); 1718 1719 // TODO: Should this propagate fast-math-flags? 1720 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); MI.eraseFromParent(); 1721 return true; 1722 } 1723 1724 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1725 MachineIRBuilder &B) { 1726 const unsigned FractBits = 52; 1727 const unsigned ExpBits = 11; 1728 LLT S32 = LLT::scalar(32); 1729 1730 auto Const0 = B.buildConstant(S32, FractBits - 32); 1731 auto Const1 = B.buildConstant(S32, ExpBits); 1732 1733 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) .addUse(Register(Hi)) 1734 .addUse(Const0.getReg(0)) 1735 .addUse(Const1.getReg(0)); 1736 1737 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1738 } 1739 1740 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1741 MachineInstr &MI, MachineRegisterInfo &MRI, 1742 MachineIRBuilder &B) const { 1743 B.setInstr(MI); 1744 1745 const LLT S1 = LLT::scalar(1); 1746 const LLT S32 = LLT::scalar(32); 1747 const LLT S64 = LLT::scalar(64); 1748 1749 Register Src = MI.getOperand(1).getReg(); 1750 assert(MRI.getType(Src) == S64); 1751 1752 // TODO: Should this use extract since the low half is unused? 1753 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1754 Register Hi = Unmerge.getReg(1); 1755 1756 // Extract the upper half, since this is where we will find the sign and 1757 // exponent. 1758 auto Exp = extractF64Exponent(Hi, B); 1759 1760 const unsigned FractBits = 52; 1761 1762 // Extract the sign bit. 1763 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1764 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1765 1766 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1767 1768 const auto Zero32 = B.buildConstant(S32, 0); 1769 1770 // Extend back to 64-bits.
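// The remaining steps work directly on the f64 bit pattern (1 sign bit,
// 11 exponent bits, 52 fraction bits): for an unbiased exponent E in [0, 51],
// clearing the low 52 - E fraction bits truncates the value toward zero; if
// E < 0 the magnitude is below 1.0 and only the sign bit survives (+/-0.0);
// if E > 51 the value is already integral and the final select passes the
// source through unchanged.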
1771 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1772 1773 auto Shr = B.buildAShr(S64, FractMask, Exp); 1774 auto Not = B.buildNot(S64, Shr); 1775 auto Tmp0 = B.buildAnd(S64, Src, Not); 1776 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1777 1778 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1779 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1780 1781 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1782 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); MI.eraseFromParent(); 1783 return true; 1784 } 1785 1786 bool AMDGPULegalizerInfo::legalizeITOFP( 1787 MachineInstr &MI, MachineRegisterInfo &MRI, 1788 MachineIRBuilder &B, bool Signed) const { 1789 B.setInstr(MI); 1790 1791 Register Dst = MI.getOperand(0).getReg(); 1792 Register Src = MI.getOperand(1).getReg(); 1793 1794 const LLT S64 = LLT::scalar(64); 1795 const LLT S32 = LLT::scalar(32); 1796 1797 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1798 1799 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1800 1801 auto CvtHi = Signed ? 1802 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1803 B.buildUITOFP(S64, Unmerge.getReg(1)); 1804 1805 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1806 1807 auto ThirtyTwo = B.buildConstant(S32, 32); 1808 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1809 .addUse(CvtHi.getReg(0)) 1810 .addUse(ThirtyTwo.getReg(0)); 1811 1812 // TODO: Should this propagate fast-math-flags? 1813 B.buildFAdd(Dst, LdExp, CvtLo); 1814 MI.eraseFromParent(); 1815 return true; 1816 } 1817 1818 // TODO: Copied from DAG implementation. Verify logic and document how this 1819 // actually works. 1820 bool AMDGPULegalizerInfo::legalizeFPTOI( 1821 MachineInstr &MI, MachineRegisterInfo &MRI, 1822 MachineIRBuilder &B, bool Signed) const { 1823 B.setInstr(MI); 1824 1825 Register Dst = MI.getOperand(0).getReg(); 1826 Register Src = MI.getOperand(1).getReg(); 1827 1828 const LLT S64 = LLT::scalar(64); 1829 const LLT S32 = LLT::scalar(32); 1830 1831 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1832 1833 unsigned Flags = MI.getFlags(); 1834 1835 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1836 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1837 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1838 1839 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1840 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1841 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1842 1843 auto Hi = Signed ?
1844 B.buildFPTOSI(S32, FloorMul) : 1845 B.buildFPTOUI(S32, FloorMul); 1846 auto Lo = B.buildFPTOUI(S32, Fma); 1847 1848 B.buildMerge(Dst, { Lo, Hi }); 1849 MI.eraseFromParent(); 1850 1851 return true; 1852 } 1853 1854 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1855 MachineInstr &MI, MachineRegisterInfo &MRI, 1856 MachineIRBuilder &B) const { 1857 MachineFunction &MF = B.getMF(); 1858 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1859 1860 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1861 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1862 1863 // With ieee_mode disabled, the instructions have the correct behavior 1864 // already for G_FMINNUM/G_FMAXNUM 1865 if (!MFI->getMode().IEEE) 1866 return !IsIEEEOp; 1867 1868 if (IsIEEEOp) 1869 return true; 1870 1871 MachineIRBuilder HelperBuilder(MI); 1872 GISelObserverWrapper DummyObserver; 1873 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1874 HelperBuilder.setInstr(MI); 1875 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1876 } 1877 1878 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1879 MachineInstr &MI, MachineRegisterInfo &MRI, 1880 MachineIRBuilder &B) const { 1881 // TODO: Should move some of this into LegalizerHelper. 1882 1883 // TODO: Promote dynamic indexing of s16 to s32 1884 1885 // FIXME: Artifact combiner probably should have replaced the truncated 1886 // constant before this, so we shouldn't need 1887 // getConstantVRegValWithLookThrough. 1888 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1889 MI.getOperand(2).getReg(), MRI); 1890 if (!IdxVal) // Dynamic case will be selected to register indexing. 1891 return true; 1892 1893 Register Dst = MI.getOperand(0).getReg(); 1894 Register Vec = MI.getOperand(1).getReg(); 1895 1896 LLT VecTy = MRI.getType(Vec); 1897 LLT EltTy = VecTy.getElementType(); 1898 assert(EltTy == MRI.getType(Dst)); 1899 1900 B.setInstr(MI); 1901 1902 if (IdxVal->Value < VecTy.getNumElements()) 1903 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1904 else 1905 B.buildUndef(Dst); 1906 1907 MI.eraseFromParent(); 1908 return true; 1909 } 1910 1911 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1912 MachineInstr &MI, MachineRegisterInfo &MRI, 1913 MachineIRBuilder &B) const { 1914 // TODO: Should move some of this into LegalizerHelper. 1915 1916 // TODO: Promote dynamic indexing of s16 to s32 1917 1918 // FIXME: Artifact combiner probably should have replaced the truncated 1919 // constant before this, so we shouldn't need 1920 // getConstantVRegValWithLookThrough. 1921 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1922 MI.getOperand(3).getReg(), MRI); 1923 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1924 return true; 1925 1926 Register Dst = MI.getOperand(0).getReg(); 1927 Register Vec = MI.getOperand(1).getReg(); 1928 Register Ins = MI.getOperand(2).getReg(); 1929 1930 LLT VecTy = MRI.getType(Vec); 1931 LLT EltTy = VecTy.getElementType(); 1932 assert(EltTy == MRI.getType(Ins)); 1933 1934 B.setInstr(MI); 1935 1936 if (IdxVal->Value < VecTy.getNumElements()) 1937 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1938 else 1939 B.buildUndef(Dst); 1940 1941 MI.eraseFromParent(); 1942 return true; 1943 } 1944 1945 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1946 MachineInstr &MI, MachineRegisterInfo &MRI, 1947 MachineIRBuilder &B) const { 1948 const LLT V2S16 = LLT::vector(2, 16); 1949 1950 Register Dst = MI.getOperand(0).getReg(); 1951 Register Src0 = MI.getOperand(1).getReg(); 1952 LLT DstTy = MRI.getType(Dst); 1953 LLT SrcTy = MRI.getType(Src0); 1954 1955 if (SrcTy == V2S16 && DstTy == V2S16 && 1956 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1957 return true; 1958 1959 MachineIRBuilder HelperBuilder(MI); 1960 GISelObserverWrapper DummyObserver; 1961 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1962 HelperBuilder.setInstr(MI); 1963 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1964 } 1965 1966 bool AMDGPULegalizerInfo::legalizeSinCos( 1967 MachineInstr &MI, MachineRegisterInfo &MRI, 1968 MachineIRBuilder &B) const { 1969 B.setInstr(MI); 1970 1971 Register DstReg = MI.getOperand(0).getReg(); 1972 Register SrcReg = MI.getOperand(1).getReg(); 1973 LLT Ty = MRI.getType(DstReg); 1974 unsigned Flags = MI.getFlags(); 1975 1976 Register TrigVal; 1977 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1978 if (ST.hasTrigReducedRange()) { 1979 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1980 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1981 .addUse(MulVal.getReg(0)) 1982 .setMIFlags(Flags).getReg(0); 1983 } else 1984 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1985 1986 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1987 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1988 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1989 .addUse(TrigVal) 1990 .setMIFlags(Flags); 1991 MI.eraseFromParent(); 1992 return true; 1993 } 1994 1995 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1996 Register DstReg, LLT PtrTy, 1997 MachineIRBuilder &B, const GlobalValue *GV, 1998 unsigned Offset, unsigned GAFlags) const { 1999 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2000 // to the following code sequence: 2001 // 2002 // For constant address space: 2003 // s_getpc_b64 s[0:1] 2004 // s_add_u32 s0, s0, $symbol 2005 // s_addc_u32 s1, s1, 0 2006 // 2007 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2008 // a fixup or relocation is emitted to replace $symbol with a literal 2009 // constant, which is a pc-relative offset from the encoding of the $symbol 2010 // operand to the global variable. 
2011 // 2012 // For global address space: 2013 // s_getpc_b64 s[0:1] 2014 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2015 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2016 // 2017 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2018 // fixups or relocations are emitted to replace $symbol@*@lo and 2019 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2020 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2021 // operand to the global variable. 2022 // 2023 // What we want here is an offset from the value returned by s_getpc 2024 // (which is the address of the s_add_u32 instruction) to the global 2025 // variable, but since the encoding of $symbol starts 4 bytes after the start 2026 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2027 // small. This requires us to add 4 to the global variable offset in order to 2028 // compute the correct address. 2029 2030 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2031 2032 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2033 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2034 2035 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2036 .addDef(PCReg); 2037 2038 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2039 if (GAFlags == SIInstrInfo::MO_NONE) 2040 MIB.addImm(0); 2041 else 2042 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2043 2044 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2045 2046 if (PtrTy.getSizeInBits() == 32) 2047 B.buildExtract(DstReg, PCReg, 0); 2048 return true; 2049 } 2050 2051 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2052 MachineInstr &MI, MachineRegisterInfo &MRI, 2053 MachineIRBuilder &B) const { 2054 Register DstReg = MI.getOperand(0).getReg(); 2055 LLT Ty = MRI.getType(DstReg); 2056 unsigned AS = Ty.getAddressSpace(); 2057 2058 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2059 MachineFunction &MF = B.getMF(); 2060 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2061 B.setInstr(MI); 2062 2063 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2064 if (!MFI->isEntryFunction()) { 2065 const Function &Fn = MF.getFunction(); 2066 DiagnosticInfoUnsupported BadLDSDecl( 2067 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2068 DS_Warning); 2069 Fn.getContext().diagnose(BadLDSDecl); 2070 2071 // We currently don't have a way to correctly allocate LDS objects that 2072 // aren't directly associated with a kernel. We do force inlining of 2073 // functions that use local objects. However, if these dead functions are 2074 // not eliminated, we don't want a compile time error. Just emit a warning 2075 // and a trap, since there should be no callable path here. 2076 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2077 B.buildUndef(DstReg); 2078 MI.eraseFromParent(); 2079 return true; 2080 } 2081 2082 // TODO: We could emit code to handle the initialization somewhere. 
2083 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2084 const SITargetLowering *TLI = ST.getTargetLowering(); 2085 if (!TLI->shouldUseLDSConstAddress(GV)) { 2086 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2087 return true; // Leave in place; 2088 } 2089 2090 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 2091 MI.eraseFromParent(); 2092 return true; 2093 } 2094 2095 const Function &Fn = MF.getFunction(); 2096 DiagnosticInfoUnsupported BadInit( 2097 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2098 Fn.getContext().diagnose(BadInit); 2099 return true; 2100 } 2101 2102 const SITargetLowering *TLI = ST.getTargetLowering(); 2103 2104 if (TLI->shouldEmitFixup(GV)) { 2105 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2106 MI.eraseFromParent(); 2107 return true; 2108 } 2109 2110 if (TLI->shouldEmitPCReloc(GV)) { 2111 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2112 MI.eraseFromParent(); 2113 return true; 2114 } 2115 2116 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2117 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2118 2119 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2120 MachinePointerInfo::getGOT(MF), 2121 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2122 MachineMemOperand::MOInvariant, 2123 8 /*Size*/, Align(8)); 2124 2125 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2126 2127 if (Ty.getSizeInBits() == 32) { 2128 // Truncate if this is a 32-bit constant address. 2129 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2130 B.buildExtract(DstReg, Load, 0); 2131 } else 2132 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2133 2134 MI.eraseFromParent(); 2135 return true; 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeLoad( 2139 MachineInstr &MI, MachineRegisterInfo &MRI, 2140 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2141 B.setInstr(MI); 2142 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2143 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2144 Observer.changingInstr(MI); 2145 MI.getOperand(1).setReg(Cast.getReg(0)); 2146 Observer.changedInstr(MI); 2147 return true; 2148 } 2149 2150 bool AMDGPULegalizerInfo::legalizeFMad( 2151 MachineInstr &MI, MachineRegisterInfo &MRI, 2152 MachineIRBuilder &B) const { 2153 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2154 assert(Ty.isScalar()); 2155 2156 MachineFunction &MF = B.getMF(); 2157 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2158 2159 // TODO: Always legal with future ftz flag. 2160 // FIXME: Do we need just output?
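// G_FMAD is selected to the mad/mac instructions, which do not respect
// denormals, so it is only kept legal here when the mode already flushes
// denormals for the given type; otherwise it is expanded into separate
// multiply and add operations by lowerFMad below.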
2161 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2162 return true; 2163 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2164 return true; 2165 2166 MachineIRBuilder HelperBuilder(MI); 2167 GISelObserverWrapper DummyObserver; 2168 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2169 HelperBuilder.setInstr(MI); 2170 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2171 } 2172 2173 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2174 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2175 Register DstReg = MI.getOperand(0).getReg(); 2176 Register PtrReg = MI.getOperand(1).getReg(); 2177 Register CmpVal = MI.getOperand(2).getReg(); 2178 Register NewVal = MI.getOperand(3).getReg(); 2179 2180 assert(SITargetLowering::isFlatGlobalAddrSpace( 2181 MRI.getType(PtrReg).getAddressSpace()) && 2182 "this should not have been custom lowered"); 2183 2184 LLT ValTy = MRI.getType(CmpVal); 2185 LLT VecTy = LLT::vector(2, ValTy); 2186 2187 B.setInstr(MI); 2188 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2189 2190 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2191 .addDef(DstReg) 2192 .addUse(PtrReg) 2193 .addUse(PackedVal) 2194 .setMemRefs(MI.memoperands()); 2195 2196 MI.eraseFromParent(); 2197 return true; 2198 } 2199 2200 bool AMDGPULegalizerInfo::legalizeFlog( 2201 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2202 Register Dst = MI.getOperand(0).getReg(); 2203 Register Src = MI.getOperand(1).getReg(); 2204 LLT Ty = B.getMRI()->getType(Dst); 2205 unsigned Flags = MI.getFlags(); 2206 B.setInstr(MI); 2207 2208 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2209 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2210 2211 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2212 MI.eraseFromParent(); 2213 return true; 2214 } 2215 2216 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2217 MachineIRBuilder &B) const { 2218 Register Dst = MI.getOperand(0).getReg(); 2219 Register Src = MI.getOperand(1).getReg(); 2220 unsigned Flags = MI.getFlags(); 2221 LLT Ty = B.getMRI()->getType(Dst); 2222 B.setInstr(MI); 2223 2224 auto K = B.buildFConstant(Ty, numbers::log2e); 2225 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2226 B.buildFExp2(Dst, Mul, Flags); 2227 MI.eraseFromParent(); 2228 return true; 2229 } 2230 2231 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2232 MachineIRBuilder &B) const { 2233 Register Dst = MI.getOperand(0).getReg(); 2234 Register Src0 = MI.getOperand(1).getReg(); 2235 Register Src1 = MI.getOperand(2).getReg(); 2236 unsigned Flags = MI.getFlags(); 2237 LLT Ty = B.getMRI()->getType(Dst); 2238 B.setInstr(MI); 2239 const LLT S16 = LLT::scalar(16); 2240 const LLT S32 = LLT::scalar(32); 2241 2242 if (Ty == S32) { 2243 auto Log = B.buildFLog2(S32, Src0, Flags); 2244 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2245 .addUse(Log.getReg(0)) 2246 .addUse(Src1) 2247 .setMIFlags(Flags); 2248 B.buildFExp2(Dst, Mul, Flags); 2249 } else if (Ty == S16) { 2250 // There's no f16 fmul_legacy, so we need to convert for it. 
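// As in the f32 case above, pow(x, y) is expanded as exp2(y * log2(x)).
// The multiply uses fmul_legacy, where 0 * x == 0 for any x, which means
// y == 0 still yields exp2(0) == 1 even when log2(x) is infinite.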
2251 auto Log = B.buildFLog2(S16, Src0, Flags); 2252 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2253 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2254 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2255 .addUse(Ext0.getReg(0)) 2256 .addUse(Ext1.getReg(0)) 2257 .setMIFlags(Flags); 2258 2259 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2260 } else 2261 return false; 2262 2263 MI.eraseFromParent(); 2264 return true; 2265 } 2266 2267 // Find a source register, ignoring any possible source modifiers. 2268 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2269 Register ModSrc = OrigSrc; 2270 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2271 ModSrc = SrcFNeg->getOperand(1).getReg(); 2272 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2273 ModSrc = SrcFAbs->getOperand(1).getReg(); 2274 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2275 ModSrc = SrcFAbs->getOperand(1).getReg(); 2276 return ModSrc; 2277 } 2278 2279 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2280 MachineRegisterInfo &MRI, 2281 MachineIRBuilder &B) const { 2282 B.setInstr(MI); 2283 2284 const LLT S1 = LLT::scalar(1); 2285 const LLT S64 = LLT::scalar(64); 2286 Register Dst = MI.getOperand(0).getReg(); 2287 Register OrigSrc = MI.getOperand(1).getReg(); 2288 unsigned Flags = MI.getFlags(); 2289 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2290 "this should not have been custom lowered"); 2291 2292 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2293 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2294 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2295 // V_FRACT bug is: 2296 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2297 // 2298 // Convert floor(x) to (x - fract(x)) 2299 2300 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2301 .addUse(OrigSrc) 2302 .setMIFlags(Flags); 2303 2304 // Give source modifier matching some assistance before obscuring a foldable 2305 // pattern. 2306 2307 // TODO: We can avoid the neg on the fract? The input sign to fract 2308 // shouldn't matter? 2309 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2310 2311 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2312 2313 Register Min = MRI.createGenericVirtualRegister(S64); 2314 2315 // We don't need to concern ourselves with the snan handling difference, so 2316 // use the one which will directly select. 2317 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2318 if (MFI->getMode().IEEE) 2319 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2320 else 2321 B.buildFMinNum(Min, Fract, Const, Flags); 2322 2323 Register CorrectedFract = Min; 2324 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2325 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2326 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2327 } 2328 2329 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2330 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2331 2332 MI.eraseFromParent(); 2333 return true; 2334 } 2335 2336 // Turn an illegal packed v2s16 build vector into bit operations. 2337 // TODO: This should probably be a bitcast action in LegalizerHelper. 
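// For example (illustrative register names):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)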
2338 bool AMDGPULegalizerInfo::legalizeBuildVector( 2339 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2340 Register Dst = MI.getOperand(0).getReg(); 2341 const LLT S32 = LLT::scalar(32); 2342 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2343 2344 Register Src0 = MI.getOperand(1).getReg(); 2345 Register Src1 = MI.getOperand(2).getReg(); 2346 assert(MRI.getType(Src0) == LLT::scalar(16)); 2347 2348 B.setInstr(MI); 2349 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2350 B.buildBitcast(Dst, Merge); 2351 2352 MI.eraseFromParent(); 2353 return true; 2354 } 2355 2356 // Return the use branch instruction, otherwise null if the usage is invalid. 2357 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2358 MachineRegisterInfo &MRI, 2359 MachineInstr *&Br, 2360 MachineBasicBlock *&UncondBrTarget) { 2361 Register CondDef = MI.getOperand(0).getReg(); 2362 if (!MRI.hasOneNonDBGUse(CondDef)) 2363 return nullptr; 2364 2365 MachineBasicBlock *Parent = MI.getParent(); 2366 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2367 if (UseMI.getParent() != Parent || 2368 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2369 return nullptr; 2370 2371 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2372 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2373 if (Next == Parent->end()) { 2374 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2375 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2376 return nullptr; 2377 UncondBrTarget = &*NextMBB; 2378 } else { 2379 if (Next->getOpcode() != AMDGPU::G_BR) 2380 return nullptr; 2381 Br = &*Next; 2382 UncondBrTarget = Br->getOperand(0).getMBB(); 2383 } 2384 2385 return &UseMI; 2386 } 2387 2388 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2389 MachineRegisterInfo &MRI, 2390 Register LiveIn, 2391 Register PhyReg) const { 2392 assert(PhyReg.isPhysical() && "Physical register expected"); 2393 2394 // Insert the live-in copy, if required, by defining destination virtual 2395 // register. 2396 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2397 if (!MRI.getVRegDef(LiveIn)) { 2398 // FIXME: Should have scoped insert pt 2399 MachineBasicBlock &OrigInsBB = B.getMBB(); 2400 auto OrigInsPt = B.getInsertPt(); 2401 2402 MachineBasicBlock &EntryMBB = B.getMF().front(); 2403 EntryMBB.addLiveIn(PhyReg); 2404 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2405 B.buildCopy(LiveIn, PhyReg); 2406 2407 B.setInsertPt(OrigInsBB, OrigInsPt); 2408 } 2409 2410 return LiveIn; 2411 } 2412 2413 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2414 MachineRegisterInfo &MRI, 2415 Register PhyReg, LLT Ty, 2416 bool InsertLiveInCopy) const { 2417 assert(PhyReg.isPhysical() && "Physical register expected"); 2418 2419 // Get or create the virtual live-in register 2420 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2421 if (!LiveIn) { 2422 LiveIn = MRI.createGenericVirtualRegister(Ty); 2423 MRI.addLiveIn(PhyReg, LiveIn); 2424 } 2425 2426 // When the copy that is actually required is from a virtual register to a 2427 // physical register (to be inserted later), the live-in copy from the 2428 // physical register to the virtual register is not required 2429 if (!InsertLiveInCopy) 2430 return LiveIn; 2431 2432 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2433 } 2434 2435 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2436 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2437 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2438 const ArgDescriptor *Arg; 2439 const TargetRegisterClass *RC; 2440 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2441 if (!Arg) { 2442 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2443 return nullptr; 2444 } 2445 return Arg; 2446 } 2447 2448 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2449 const ArgDescriptor *Arg) const { 2450 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2451 return false; // TODO: Handle these 2452 2453 Register SrcReg = Arg->getRegister(); 2454 assert(SrcReg.isPhysical() && "Physical register expected"); 2455 assert(DstReg.isVirtual() && "Virtual register expected"); 2456 2457 MachineRegisterInfo &MRI = *B.getMRI(); 2458 2459 LLT Ty = MRI.getType(DstReg); 2460 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2461 2462 if (Arg->isMasked()) { 2463 // TODO: Should we try to emit this once in the entry block?
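// For a masked argument the mask describes a contiguous bit field within the
// physical register, so the value is recovered as
// (Reg >> countTrailingZeros(Mask)) & (Mask >> Shift). For example
// (illustrative packing), a field occupying bits 19:10 has Mask = 0x3ff << 10,
// giving Shift = 10 and a final mask of 0x3ff.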
2464 const LLT S32 = LLT::scalar(32); 2465 const unsigned Mask = Arg->getMask(); 2466 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2467 2468 Register AndMaskSrc = LiveIn; 2469 2470 if (Shift != 0) { 2471 auto ShiftAmt = B.buildConstant(S32, Shift); 2472 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2473 } 2474 2475 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2476 } else { 2477 B.buildCopy(DstReg, LiveIn); 2478 } 2479 2480 return true; 2481 } 2482 2483 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2484 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2485 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2486 B.setInstr(MI); 2487 2488 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2489 if (!Arg) 2490 return false; 2491 2492 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2493 return false; 2494 2495 MI.eraseFromParent(); 2496 return true; 2497 } 2498 2499 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2500 MachineRegisterInfo &MRI, 2501 MachineIRBuilder &B) const { 2502 B.setInstr(MI); 2503 Register Dst = MI.getOperand(0).getReg(); 2504 LLT DstTy = MRI.getType(Dst); 2505 LLT S16 = LLT::scalar(16); 2506 LLT S32 = LLT::scalar(32); 2507 LLT S64 = LLT::scalar(64); 2508 2509 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2510 return true; 2511 2512 if (DstTy == S16) 2513 return legalizeFDIV16(MI, MRI, B); 2514 if (DstTy == S32) 2515 return legalizeFDIV32(MI, MRI, B); 2516 if (DstTy == S64) 2517 return legalizeFDIV64(MI, MRI, B); 2518 2519 return false; 2520 } 2521 2522 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2523 const LLT S32 = LLT::scalar(32); 2524 2525 auto Cvt0 = B.buildUITOFP(S32, Src); 2526 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2527 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2528 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2529 return B.buildFPTOUI(S32, Mul).getReg(0); 2530 } 2531 2532 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2533 Register DstReg, 2534 Register Num, 2535 Register Den, 2536 bool IsRem) const { 2537 const LLT S1 = LLT::scalar(1); 2538 const LLT S32 = LLT::scalar(32); 2539 2540 // RCP = URECIP(Den) = 2^32 / Den + e 2541 // e is rounding error. 2542 auto RCP = buildDivRCP(B, Den); 2543 2544 // RCP_LO = mul(RCP, Den) 2545 auto RCP_LO = B.buildMul(S32, RCP, Den); 2546 2547 // RCP_HI = mulhu (RCP, Den) */ 2548 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2549 2550 // NEG_RCP_LO = -RCP_LO 2551 auto Zero = B.buildConstant(S32, 0); 2552 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2553 2554 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2555 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2556 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2557 2558 // Calculate the rounding error from the URECIP instruction 2559 // E = mulhu(ABS_RCP_LO, RCP) 2560 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2561 2562 // RCP_A_E = RCP + E 2563 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2564 2565 // RCP_S_E = RCP - E 2566 auto RCP_S_E = B.buildSub(S32, RCP, E); 2567 2568 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E) 2569 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2570 2571 // Quotient = mulhu(Tmp0, Num) 2572 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2573 2574 // Num_S_Remainder = Quotient * Den 2575 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2576 2577 // Remainder = Num - Num_S_Remainder 2578 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2579 2580 // Remainder_GE_Den = Remainder >= Den 2581 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2582 2583 // Remainder_GE_Zero = Num >= Num_S_Remainder; 2584 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2585 Num, Num_S_Remainder); 2586 2587 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2588 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2589 2590 // Calculate Division result: 2591 2592 // Quotient_A_One = Quotient + 1 2593 auto One = B.buildConstant(S32, 1); 2594 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2595 2596 // Quotient_S_One = Quotient - 1 2597 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2598 2599 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient) 2600 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One); 2601 2602 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2603 if (IsRem) { 2604 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2605 2606 // Calculate Rem result: 2607 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2608 2609 // Remainder_A_Den = Remainder + Den 2610 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2611 2612 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2613 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2614 2615 // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den) 2616 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2617 } else { 2618 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2619 } 2620 } 2621 2622 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2623 MachineRegisterInfo &MRI, 2624 MachineIRBuilder &B) const { 2625 B.setInstr(MI); 2626 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2627 Register DstReg = MI.getOperand(0).getReg(); 2628 Register Num = MI.getOperand(1).getReg(); 2629 Register Den = MI.getOperand(2).getReg(); 2630 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2631 MI.eraseFromParent(); 2632 return true; 2633 } 2634 2635 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2636 // 2637 // Return lo, hi of result 2638 // 2639 // %cvt.lo = G_UITOFP Val.lo 2640 // %cvt.hi = G_UITOFP Val.hi 2641 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2642 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2643 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2644 // %mul2 = G_FMUL %mul1, 2**(-32) 2645 // %trunc = G_INTRINSIC_TRUNC %mul2 2646 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2647 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2648 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2649 Register Val) { 2650 const LLT S32 = LLT::scalar(32); 2651 auto Unmerge = B.buildUnmerge(S32, Val); 2652 2653 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2654 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2655 2656 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2657 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2658 2659 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2660 auto Mul1 = 2661 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2662 2663 // 2**(-32) 2664 auto Mul2 = 2665 B.buildFMul(S32, Mul1,
B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2666 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2667 2668 // -(2**32) 2669 auto Mad2 = B.buildFMAD(S32, Trunc, 2670 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2671 2672 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2673 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2674 2675 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2676 } 2677 2678 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2679 MachineRegisterInfo &MRI, 2680 MachineIRBuilder &B) const { 2681 B.setInstr(MI); 2682 2683 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2684 const LLT S32 = LLT::scalar(32); 2685 const LLT S64 = LLT::scalar(64); 2686 const LLT S1 = LLT::scalar(1); 2687 Register Numer = MI.getOperand(1).getReg(); 2688 Register Denom = MI.getOperand(2).getReg(); 2689 Register RcpLo, RcpHi; 2690 2691 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2692 2693 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2694 2695 auto Zero64 = B.buildConstant(S64, 0); 2696 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2697 2698 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2699 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2700 2701 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2702 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2703 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2704 2705 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2706 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2707 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2708 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2709 2710 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2711 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2712 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2713 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2714 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2715 2716 auto Zero32 = B.buildConstant(S32, 0); 2717 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2718 auto Add2_HiC = 2719 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2720 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2721 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2722 2723 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2724 Register NumerLo = UnmergeNumer.getReg(0); 2725 Register NumerHi = UnmergeNumer.getReg(1); 2726 2727 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2728 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2729 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2730 Register Mul3_Lo = UnmergeMul3.getReg(0); 2731 Register Mul3_Hi = UnmergeMul3.getReg(1); 2732 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2733 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2734 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2735 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2736 2737 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2738 Register DenomLo = UnmergeDenom.getReg(0); 2739 Register DenomHi = UnmergeDenom.getReg(1); 2740 2741 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2742 auto C1 = B.buildSExt(S32, CmpHi); 2743 2744 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2745 auto C2 = B.buildSExt(S32, CmpLo); 2746 2747 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2748 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2749 2750 // TODO: Here and below portions of the code can be enclosed into if/endif. 
2751 // Currently control flow is unconditional and we have 4 selects after 2752 // potential endif to substitute PHIs. 2753 2754 // if C3 != 0 ... 2755 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2756 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2757 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2758 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2759 2760 auto One64 = B.buildConstant(S64, 1); 2761 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2762 2763 auto C4 = 2764 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2765 auto C5 = 2766 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2767 auto C6 = B.buildSelect( 2768 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2769 2770 // if (C6 != 0) 2771 auto Add4 = B.buildAdd(S64, Add3, One64); 2772 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2773 2774 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2775 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2776 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2777 2778 // endif C6 2779 // endif C3 2780 2781 if (IsDiv) { 2782 auto Sel1 = B.buildSelect( 2783 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2784 B.buildSelect(MI.getOperand(0), 2785 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2786 } else { 2787 auto Sel2 = B.buildSelect( 2788 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2789 B.buildSelect(MI.getOperand(0), 2790 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2791 } 2792 2793 MI.eraseFromParent(); 2794 return true; 2795 } 2796 2797 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2798 MachineRegisterInfo &MRI, 2799 MachineIRBuilder &B) const { 2800 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2801 if (Ty == LLT::scalar(32)) 2802 return legalizeUDIV_UREM32(MI, MRI, B); 2803 if (Ty == LLT::scalar(64)) 2804 return legalizeUDIV_UREM64(MI, MRI, B); 2805 return false; 2806 } 2807 2808 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, 2809 MachineRegisterInfo &MRI, 2810 MachineIRBuilder &B) const { 2811 B.setInstr(MI); 2812 const LLT S32 = LLT::scalar(32); 2813 2814 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; 2815 Register DstReg = MI.getOperand(0).getReg(); 2816 Register LHS = MI.getOperand(1).getReg(); 2817 Register RHS = MI.getOperand(2).getReg(); 2818 2819 auto ThirtyOne = B.buildConstant(S32, 31); 2820 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); 2821 auto RHSign = B.buildAShr(S32, RHS, ThirtyOne); 2822 2823 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); 2824 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); 2825 2826 LHS = B.buildXor(S32, LHS, LHSign).getReg(0); 2827 RHS = B.buildXor(S32, RHS, RHSign).getReg(0); 2828 2829 Register UDivRem = MRI.createGenericVirtualRegister(S32); 2830 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem); 2831 2832 if (IsRem) { 2833 auto RSign = LHSign; // Remainder sign is the same as LHS 2834 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2835 B.buildSub(DstReg, UDivRem, RSign); 2836 } else { 2837 auto DSign = B.buildXor(S32, LHSign, RHSign); 2838 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2839 B.buildSub(DstReg, UDivRem, DSign); 2840 } 2841 2842 MI.eraseFromParent(); 2843 return true; 2844 } 2845 2846 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2847 MachineRegisterInfo &MRI, 2848 MachineIRBuilder &B) const { 2849 if
(MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2850 return legalizeSDIV_SREM32(MI, MRI, B); 2851 return false; 2852 } 2853 2854 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2855 MachineRegisterInfo &MRI, 2856 MachineIRBuilder &B) const { 2857 Register Res = MI.getOperand(0).getReg(); 2858 Register LHS = MI.getOperand(1).getReg(); 2859 Register RHS = MI.getOperand(2).getReg(); 2860 2861 uint16_t Flags = MI.getFlags(); 2862 2863 LLT ResTy = MRI.getType(Res); 2864 LLT S32 = LLT::scalar(32); 2865 LLT S64 = LLT::scalar(64); 2866 2867 const MachineFunction &MF = B.getMF(); 2868 bool Unsafe = 2869 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2870 2871 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2872 return false; 2873 2874 if (!Unsafe && ResTy == S32 && 2875 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2876 return false; 2877 2878 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2879 // 1 / x -> RCP(x) 2880 if (CLHS->isExactlyValue(1.0)) { 2881 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2882 .addUse(RHS) 2883 .setMIFlags(Flags); 2884 2885 MI.eraseFromParent(); 2886 return true; 2887 } 2888 2889 // -1 / x -> RCP( FNEG(x) ) 2890 if (CLHS->isExactlyValue(-1.0)) { 2891 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2892 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2893 .addUse(FNeg.getReg(0)) 2894 .setMIFlags(Flags); 2895 2896 MI.eraseFromParent(); 2897 return true; 2898 } 2899 } 2900 2901 // x / y -> x * (1.0 / y) 2902 if (Unsafe) { 2903 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2904 .addUse(RHS) 2905 .setMIFlags(Flags); 2906 B.buildFMul(Res, LHS, RCP, Flags); 2907 2908 MI.eraseFromParent(); 2909 return true; 2910 } 2911 2912 return false; 2913 } 2914 2915 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2916 MachineRegisterInfo &MRI, 2917 MachineIRBuilder &B) const { 2918 B.setInstr(MI); 2919 Register Res = MI.getOperand(0).getReg(); 2920 Register LHS = MI.getOperand(1).getReg(); 2921 Register RHS = MI.getOperand(2).getReg(); 2922 2923 uint16_t Flags = MI.getFlags(); 2924 2925 LLT S16 = LLT::scalar(16); 2926 LLT S32 = LLT::scalar(32); 2927 2928 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2929 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2930 2931 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2932 .addUse(RHSExt.getReg(0)) 2933 .setMIFlags(Flags); 2934 2935 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2936 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2937 2938 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2939 .addUse(RDst.getReg(0)) 2940 .addUse(RHS) 2941 .addUse(LHS) 2942 .setMIFlags(Flags); 2943 2944 MI.eraseFromParent(); 2945 return true; 2946 } 2947 2948 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2949 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2950 static void toggleSPDenormMode(bool Enable, 2951 MachineIRBuilder &B, 2952 const GCNSubtarget &ST, 2953 AMDGPU::SIModeRegisterDefaults Mode) { 2954 // Set SP denorm mode to this value. 2955 unsigned SPDenormMode = 2956 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2957 2958 if (ST.hasDenormModeInst()) { 2959 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
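// The denorm mode field packs the FP32 setting in bits [1:0] and the
// FP64/FP16 setting in bits [3:2], which is why the immediate built below is
// SPDenormMode | (DPDenormModeDefault << 2). For example, keeping both fields
// at FP_DENORM_FLUSH_NONE (3) produces an immediate of 0xf.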
2960 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2961 2962 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2963 B.buildInstr(AMDGPU::S_DENORM_MODE) 2964 .addImm(NewDenormModeValue); 2965 2966 } else { 2967 // Select FP32 bit field in mode register. 2968 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2969 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2970 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2971 2972 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2973 .addImm(SPDenormMode) 2974 .addImm(SPDenormModeBitField); 2975 } 2976 } 2977 2978 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2979 MachineRegisterInfo &MRI, 2980 MachineIRBuilder &B) const { 2981 B.setInstr(MI); 2982 Register Res = MI.getOperand(0).getReg(); 2983 Register LHS = MI.getOperand(1).getReg(); 2984 Register RHS = MI.getOperand(2).getReg(); 2985 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2986 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2987 2988 uint16_t Flags = MI.getFlags(); 2989 2990 LLT S32 = LLT::scalar(32); 2991 LLT S1 = LLT::scalar(1); 2992 2993 auto One = B.buildFConstant(S32, 1.0f); 2994 2995 auto DenominatorScaled = 2996 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2997 .addUse(LHS) 2998 .addUse(RHS) 2999 .addImm(0) 3000 .setMIFlags(Flags); 3001 auto NumeratorScaled = 3002 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3003 .addUse(LHS) 3004 .addUse(RHS) 3005 .addImm(1) 3006 .setMIFlags(Flags); 3007 3008 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3009 .addUse(DenominatorScaled.getReg(0)) 3010 .setMIFlags(Flags); 3011 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3012 3013 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3014 // aren't modeled as reading it. 
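// With (n, d) the scaled numerator/denominator from div_scale and r the raw
// reciprocal approximation, the FMA chain below is one Newton-Raphson
// refinement of the reciprocal followed by quotient correction, roughly:
//   e  = 1 - d * r        (Fma0)
//   r' = r + r * e        (Fma1, refined reciprocal)
//   q  = n * r'           (Mul)
//   dq = n - d * q        (Fma2, residual)
//   q' = q + dq * r'      (Fma3)
//   n - d * q'            (Fma4, fed to div_fmas for the final correction)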
3015 if (!Mode.allFP32Denormals()) 3016 toggleSPDenormMode(true, B, ST, Mode); 3017 3018 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3019 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3020 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3021 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3022 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3023 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3024 3025 if (!Mode.allFP32Denormals()) 3026 toggleSPDenormMode(false, B, ST, Mode); 3027 3028 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3029 .addUse(Fma4.getReg(0)) 3030 .addUse(Fma1.getReg(0)) 3031 .addUse(Fma3.getReg(0)) 3032 .addUse(NumeratorScaled.getReg(1)) 3033 .setMIFlags(Flags); 3034 3035 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3036 .addUse(Fmas.getReg(0)) 3037 .addUse(RHS) 3038 .addUse(LHS) 3039 .setMIFlags(Flags); 3040 3041 MI.eraseFromParent(); 3042 return true; 3043 } 3044 3045 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3046 MachineRegisterInfo &MRI, 3047 MachineIRBuilder &B) const { 3048 B.setInstr(MI); 3049 Register Res = MI.getOperand(0).getReg(); 3050 Register LHS = MI.getOperand(1).getReg(); 3051 Register RHS = MI.getOperand(2).getReg(); 3052 3053 uint16_t Flags = MI.getFlags(); 3054 3055 LLT S64 = LLT::scalar(64); 3056 LLT S1 = LLT::scalar(1); 3057 3058 auto One = B.buildFConstant(S64, 1.0); 3059 3060 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3061 .addUse(LHS) 3062 .addUse(RHS) 3063 .addImm(0) 3064 .setMIFlags(Flags); 3065 3066 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3067 3068 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3069 .addUse(DivScale0.getReg(0)) 3070 .setMIFlags(Flags); 3071 3072 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3073 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3074 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3075 3076 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3077 .addUse(LHS) 3078 .addUse(RHS) 3079 .addImm(1) 3080 .setMIFlags(Flags); 3081 3082 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3083 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3084 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3085 3086 Register Scale; 3087 if (!ST.hasUsableDivScaleConditionOutput()) { 3088 // Workaround a hardware bug on SI where the condition output from div_scale 3089 // is not usable. 
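// Since the condition output is unusable here, recover which operand was
// scaled by comparing the high halves of the inputs against the high halves
// of the two div_scale results; XOR-ing the two compares reconstructs the
// flag that div_fmas expects.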
3090 3091 LLT S32 = LLT::scalar(32); 3092 3093 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3094 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3095 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3096 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3097 3098 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3099 Scale1Unmerge.getReg(1)); 3100 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3101 Scale0Unmerge.getReg(1)); 3102 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3103 } else { 3104 Scale = DivScale1.getReg(1); 3105 } 3106 3107 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3108 .addUse(Fma4.getReg(0)) 3109 .addUse(Fma3.getReg(0)) 3110 .addUse(Mul.getReg(0)) 3111 .addUse(Scale) 3112 .setMIFlags(Flags); 3113 3114 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3115 .addUse(Fmas.getReg(0)) 3116 .addUse(RHS) 3117 .addUse(LHS) 3118 .setMIFlags(Flags); 3119 3120 MI.eraseFromParent(); 3121 return true; 3122 } 3123 3124 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3125 MachineRegisterInfo &MRI, 3126 MachineIRBuilder &B) const { 3127 B.setInstr(MI); 3128 Register Res = MI.getOperand(0).getReg(); 3129 Register LHS = MI.getOperand(2).getReg(); 3130 Register RHS = MI.getOperand(3).getReg(); 3131 uint16_t Flags = MI.getFlags(); 3132 3133 LLT S32 = LLT::scalar(32); 3134 LLT S1 = LLT::scalar(1); 3135 3136 auto Abs = B.buildFAbs(S32, RHS, Flags); 3137 const APFloat C0Val(1.0f); 3138 3139 auto C0 = B.buildConstant(S32, 0x6f800000); 3140 auto C1 = B.buildConstant(S32, 0x2f800000); 3141 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3142 3143 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3144 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3145 3146 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3147 3148 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3149 .addUse(Mul0.getReg(0)) 3150 .setMIFlags(Flags); 3151 3152 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3153 3154 B.buildFMul(Res, Sel, Mul1, Flags); 3155 3156 MI.eraseFromParent(); 3157 return true; 3158 } 3159 3160 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3161 MachineRegisterInfo &MRI, 3162 MachineIRBuilder &B) const { 3163 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3164 if (!MFI->isEntryFunction()) { 3165 return legalizePreloadedArgIntrin(MI, MRI, B, 3166 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3167 } 3168 3169 B.setInstr(MI); 3170 3171 uint64_t Offset = 3172 ST.getTargetLowering()->getImplicitParameterOffset( 3173 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3174 Register DstReg = MI.getOperand(0).getReg(); 3175 LLT DstTy = MRI.getType(DstReg); 3176 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3177 3178 const ArgDescriptor *Arg; 3179 const TargetRegisterClass *RC; 3180 std::tie(Arg, RC) 3181 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3182 if (!Arg) 3183 return false; 3184 3185 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3186 if (!loadInputValue(KernargPtrReg, B, Arg)) 3187 return false; 3188 3189 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3190 MI.eraseFromParent(); 3191 return true; 3192 } 3193 3194 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3195 MachineRegisterInfo &MRI, 3196 MachineIRBuilder &B, 3197 unsigned AddrSpace) const { 3198 B.setInstr(MI); 3199 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 3200 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3201 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3202 MI.eraseFromParent(); 3203 return true; 3204 } 3205 3206 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3207 // offset (the offset that is included in bounds checking and swizzling, to be 3208 // split between the instruction's voffset and immoffset fields) and soffset 3209 // (the offset that is excluded from bounds checking and swizzling, to go in 3210 // the instruction's soffset field). This function takes the first kind of 3211 // offset and figures out how to split it between voffset and immoffset. 3212 std::tuple<Register, unsigned, unsigned> 3213 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3214 Register OrigOffset) const { 3215 const unsigned MaxImm = 4095; 3216 Register BaseReg; 3217 unsigned TotalConstOffset; 3218 MachineInstr *OffsetDef; 3219 const LLT S32 = LLT::scalar(32); 3220 3221 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3222 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3223 3224 unsigned ImmOffset = TotalConstOffset; 3225 3226 // If the immediate value is too big for the immoffset field, put the value 3227 // and -4096 into the immoffset field so that the value that is copied/added 3228 // for the voffset field is a multiple of 4096, and it stands more chance 3229 // of being CSEd with the copy/add for another similar load/store. 3230 // However, do not do that rounding down to a multiple of 4096 if that is a 3231 // negative number, as it appears to be illegal to have a negative offset 3232 // in the vgpr, even if adding the immediate offset makes it positive. 3233 unsigned Overflow = ImmOffset & ~MaxImm; 3234 ImmOffset -= Overflow; 3235 if ((int32_t)Overflow < 0) { 3236 Overflow += ImmOffset; 3237 ImmOffset = 0; 3238 } 3239 3240 if (Overflow != 0) { 3241 if (!BaseReg) { 3242 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3243 } else { 3244 auto OverflowVal = B.buildConstant(S32, Overflow); 3245 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3246 } 3247 } 3248 3249 if (!BaseReg) 3250 BaseReg = B.buildConstant(S32, 0).getReg(0); 3251 3252 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3253 } 3254 3255 /// Handle register layout difference for f16 images for some subtargets. 3256 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3257 MachineRegisterInfo &MRI, 3258 Register Reg) const { 3259 if (!ST.hasUnpackedD16VMem()) 3260 return Reg; 3261 3262 const LLT S16 = LLT::scalar(16); 3263 const LLT S32 = LLT::scalar(32); 3264 LLT StoreVT = MRI.getType(Reg); 3265 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3266 3267 auto Unmerge = B.buildUnmerge(S16, Reg); 3268 3269 SmallVector<Register, 4> WideRegs; 3270 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3271 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3272 3273 int NumElts = StoreVT.getNumElements(); 3274 3275 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3276 } 3277 3278 Register AMDGPULegalizerInfo::fixStoreSourceType( 3279 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3280 MachineRegisterInfo *MRI = B.getMRI(); 3281 LLT Ty = MRI->getType(VData); 3282 3283 const LLT S16 = LLT::scalar(16); 3284 3285 // Fixup illegal register types for i8 stores. 
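// The buffer store pseudos take 32-bit (or d16) data registers, so s8 and s16
// sources are any-extended to s32 here; the memory size recorded in the MMO
// still selects the byte/short store opcode later, so only the register type
// changes, not the store width.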
3286 if (Ty == LLT::scalar(8) || Ty == S16) { 3287 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3288 return AnyExt; 3289 } 3290 3291 if (Ty.isVector()) { 3292 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3293 if (IsFormat) 3294 return handleD16VData(B, *MRI, VData); 3295 } 3296 } 3297 3298 return VData; 3299 } 3300 3301 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3302 MachineRegisterInfo &MRI, 3303 MachineIRBuilder &B, 3304 bool IsTyped, 3305 bool IsFormat) const { 3306 B.setInstr(MI); 3307 3308 Register VData = MI.getOperand(1).getReg(); 3309 LLT Ty = MRI.getType(VData); 3310 LLT EltTy = Ty.getScalarType(); 3311 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3312 const LLT S32 = LLT::scalar(32); 3313 3314 VData = fixStoreSourceType(B, VData, IsFormat); 3315 Register RSrc = MI.getOperand(2).getReg(); 3316 3317 MachineMemOperand *MMO = *MI.memoperands_begin(); 3318 const int MemSize = MMO->getSize(); 3319 3320 unsigned ImmOffset; 3321 unsigned TotalOffset; 3322 3323 // The typed intrinsics add an immediate after the registers. 3324 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3325 3326 // The struct intrinsic variants add one additional operand over raw. 3327 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3328 Register VIndex; 3329 int OpOffset = 0; 3330 if (HasVIndex) { 3331 VIndex = MI.getOperand(3).getReg(); 3332 OpOffset = 1; 3333 } 3334 3335 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3336 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3337 3338 unsigned Format = 0; 3339 if (IsTyped) { 3340 Format = MI.getOperand(5 + OpOffset).getImm(); 3341 ++OpOffset; 3342 } 3343 3344 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3345 3346 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3347 if (TotalOffset != 0) 3348 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3349 3350 unsigned Opc; 3351 if (IsTyped) { 3352 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3353 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3354 } else if (IsFormat) { 3355 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3356 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3357 } else { 3358 switch (MemSize) { 3359 case 1: 3360 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3361 break; 3362 case 2: 3363 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3364 break; 3365 default: 3366 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3367 break; 3368 } 3369 } 3370 3371 if (!VIndex) 3372 VIndex = B.buildConstant(S32, 0).getReg(0); 3373 3374 auto MIB = B.buildInstr(Opc) 3375 .addUse(VData) // vdata 3376 .addUse(RSrc) // rsrc 3377 .addUse(VIndex) // vindex 3378 .addUse(VOffset) // voffset 3379 .addUse(SOffset) // soffset 3380 .addImm(ImmOffset); // offset(imm) 3381 3382 if (IsTyped) 3383 MIB.addImm(Format); 3384 3385 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3386 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3387 .addMemOperand(MMO); 3388 3389 MI.eraseFromParent(); 3390 return true; 3391 } 3392 3393 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3394 MachineRegisterInfo &MRI, 3395 MachineIRBuilder &B, 3396 bool IsFormat, 3397 bool IsTyped) const { 3398 B.setInstr(MI); 3399 3400 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
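// Operand layout after the intrinsic ID is: rsrc, [vindex for the struct
// forms], voffset, soffset, [format for the typed forms], aux. That is why
// the struct forms have 7 operands in total (8 for the typed variants), which
// is what the NumVIndexOps check below keys on.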
MachineMemOperand *MMO = *MI.memoperands_begin(); 3402 const int MemSize = MMO->getSize(); 3403 const LLT S32 = LLT::scalar(32); 3404 3405 Register Dst = MI.getOperand(0).getReg(); 3406 Register RSrc = MI.getOperand(2).getReg(); 3407 3408 // The typed intrinsics add an immediate after the registers. 3409 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3410 3411 // The struct intrinsic variants add one additional operand over raw. 3412 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3413 Register VIndex; 3414 int OpOffset = 0; 3415 if (HasVIndex) { 3416 VIndex = MI.getOperand(3).getReg(); 3417 OpOffset = 1; 3418 } 3419 3420 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3421 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3422 3423 unsigned Format = 0; 3424 if (IsTyped) { 3425 Format = MI.getOperand(5 + OpOffset).getImm(); 3426 ++OpOffset; 3427 } 3428 3429 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3430 unsigned ImmOffset; 3431 unsigned TotalOffset; 3432 3433 LLT Ty = MRI.getType(Dst); 3434 LLT EltTy = Ty.getScalarType(); 3435 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3436 const bool Unpacked = ST.hasUnpackedD16VMem(); 3437 3438 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3439 if (TotalOffset != 0) 3440 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3441 3442 unsigned Opc; 3443 3444 if (IsTyped) { 3445 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3446 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3447 } else if (IsFormat) { 3448 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3449 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3450 } else { 3451 switch (MemSize) { 3452 case 1: 3453 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3454 break; 3455 case 2: 3456 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3457 break; 3458 default: 3459 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3460 break; 3461 } 3462 } 3463 3464 Register LoadDstReg; 3465 3466 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3467 LLT UnpackedTy = Ty.changeElementSize(32); 3468 3469 if (IsExtLoad) 3470 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3471 else if (Unpacked && IsD16 && Ty.isVector()) 3472 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3473 else 3474 LoadDstReg = Dst; 3475 3476 if (!VIndex) 3477 VIndex = B.buildConstant(S32, 0).getReg(0); 3478 3479 auto MIB = B.buildInstr(Opc) 3480 .addDef(LoadDstReg) // vdata 3481 .addUse(RSrc) // rsrc 3482 .addUse(VIndex) // vindex 3483 .addUse(VOffset) // voffset 3484 .addUse(SOffset) // soffset 3485 .addImm(ImmOffset); // offset(imm) 3486 3487 if (IsTyped) 3488 MIB.addImm(Format); 3489 3490 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3491 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3492 .addMemOperand(MMO); 3493 3494 if (LoadDstReg != Dst) { 3495 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3496 3497 // The load result register was widened above; narrow it back to the original type.
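// For example (illustrative): on a subtarget with unpacked d16 buffer
// operations, a <4 x s16> format load is performed as <4 x s32>; the code
// below unmerges the s32 lanes, truncates each to s16, and re-merges them into
// the original vector. Plain extending loads (sub-dword memory sizes) are
// simply truncated from the widened s32 result.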
3498 if (IsExtLoad) 3499 B.buildTrunc(Dst, LoadDstReg); 3500 else { 3501 // Repack to original 16-bit vector result 3502 // FIXME: G_TRUNC should work, but legalization currently fails 3503 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3504 SmallVector<Register, 4> Repack; 3505 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3506 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3507 B.buildMerge(Dst, Repack); 3508 } 3509 } 3510 3511 MI.eraseFromParent(); 3512 return true; 3513 } 3514 3515 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3516 MachineIRBuilder &B, 3517 bool IsInc) const { 3518 B.setInstr(MI); 3519 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3520 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3521 B.buildInstr(Opc) 3522 .addDef(MI.getOperand(0).getReg()) 3523 .addUse(MI.getOperand(2).getReg()) 3524 .addUse(MI.getOperand(3).getReg()) 3525 .cloneMemRefs(MI); 3526 MI.eraseFromParent(); 3527 return true; 3528 } 3529 3530 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3531 switch (IntrID) { 3532 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3533 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3534 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3535 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3536 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3537 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3538 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3539 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3540 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3541 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3542 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3543 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3544 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3545 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3546 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3547 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3548 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3549 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3550 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3551 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3552 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3553 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3554 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3555 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3556 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3557 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3558 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3559 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3560 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3561 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3562 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3563 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3564 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3565 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3566 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3567 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3568 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3569 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3570 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3571 default: 3572 llvm_unreachable("unhandled atomic opcode"); 3573 } 3574 } 3575 3576 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3577 MachineIRBuilder &B, 3578 Intrinsic::ID IID) const { 3579 B.setInstr(MI); 3580 3581 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3582 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3583 3584 Register Dst = MI.getOperand(0).getReg(); 3585 Register VData = 
MI.getOperand(2).getReg(); 3586 3587 Register CmpVal; 3588 int OpOffset = 0; 3589 3590 if (IsCmpSwap) { 3591 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3592 ++OpOffset; 3593 } 3594 3595 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3596 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3597 3598 // The struct intrinsic variants add one additional operand over raw. 3599 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3600 Register VIndex; 3601 if (HasVIndex) { 3602 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3603 ++OpOffset; 3604 } 3605 3606 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3607 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3608 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3609 3610 MachineMemOperand *MMO = *MI.memoperands_begin(); 3611 3612 unsigned ImmOffset; 3613 unsigned TotalOffset; 3614 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3615 if (TotalOffset != 0) 3616 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3617 3618 if (!VIndex) 3619 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3620 3621 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3622 .addDef(Dst) 3623 .addUse(VData); // vdata 3624 3625 if (IsCmpSwap) 3626 MIB.addReg(CmpVal); 3627 3628 MIB.addUse(RSrc) // rsrc 3629 .addUse(VIndex) // vindex 3630 .addUse(VOffset) // voffset 3631 .addUse(SOffset) // soffset 3632 .addImm(ImmOffset) // offset(imm) 3633 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3634 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3635 .addMemOperand(MMO); 3636 3637 MI.eraseFromParent(); 3638 return true; 3639 } 3640 3641 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3642 /// vector with s16 typed elements. 3643 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3644 SmallVectorImpl<Register> &PackedAddrs, 3645 int AddrIdx, int DimIdx, int NumVAddrs, 3646 int NumGradients) { 3647 const LLT S16 = LLT::scalar(16); 3648 const LLT V2S16 = LLT::vector(2, 16); 3649 3650 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3651 MachineOperand &SrcOp = MI.getOperand(I); 3652 if (!SrcOp.isReg()) 3653 continue; // _L to _LZ may have eliminated this. 3654 3655 Register AddrReg = SrcOp.getReg(); 3656 3657 if (I < DimIdx) { 3658 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3659 PackedAddrs.push_back(AddrReg); 3660 } else { 3661 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3662 // derivatives dx/dh and dx/dv are packed with undef. 3663 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3664 ((NumGradients / 2) % 2 == 1 && 3665 (I == DimIdx + (NumGradients / 2) - 1 || 3666 I == DimIdx + NumGradients - 1)) || 3667 // Check for _L to _LZ optimization 3668 !MI.getOperand(I + 1).isReg()) { 3669 PackedAddrs.push_back( 3670 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3671 .getReg(0)); 3672 } else { 3673 PackedAddrs.push_back( 3674 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3675 .getReg(0)); 3676 ++I; 3677 } 3678 } 3679 } 3680 } 3681 3682 /// Convert from separate vaddr components to a single vector address register, 3683 /// and replace the remaining operands with $noreg. 
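/// Illustrative example (assumed shapes, not from a real lowering): with five
/// s32 address components and NSA disabled, the five registers are padded with
/// three G_IMPLICIT_DEF values up to eight (there are no register classes for
/// 5-7 dwords; see the FIXME below), combined into a single <8 x s32>
/// G_BUILD_VECTOR that is placed in the first vaddr slot, and the remaining
/// vaddr operands are set to $noreg.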
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3685 int DimIdx, int NumVAddrs) { 3686 const LLT S32 = LLT::scalar(32); 3687 3688 SmallVector<Register, 8> AddrRegs; 3689 for (int I = 0; I != NumVAddrs; ++I) { 3690 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3691 if (SrcOp.isReg()) { 3692 AddrRegs.push_back(SrcOp.getReg()); 3693 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3694 } 3695 } 3696 3697 int NumAddrRegs = AddrRegs.size(); 3698 if (NumAddrRegs != 1) { 3699 // Round up to 8 elements for v5-v7 3700 // FIXME: Missing intermediate sized register classes and instructions. 3701 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3702 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3703 auto Undef = B.buildUndef(S32); 3704 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3705 NumAddrRegs = RoundedNumRegs; 3706 } 3707 3708 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3709 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3710 } 3711 3712 for (int I = 1; I != NumVAddrs; ++I) { 3713 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3714 if (SrcOp.isReg()) 3715 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3716 } 3717 } 3718 3719 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3720 /// 3721 /// Depending on the subtarget, loads and stores with 16-bit element data need 3722 /// to be rewritten to use the low half of 32-bit registers, or directly use a 3723 /// packed layout. 16-bit addresses should also sometimes be packed into 32-bit 3724 /// registers. 3725 /// 3726 /// We don't want to directly select image instructions just yet, but also want 3727 /// to expose all register repacking to the legalizer/combiners. We also don't 3728 /// want a selected instruction entering RegBankSelect. In order to avoid 3729 /// defining a multitude of intermediate image instructions, directly hack on 3730 /// the intrinsic's arguments. In cases like a16 addresses, this requires 3731 /// padding the now-unnecessary arguments with $noreg. 3732 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3733 MachineInstr &MI, MachineIRBuilder &B, 3734 GISelChangeObserver &Observer, 3735 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3736 B.setInstr(MI); 3737 3738 const int NumDefs = MI.getNumExplicitDefs(); 3739 bool IsTFE = NumDefs == 2; 3740 // We are only processing the operands of d16 image operations on subtargets 3741 // that use the unpacked register layout, or need to repack the TFE result. 3742 3743 // TODO: Do we need to guard against already legalized intrinsics? 3744 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3745 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3746 3747 MachineRegisterInfo *MRI = B.getMRI(); 3748 const LLT S32 = LLT::scalar(32); 3749 const LLT S16 = LLT::scalar(16); 3750 const LLT V2S16 = LLT::vector(2, 16); 3751 3752 // Index of first address argument 3753 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3754 3755 // Check for 16-bit addresses and pack them if so. 3756 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3757 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3758 const bool IsA16 = AddrTy == S16; 3759 3760 int NumVAddrs, NumGradients; 3761 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3762 const int DMaskIdx = BaseOpcode->Atomic ?
-1 : 3763 getDMaskIdx(BaseOpcode, NumDefs); 3764 unsigned DMask = 0; 3765 3766 int DMaskLanes = 0; 3767 if (!BaseOpcode->Atomic) { 3768 DMask = MI.getOperand(DMaskIdx).getImm(); 3769 if (BaseOpcode->Gather4) { 3770 DMaskLanes = 4; 3771 } else if (DMask != 0) { 3772 DMaskLanes = countPopulation(DMask); 3773 } else if (!IsTFE && !BaseOpcode->Store) { 3774 // If dmask is 0, this is a no-op load. This can be eliminated. 3775 B.buildUndef(MI.getOperand(0)); 3776 MI.eraseFromParent(); 3777 return true; 3778 } 3779 } 3780 3781 Observer.changingInstr(MI); 3782 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3783 3784 unsigned NewOpcode = NumDefs == 0 ? 3785 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3786 3787 // Track that we legalized this 3788 MI.setDesc(B.getTII().get(NewOpcode)); 3789 3790 // We still expect to get an error flag since TFE is on even though dmask 3791 // is 0. Force dmask to be at least 1; otherwise the instruction will fail. 3792 if (IsTFE && DMask == 0) { 3793 DMask = 0x1; 3794 DMaskLanes = 1; 3795 MI.getOperand(DMaskIdx).setImm(DMask); 3796 } 3797 3798 if (BaseOpcode->Atomic) { 3799 Register VData0 = MI.getOperand(2).getReg(); 3800 LLT Ty = MRI->getType(VData0); 3801 3802 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3803 if (Ty.isVector()) 3804 return false; 3805 3806 if (BaseOpcode->AtomicX2) { 3807 Register VData1 = MI.getOperand(3).getReg(); 3808 // The two values are packed in one register. 3809 LLT PackedTy = LLT::vector(2, Ty); 3810 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3811 MI.getOperand(2).setReg(Concat.getReg(0)); 3812 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3813 } 3814 } 3815 3816 int CorrectedNumVAddrs = NumVAddrs; 3817 3818 // Optimize _L to _LZ when _L is zero 3819 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3820 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3821 const ConstantFP *ConstantLod; 3822 const int LodIdx = AddrIdx + NumVAddrs - 1; 3823 3824 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3825 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3826 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3827 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3828 LZMappingInfo->LZ, ImageDimIntr->Dim); 3829 3830 // The starting indexes should remain in the same place. 3831 --NumVAddrs; 3832 --CorrectedNumVAddrs; 3833 3834 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3835 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3836 MI.RemoveOperand(LodIdx); 3837 } 3838 } 3839 } 3840 3841 // Optimize _mip away, when 'lod' is zero 3842 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3843 int64_t ConstantLod; 3844 const int LodIdx = AddrIdx + NumVAddrs - 1; 3845 3846 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3847 if (ConstantLod == 0) { 3848 // TODO: Change the intrinsic opcode and remove the operand instead of 3849 // replacing it with 0, as is done for the _L to _LZ handling above. 3850 MI.getOperand(LodIdx).ChangeToImmediate(0); 3851 --CorrectedNumVAddrs; 3852 } 3853 } 3854 } 3855 3856 // If the register allocator cannot place the address registers contiguously 3857 // without introducing moves, then using the non-sequential address encoding 3858 // is always preferable, since it saves VALU instructions and is usually a 3859 // wash in terms of code size or even better.
3860 // 3861 // However, we currently have no way of hinting to the register allocator 3862 // that MIMG addresses should be placed contiguously when it is possible to 3863 // do so, so force non-NSA for the common 2-address case as a heuristic. 3864 // 3865 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3866 // allocation when possible. 3867 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3868 3869 // Rewrite the addressing register layout before doing anything else. 3870 if (IsA16) { 3871 // FIXME: this feature is missing from gfx10. When that is fixed, this check 3872 // should be introduced. 3873 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3874 return false; 3875 3876 if (NumVAddrs > 1) { 3877 SmallVector<Register, 4> PackedRegs; 3878 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3879 NumGradients); 3880 3881 if (!UseNSA && PackedRegs.size() > 1) { 3882 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3883 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3884 PackedRegs[0] = Concat.getReg(0); 3885 PackedRegs.resize(1); 3886 } 3887 3888 const int NumPacked = PackedRegs.size(); 3889 for (int I = 0; I != NumVAddrs; ++I) { 3890 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3891 if (!SrcOp.isReg()) { 3892 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3893 continue; 3894 } 3895 3896 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3897 3898 if (I < NumPacked) 3899 SrcOp.setReg(PackedRegs[I]); 3900 else 3901 SrcOp.setReg(AMDGPU::NoRegister); 3902 } 3903 } 3904 } else if (!UseNSA && NumVAddrs > 1) { 3905 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3906 } 3907 3908 3909 if (BaseOpcode->Store) { // No TFE for stores? 3910 // TODO: Handle dmask trim 3911 Register VData = MI.getOperand(1).getReg(); 3912 LLT Ty = MRI->getType(VData); 3913 if (!Ty.isVector() || Ty.getElementType() != S16) 3914 return true; 3915 3916 B.setInstr(MI); 3917 3918 Register RepackedReg = handleD16VData(B, *MRI, VData); 3919 if (RepackedReg != VData) { 3920 MI.getOperand(1).setReg(RepackedReg); 3921 } 3922 3923 return true; 3924 } 3925 3926 Register DstReg = MI.getOperand(0).getReg(); 3927 LLT Ty = MRI->getType(DstReg); 3928 const LLT EltTy = Ty.getScalarType(); 3929 const bool IsD16 = Ty.getScalarType() == S16; 3930 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3931 3932 // Confirm that the return type is large enough for the dmask specified 3933 if (NumElts < DMaskLanes) 3934 return false; 3935 3936 if (NumElts > 4 || DMaskLanes > 4) 3937 return false; 3938 3939 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3940 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3941 3942 // The raw dword aligned data component of the load. The only legal cases 3943 // where this matters should be when using the packed D16 format, for 3944 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3945 LLT RoundedTy; 3946 3947 // S32 vector to to cover all data, plus TFE result element. 3948 LLT TFETy; 3949 3950 // Register type to use for each loaded component. Will be S32 or V2S16. 
LLT RegTy; 3952 3953 if (IsD16 && ST.hasUnpackedD16VMem()) { 3954 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3955 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3956 RegTy = S32; 3957 } else { 3958 unsigned EltSize = EltTy.getSizeInBits(); 3959 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3960 unsigned RoundedSize = 32 * RoundedElts; 3961 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3962 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3963 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3964 } 3965 3966 // The return type does not need adjustment. 3967 // TODO: Should we change the s16 case to s32 or <2 x s16>? 3968 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3969 return true; 3970 3971 Register Dst1Reg; 3972 3973 // Insert after the instruction. 3974 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3975 3976 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3977 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3978 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 3979 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 3980 3981 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 3982 3983 MI.getOperand(0).setReg(NewResultReg); 3984 3985 // In the IR, TFE is supposed to be used with a 2 element struct return 3986 // type. The instruction really returns these two values in one contiguous 3987 // register, with one additional dword beyond the loaded data. Rewrite the 3988 // return type to use a single register result. 3989 3990 if (IsTFE) { 3991 Dst1Reg = MI.getOperand(1).getReg(); 3992 if (MRI->getType(Dst1Reg) != S32) 3993 return false; 3994 3995 // TODO: Make sure the TFE operand bit is set. 3996 MI.RemoveOperand(1); 3997 3998 // Handle the easy case that requires no repack instructions. 3999 if (Ty == S32) { 4000 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4001 return true; 4002 } 4003 } 4004 4005 // Now figure out how to copy the new result register back into the old 4006 // result. 4007 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4008 4009 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4010 4011 if (ResultNumRegs == 1) { 4012 assert(!IsTFE); 4013 ResultRegs[0] = NewResultReg; 4014 } else { 4015 // We have to repack into a new vector of some kind. 4016 for (int I = 0; I != NumDataRegs; ++I) 4017 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4018 B.buildUnmerge(ResultRegs, NewResultReg); 4019 4020 // Drop the final TFE element to get the data part. The TFE result is 4021 // directly written to the right place already. 4022 if (IsTFE) 4023 ResultRegs.resize(NumDataRegs); 4024 } 4025 4026 // For an s16 scalar result, we form an s32 result with a truncate regardless 4027 // of packed vs. unpacked. 4028 if (IsD16 && !Ty.isVector()) { 4029 B.buildTrunc(DstReg, ResultRegs[0]); 4030 return true; 4031 } 4032 4033 // Avoid a build/concat_vector of 1 entry. 4034 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4035 B.buildBitcast(DstReg, ResultRegs[0]); 4036 return true; 4037 } 4038 4039 assert(Ty.isVector()); 4040 4041 if (IsD16) { 4042 // For packed D16 results with TFE enabled, all the data components are 4043 // S32. Cast back to the expected type. 4044 // 4045 // TODO: We don't really need to load s32 elements. We would only need one 4046 // cast for the TFE result if a multiple of v2s16 was used.
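// Summary of the two branches below: packed d16 data that was loaded as s32
// pieces (the TFE case) is bitcast back to v2s16, while unpacked d16 data is
// truncated from s32 to s16 per component and reassembled into the final
// vector further down.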
4047 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4048 for (Register &Reg : ResultRegs) 4049 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4050 } else if (ST.hasUnpackedD16VMem()) { 4051 for (Register &Reg : ResultRegs) 4052 Reg = B.buildTrunc(S16, Reg).getReg(0); 4053 } 4054 } 4055 4056 auto padWithUndef = [&](LLT Ty, int NumElts) { 4057 if (NumElts == 0) 4058 return; 4059 Register Undef = B.buildUndef(Ty).getReg(0); 4060 for (int I = 0; I != NumElts; ++I) 4061 ResultRegs.push_back(Undef); 4062 }; 4063 4064 // Pad out any elements eliminated due to the dmask. 4065 LLT ResTy = MRI->getType(ResultRegs[0]); 4066 if (!ResTy.isVector()) { 4067 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4068 B.buildBuildVector(DstReg, ResultRegs); 4069 return true; 4070 } 4071 4072 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4073 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4074 4075 // Deal with the one annoying legal case. 4076 const LLT V3S16 = LLT::vector(3, 16); 4077 if (Ty == V3S16) { 4078 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4079 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4080 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4081 return true; 4082 } 4083 4084 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4085 B.buildConcatVectors(DstReg, ResultRegs); 4086 return true; 4087 } 4088 4089 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4090 MachineInstr &MI, MachineIRBuilder &B, 4091 GISelChangeObserver &Observer) const { 4092 Register Dst = MI.getOperand(0).getReg(); 4093 LLT Ty = B.getMRI()->getType(Dst); 4094 unsigned Size = Ty.getSizeInBits(); 4095 MachineFunction &MF = B.getMF(); 4096 4097 Observer.changingInstr(MI); 4098 4099 // FIXME: We don't really need this intermediate instruction. The intrinsic 4100 // should be fixed to have a memory operand. Since it's readnone, we're not 4101 // allowed to add one. 4102 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4103 MI.RemoveOperand(1); // Remove intrinsic ID 4104 4105 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4106 // TODO: Should this use datalayout alignment? 4107 const unsigned MemSize = (Size + 7) / 8; 4108 const Align MemAlign(4); 4109 MachineMemOperand *MMO = MF.getMachineMemOperand( 4110 MachinePointerInfo(), 4111 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4112 MachineMemOperand::MOInvariant, 4113 MemSize, MemAlign); 4114 MI.addMemOperand(MF, MMO); 4115 4116 // There are no 96-bit result scalar loads, but widening to 128-bit should 4117 // always be legal. We may need to restore this to a 96-bit result if it turns 4118 // out this needs to be converted to a vector load during RegBankSelect. 4119 if (!isPowerOf2_32(Size)) { 4120 LegalizerHelper Helper(MF, *this, Observer, B); 4121 B.setInstr(MI); 4122 4123 if (Ty.isVector()) 4124 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4125 else 4126 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4127 } 4128 4129 Observer.changedInstr(MI); 4130 return true; 4131 } 4132 4133 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4134 MachineRegisterInfo &MRI, 4135 MachineIRBuilder &B) const { 4136 B.setInstr(MI); 4137 4138 // Is non-HSA path or trap-handler disabled? 
If so, insert an s_endpgm instruction. 4139 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4140 !ST.isTrapHandlerEnabled()) { 4141 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4142 } else { 4143 // Pass queue pointer to trap handler as input, and insert trap instruction 4144 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4145 const ArgDescriptor *Arg = 4146 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4147 if (!Arg) 4148 return false; 4149 MachineRegisterInfo &MRI = *B.getMRI(); 4150 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4151 Register LiveIn = getLiveInRegister( 4152 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4153 /*InsertLiveInCopy=*/false); 4154 if (!loadInputValue(LiveIn, B, Arg)) 4155 return false; 4156 B.buildCopy(SGPR01, LiveIn); 4157 B.buildInstr(AMDGPU::S_TRAP) 4158 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4159 .addReg(SGPR01, RegState::Implicit); 4160 } 4161 4162 MI.eraseFromParent(); 4163 return true; 4164 } 4165 4166 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4167 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4168 B.setInstr(MI); 4169 4170 // Is non-HSA path or trap-handler disabled? If so, report a warning 4171 // accordingly. 4172 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4173 !ST.isTrapHandlerEnabled()) { 4174 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4175 "debugtrap handler not supported", 4176 MI.getDebugLoc(), DS_Warning); 4177 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4178 Ctx.diagnose(NoTrap); 4179 } else { 4180 // Insert debug-trap instruction 4181 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4182 } 4183 4184 MI.eraseFromParent(); 4185 return true; 4186 } 4187 4188 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 4189 MachineIRBuilder &B, 4190 GISelChangeObserver &Observer) const { 4191 MachineRegisterInfo &MRI = *B.getMRI(); 4192 4193 // Replace the G_BRCOND use with the exec manipulation and branch pseudos. 4194 auto IntrID = MI.getIntrinsicID(); 4195 switch (IntrID) { 4196 case Intrinsic::amdgcn_if: 4197 case Intrinsic::amdgcn_else: { 4198 MachineInstr *Br = nullptr; 4199 MachineBasicBlock *UncondBrTarget = nullptr; 4200 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4201 const SIRegisterInfo *TRI 4202 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4203 4204 B.setInstr(*BrCond); 4205 Register Def = MI.getOperand(1).getReg(); 4206 Register Use = MI.getOperand(3).getReg(); 4207 4208 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4209 if (IntrID == Intrinsic::amdgcn_if) { 4210 B.buildInstr(AMDGPU::SI_IF) 4211 .addDef(Def) 4212 .addUse(Use) 4213 .addMBB(UncondBrTarget); 4214 } else { 4215 B.buildInstr(AMDGPU::SI_ELSE) 4216 .addDef(Def) 4217 .addUse(Use) 4218 .addMBB(UncondBrTarget) 4219 .addImm(0); 4220 } 4221 4222 if (Br) { 4223 Br->getOperand(0).setMBB(CondBrTarget); 4224 } else { 4225 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4226 // since we're swapping branch targets it needs to be reinserted.
4227 // FIXME: IRTranslator should probably not do this 4228 B.buildBr(*CondBrTarget); 4229 } 4230 4231 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4232 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4233 MI.eraseFromParent(); 4234 BrCond->eraseFromParent(); 4235 return true; 4236 } 4237 4238 return false; 4239 } 4240 case Intrinsic::amdgcn_loop: { 4241 MachineInstr *Br = nullptr; 4242 MachineBasicBlock *UncondBrTarget = nullptr; 4243 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4244 const SIRegisterInfo *TRI 4245 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4246 4247 B.setInstr(*BrCond); 4248 4249 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4250 Register Reg = MI.getOperand(2).getReg(); 4251 B.buildInstr(AMDGPU::SI_LOOP) 4252 .addUse(Reg) 4253 .addMBB(UncondBrTarget); 4254 4255 if (Br) 4256 Br->getOperand(0).setMBB(CondBrTarget); 4257 else 4258 B.buildBr(*CondBrTarget); 4259 4260 MI.eraseFromParent(); 4261 BrCond->eraseFromParent(); 4262 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4263 return true; 4264 } 4265 4266 return false; 4267 } 4268 case Intrinsic::amdgcn_kernarg_segment_ptr: 4269 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4270 B.setInstr(MI); 4271 // This only makes sense to call in a kernel, so just lower to null. 4272 B.buildConstant(MI.getOperand(0).getReg(), 0); 4273 MI.eraseFromParent(); 4274 return true; 4275 } 4276 4277 return legalizePreloadedArgIntrin( 4278 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4279 case Intrinsic::amdgcn_implicitarg_ptr: 4280 return legalizeImplicitArgPtr(MI, MRI, B); 4281 case Intrinsic::amdgcn_workitem_id_x: 4282 return legalizePreloadedArgIntrin(MI, MRI, B, 4283 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4284 case Intrinsic::amdgcn_workitem_id_y: 4285 return legalizePreloadedArgIntrin(MI, MRI, B, 4286 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4287 case Intrinsic::amdgcn_workitem_id_z: 4288 return legalizePreloadedArgIntrin(MI, MRI, B, 4289 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4290 case Intrinsic::amdgcn_workgroup_id_x: 4291 return legalizePreloadedArgIntrin(MI, MRI, B, 4292 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4293 case Intrinsic::amdgcn_workgroup_id_y: 4294 return legalizePreloadedArgIntrin(MI, MRI, B, 4295 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4296 case Intrinsic::amdgcn_workgroup_id_z: 4297 return legalizePreloadedArgIntrin(MI, MRI, B, 4298 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4299 case Intrinsic::amdgcn_dispatch_ptr: 4300 return legalizePreloadedArgIntrin(MI, MRI, B, 4301 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4302 case Intrinsic::amdgcn_queue_ptr: 4303 return legalizePreloadedArgIntrin(MI, MRI, B, 4304 AMDGPUFunctionArgInfo::QUEUE_PTR); 4305 case Intrinsic::amdgcn_implicit_buffer_ptr: 4306 return legalizePreloadedArgIntrin( 4307 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4308 case Intrinsic::amdgcn_dispatch_id: 4309 return legalizePreloadedArgIntrin(MI, MRI, B, 4310 AMDGPUFunctionArgInfo::DISPATCH_ID); 4311 case Intrinsic::amdgcn_fdiv_fast: 4312 return legalizeFDIVFastIntrin(MI, MRI, B); 4313 case Intrinsic::amdgcn_is_shared: 4314 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4315 case Intrinsic::amdgcn_is_private: 4316 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4317 case Intrinsic::amdgcn_wavefrontsize: { 4318 B.setInstr(MI); 4319 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4320 MI.eraseFromParent(); 4321 return true; 4322 } 4323 case 
Intrinsic::amdgcn_s_buffer_load: 4324 return legalizeSBufferLoad(MI, B, Observer); 4325 case Intrinsic::amdgcn_raw_buffer_store: 4326 case Intrinsic::amdgcn_struct_buffer_store: 4327 return legalizeBufferStore(MI, MRI, B, false, false); 4328 case Intrinsic::amdgcn_raw_buffer_store_format: 4329 case Intrinsic::amdgcn_struct_buffer_store_format: 4330 return legalizeBufferStore(MI, MRI, B, false, true); 4331 case Intrinsic::amdgcn_raw_tbuffer_store: 4332 case Intrinsic::amdgcn_struct_tbuffer_store: 4333 return legalizeBufferStore(MI, MRI, B, true, true); 4334 case Intrinsic::amdgcn_raw_buffer_load: 4335 case Intrinsic::amdgcn_struct_buffer_load: 4336 return legalizeBufferLoad(MI, MRI, B, false, false); 4337 case Intrinsic::amdgcn_raw_buffer_load_format: 4338 case Intrinsic::amdgcn_struct_buffer_load_format: 4339 return legalizeBufferLoad(MI, MRI, B, true, false); 4340 case Intrinsic::amdgcn_raw_tbuffer_load: 4341 case Intrinsic::amdgcn_struct_tbuffer_load: 4342 return legalizeBufferLoad(MI, MRI, B, true, true); 4343 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4344 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4345 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4346 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4347 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4348 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4349 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4350 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4351 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4352 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4353 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4354 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4355 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4356 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4357 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4358 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4359 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4360 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4361 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4362 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4363 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4364 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4365 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4366 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4367 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4368 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4369 return legalizeBufferAtomic(MI, B, IntrID); 4370 case Intrinsic::amdgcn_atomic_inc: 4371 return legalizeAtomicIncDec(MI, B, true); 4372 case Intrinsic::amdgcn_atomic_dec: 4373 return legalizeAtomicIncDec(MI, B, false); 4374 case Intrinsic::trap: 4375 return legalizeTrapIntrinsic(MI, MRI, B); 4376 case Intrinsic::debugtrap: 4377 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4378 default: { 4379 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4380 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4381 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4382 return true; 4383 } 4384 } 4385 4386 return true; 4387 } 4388