//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
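
// For example, under fewerEltsToSize64Vector a v4s32 (128 bits) gives
// Pieces = 2 and NewNumElts = 2, so it is broken into v2s32 pieces; a v3s32
// (96 bits) also rounds to Pieces = 2 and becomes v2s32.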

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }
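
  // For example, with the rules above an s8 or s16 G_ADD on a subtarget
  // without 16-bit instructions is widened to s32 by clampScalar, while a
  // v2s16 G_ADD stays legal only on subtargets with VOP3P instructions.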

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };
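
  // For example, these limits keep a 512-bit (dwordx16) constant or global
  // load intact, while stores in those address spaces are capped at 128 bits
  // and are broken up by the splitting rules below.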

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
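
// Note: the aperture is used as the high half of the 64-bit flat address when
// casting a 32-bit local/private pointer to flat (see legalizeAddrSpaceCast
// below). It is read from the aperture registers when the subtarget has them,
// and otherwise loaded from the amd_queue_t structure via the queue pointer.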
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);
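
  // C1 is 2^52, the magnitude at which f64 spacing reaches 1.0: adding and
  // then subtracting copysign(2^52, Src) rounds Src to an integer. C2 is the
  // largest f64 below 2^52; anything with a larger magnitude is already an
  // integer and is passed through unchanged by the select below.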

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  // ubfe operands: source, bit offset, field width.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
1673 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1674 1675 auto Shr = B.buildAShr(S64, FractMask, Exp); 1676 auto Not = B.buildNot(S64, Shr); 1677 auto Tmp0 = B.buildAnd(S64, Src, Not); 1678 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1679 1680 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1681 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1682 1683 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1684 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1685 return true; 1686 } 1687 1688 bool AMDGPULegalizerInfo::legalizeITOFP( 1689 MachineInstr &MI, MachineRegisterInfo &MRI, 1690 MachineIRBuilder &B, bool Signed) const { 1691 B.setInstr(MI); 1692 1693 Register Dst = MI.getOperand(0).getReg(); 1694 Register Src = MI.getOperand(1).getReg(); 1695 1696 const LLT S64 = LLT::scalar(64); 1697 const LLT S32 = LLT::scalar(32); 1698 1699 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1700 1701 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1702 1703 auto CvtHi = Signed ? 1704 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1705 B.buildUITOFP(S64, Unmerge.getReg(1)); 1706 1707 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1708 1709 auto ThirtyTwo = B.buildConstant(S32, 32); 1710 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1711 .addUse(CvtHi.getReg(0)) 1712 .addUse(ThirtyTwo.getReg(0)); 1713 1714 // TODO: Should this propagate fast-math-flags? 1715 B.buildFAdd(Dst, LdExp, CvtLo); 1716 MI.eraseFromParent(); 1717 return true; 1718 } 1719 1720 // TODO: Copied from DAG implementation. Verify logic and document how this 1721 // actually works. 1722 bool AMDGPULegalizerInfo::legalizeFPTOI( 1723 MachineInstr &MI, MachineRegisterInfo &MRI, 1724 MachineIRBuilder &B, bool Signed) const { 1725 B.setInstr(MI); 1726 1727 Register Dst = MI.getOperand(0).getReg(); 1728 Register Src = MI.getOperand(1).getReg(); 1729 1730 const LLT S64 = LLT::scalar(64); 1731 const LLT S32 = LLT::scalar(32); 1732 1733 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1734 1735 unsigned Flags = MI.getFlags(); 1736 1737 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1738 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1739 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1740 1741 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1742 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1743 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1744 1745 auto Hi = Signed ? 
1746 B.buildFPTOSI(S32, FloorMul) : 1747 B.buildFPTOUI(S32, FloorMul); 1748 auto Lo = B.buildFPTOUI(S32, Fma); 1749 1750 B.buildMerge(Dst, { Lo, Hi }); 1751 MI.eraseFromParent(); 1752 1753 return true; 1754 } 1755 1756 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1757 MachineInstr &MI, MachineRegisterInfo &MRI, 1758 MachineIRBuilder &B) const { 1759 MachineFunction &MF = B.getMF(); 1760 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1761 1762 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1763 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1764 1765 // With ieee_mode disabled, the instructions have the correct behavior 1766 // already for G_FMINNUM/G_FMAXNUM 1767 if (!MFI->getMode().IEEE) 1768 return !IsIEEEOp; 1769 1770 if (IsIEEEOp) 1771 return true; 1772 1773 MachineIRBuilder HelperBuilder(MI); 1774 GISelObserverWrapper DummyObserver; 1775 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1776 HelperBuilder.setInstr(MI); 1777 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1778 } 1779 1780 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1781 MachineInstr &MI, MachineRegisterInfo &MRI, 1782 MachineIRBuilder &B) const { 1783 // TODO: Should move some of this into LegalizerHelper. 1784 1785 // TODO: Promote dynamic indexing of s16 to s32 1786 1787 // FIXME: Artifact combiner probably should have replaced the truncated 1788 // constant before this, so we shouldn't need 1789 // getConstantVRegValWithLookThrough. 1790 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1791 MI.getOperand(2).getReg(), MRI); 1792 if (!IdxVal) // Dynamic case will be selected to register indexing. 1793 return true; 1794 1795 Register Dst = MI.getOperand(0).getReg(); 1796 Register Vec = MI.getOperand(1).getReg(); 1797 1798 LLT VecTy = MRI.getType(Vec); 1799 LLT EltTy = VecTy.getElementType(); 1800 assert(EltTy == MRI.getType(Dst)); 1801 1802 B.setInstr(MI); 1803 1804 if (IdxVal->Value < VecTy.getNumElements()) 1805 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1806 else 1807 B.buildUndef(Dst); 1808 1809 MI.eraseFromParent(); 1810 return true; 1811 } 1812 1813 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1814 MachineInstr &MI, MachineRegisterInfo &MRI, 1815 MachineIRBuilder &B) const { 1816 // TODO: Should move some of this into LegalizerHelper. 1817 1818 // TODO: Promote dynamic indexing of s16 to s32 1819 1820 // FIXME: Artifact combiner probably should have replaced the truncated 1821 // constant before this, so we shouldn't need 1822 // getConstantVRegValWithLookThrough. 1823 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1824 MI.getOperand(3).getReg(), MRI); 1825 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1826 return true; 1827 1828 Register Dst = MI.getOperand(0).getReg(); 1829 Register Vec = MI.getOperand(1).getReg(); 1830 Register Ins = MI.getOperand(2).getReg(); 1831 1832 LLT VecTy = MRI.getType(Vec); 1833 LLT EltTy = VecTy.getElementType(); 1834 assert(EltTy == MRI.getType(Ins)); 1835 1836 B.setInstr(MI); 1837 1838 if (IdxVal->Value < VecTy.getNumElements()) 1839 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1840 else 1841 B.buildUndef(Dst); 1842 1843 MI.eraseFromParent(); 1844 return true; 1845 } 1846 1847 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1848 MachineInstr &MI, MachineRegisterInfo &MRI, 1849 MachineIRBuilder &B) const { 1850 const LLT V2S16 = LLT::vector(2, 16); 1851 1852 Register Dst = MI.getOperand(0).getReg(); 1853 Register Src0 = MI.getOperand(1).getReg(); 1854 LLT DstTy = MRI.getType(Dst); 1855 LLT SrcTy = MRI.getType(Src0); 1856 1857 if (SrcTy == V2S16 && DstTy == V2S16 && 1858 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1859 return true; 1860 1861 MachineIRBuilder HelperBuilder(MI); 1862 GISelObserverWrapper DummyObserver; 1863 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1864 HelperBuilder.setInstr(MI); 1865 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1866 } 1867 1868 bool AMDGPULegalizerInfo::legalizeSinCos( 1869 MachineInstr &MI, MachineRegisterInfo &MRI, 1870 MachineIRBuilder &B) const { 1871 B.setInstr(MI); 1872 1873 Register DstReg = MI.getOperand(0).getReg(); 1874 Register SrcReg = MI.getOperand(1).getReg(); 1875 LLT Ty = MRI.getType(DstReg); 1876 unsigned Flags = MI.getFlags(); 1877 1878 Register TrigVal; 1879 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1880 if (ST.hasTrigReducedRange()) { 1881 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1882 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1883 .addUse(MulVal.getReg(0)) 1884 .setMIFlags(Flags).getReg(0); 1885 } else 1886 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1887 1888 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1889 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1890 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1891 .addUse(TrigVal) 1892 .setMIFlags(Flags); 1893 MI.eraseFromParent(); 1894 return true; 1895 } 1896 1897 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1898 Register DstReg, LLT PtrTy, 1899 MachineIRBuilder &B, const GlobalValue *GV, 1900 unsigned Offset, unsigned GAFlags) const { 1901 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1902 // to the following code sequence: 1903 // 1904 // For constant address space: 1905 // s_getpc_b64 s[0:1] 1906 // s_add_u32 s0, s0, $symbol 1907 // s_addc_u32 s1, s1, 0 1908 // 1909 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1910 // a fixup or relocation is emitted to replace $symbol with a literal 1911 // constant, which is a pc-relative offset from the encoding of the $symbol 1912 // operand to the global variable. 
1913 // 1914 // For global address space: 1915 // s_getpc_b64 s[0:1] 1916 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1917 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1918 // 1919 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1920 // fixups or relocations are emitted to replace $symbol@*@lo and 1921 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1922 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1923 // operand to the global variable. 1924 // 1925 // What we want here is an offset from the value returned by s_getpc 1926 // (which is the address of the s_add_u32 instruction) to the global 1927 // variable, but since the encoding of $symbol starts 4 bytes after the start 1928 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1929 // small. This requires us to add 4 to the global variable offset in order to 1930 // compute the correct address. 1931 1932 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1933 1934 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1935 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1936 1937 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1938 .addDef(PCReg); 1939 1940 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1941 if (GAFlags == SIInstrInfo::MO_NONE) 1942 MIB.addImm(0); 1943 else 1944 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1945 1946 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1947 1948 if (PtrTy.getSizeInBits() == 32) 1949 B.buildExtract(DstReg, PCReg, 0); 1950 return true; 1951 } 1952 1953 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1954 MachineInstr &MI, MachineRegisterInfo &MRI, 1955 MachineIRBuilder &B) const { 1956 Register DstReg = MI.getOperand(0).getReg(); 1957 LLT Ty = MRI.getType(DstReg); 1958 unsigned AS = Ty.getAddressSpace(); 1959 1960 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1961 MachineFunction &MF = B.getMF(); 1962 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1963 B.setInstr(MI); 1964 1965 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1966 if (!MFI->isEntryFunction()) { 1967 const Function &Fn = MF.getFunction(); 1968 DiagnosticInfoUnsupported BadLDSDecl( 1969 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1970 DS_Warning); 1971 Fn.getContext().diagnose(BadLDSDecl); 1972 1973 // We currently don't have a way to correctly allocate LDS objects that 1974 // aren't directly associated with a kernel. We do force inlining of 1975 // functions that use local objects. However, if these dead functions are 1976 // not eliminated, we don't want a compile time error. Just emit a warning 1977 // and a trap, since there should be no callable path here. 1978 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1979 B.buildUndef(DstReg); 1980 MI.eraseFromParent(); 1981 return true; 1982 } 1983 1984 // TODO: We could emit code to handle the initialization somewhere. 
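    // For an LDS global without an initializer we either fold the address to
    // its statically assigned offset below, or leave the G_GLOBAL_VALUE in
    // place with an absolute 32-bit low relocation when a constant address is
    // not wanted for it.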
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
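  // G_FMAD is only kept legal here when the corresponding denormal mode
  // flushes; otherwise fall through to the generic fmul + fadd expansion.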
2063 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2064 return true; 2065 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2066 return true; 2067 2068 MachineIRBuilder HelperBuilder(MI); 2069 GISelObserverWrapper DummyObserver; 2070 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2071 HelperBuilder.setInstr(MI); 2072 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2073 } 2074 2075 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2076 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2077 Register DstReg = MI.getOperand(0).getReg(); 2078 Register PtrReg = MI.getOperand(1).getReg(); 2079 Register CmpVal = MI.getOperand(2).getReg(); 2080 Register NewVal = MI.getOperand(3).getReg(); 2081 2082 assert(SITargetLowering::isFlatGlobalAddrSpace( 2083 MRI.getType(PtrReg).getAddressSpace()) && 2084 "this should not have been custom lowered"); 2085 2086 LLT ValTy = MRI.getType(CmpVal); 2087 LLT VecTy = LLT::vector(2, ValTy); 2088 2089 B.setInstr(MI); 2090 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2091 2092 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2093 .addDef(DstReg) 2094 .addUse(PtrReg) 2095 .addUse(PackedVal) 2096 .setMemRefs(MI.memoperands()); 2097 2098 MI.eraseFromParent(); 2099 return true; 2100 } 2101 2102 bool AMDGPULegalizerInfo::legalizeFlog( 2103 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2104 Register Dst = MI.getOperand(0).getReg(); 2105 Register Src = MI.getOperand(1).getReg(); 2106 LLT Ty = B.getMRI()->getType(Dst); 2107 unsigned Flags = MI.getFlags(); 2108 B.setInstr(MI); 2109 2110 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2111 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2112 2113 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2114 MI.eraseFromParent(); 2115 return true; 2116 } 2117 2118 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2119 MachineIRBuilder &B) const { 2120 Register Dst = MI.getOperand(0).getReg(); 2121 Register Src = MI.getOperand(1).getReg(); 2122 unsigned Flags = MI.getFlags(); 2123 LLT Ty = B.getMRI()->getType(Dst); 2124 B.setInstr(MI); 2125 2126 auto K = B.buildFConstant(Ty, numbers::log2e); 2127 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2128 B.buildFExp2(Dst, Mul, Flags); 2129 MI.eraseFromParent(); 2130 return true; 2131 } 2132 2133 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2134 MachineIRBuilder &B) const { 2135 Register Dst = MI.getOperand(0).getReg(); 2136 Register Src0 = MI.getOperand(1).getReg(); 2137 Register Src1 = MI.getOperand(2).getReg(); 2138 unsigned Flags = MI.getFlags(); 2139 LLT Ty = B.getMRI()->getType(Dst); 2140 B.setInstr(MI); 2141 const LLT S16 = LLT::scalar(16); 2142 const LLT S32 = LLT::scalar(32); 2143 2144 if (Ty == S32) { 2145 auto Log = B.buildFLog2(S32, Src0, Flags); 2146 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2147 .addUse(Log.getReg(0)) 2148 .addUse(Src1) 2149 .setMIFlags(Flags); 2150 B.buildFExp2(Dst, Mul, Flags); 2151 } else if (Ty == S16) { 2152 // There's no f16 fmul_legacy, so we need to convert for it. 
2153 auto Log = B.buildFLog2(S16, Src0, Flags); 2154 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2155 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2156 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2157 .addUse(Ext0.getReg(0)) 2158 .addUse(Ext1.getReg(0)) 2159 .setMIFlags(Flags); 2160 2161 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2162 } else 2163 return false; 2164 2165 MI.eraseFromParent(); 2166 return true; 2167 } 2168 2169 // Find a source register, ignoring any possible source modifiers. 2170 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2171 Register ModSrc = OrigSrc; 2172 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2173 ModSrc = SrcFNeg->getOperand(1).getReg(); 2174 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2175 ModSrc = SrcFAbs->getOperand(1).getReg(); 2176 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2177 ModSrc = SrcFAbs->getOperand(1).getReg(); 2178 return ModSrc; 2179 } 2180 2181 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2182 MachineRegisterInfo &MRI, 2183 MachineIRBuilder &B) const { 2184 B.setInstr(MI); 2185 2186 const LLT S1 = LLT::scalar(1); 2187 const LLT S64 = LLT::scalar(64); 2188 Register Dst = MI.getOperand(0).getReg(); 2189 Register OrigSrc = MI.getOperand(1).getReg(); 2190 unsigned Flags = MI.getFlags(); 2191 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2192 "this should not have been custom lowered"); 2193 2194 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2195 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2196 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2197 // V_FRACT bug is: 2198 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2199 // 2200 // Convert floor(x) to (x - fract(x)) 2201 2202 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2203 .addUse(OrigSrc) 2204 .setMIFlags(Flags); 2205 2206 // Give source modifier matching some assistance before obscuring a foldable 2207 // pattern. 2208 2209 // TODO: We can avoid the neg on the fract? The input sign to fract 2210 // shouldn't matter? 2211 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2212 2213 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2214 2215 Register Min = MRI.createGenericVirtualRegister(S64); 2216 2217 // We don't need to concern ourselves with the snan handling difference, so 2218 // use the one which will directly select. 2219 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2220 if (MFI->getMode().IEEE) 2221 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2222 else 2223 B.buildFMinNum(Min, Fract, Const, Flags); 2224 2225 Register CorrectedFract = Min; 2226 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2227 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2228 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2229 } 2230 2231 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2232 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2233 2234 MI.eraseFromParent(); 2235 return true; 2236 } 2237 2238 // Turn an illegal packed v2s16 build vector into bit operations. 2239 // TODO: This should probably be a bitcast action in LegalizerHelper. 
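// A sketch of the rewrite this performs (register names are illustrative):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)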
2240 bool AMDGPULegalizerInfo::legalizeBuildVector( 2241 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2242 Register Dst = MI.getOperand(0).getReg(); 2243 const LLT S32 = LLT::scalar(32); 2244 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2245 2246 Register Src0 = MI.getOperand(1).getReg(); 2247 Register Src1 = MI.getOperand(2).getReg(); 2248 assert(MRI.getType(Src0) == LLT::scalar(16)); 2249 2250 B.setInstr(MI); 2251 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2252 B.buildBitcast(Dst, Merge); 2253 2254 MI.eraseFromParent(); 2255 return true; 2256 } 2257 2258 // Return the use branch instruction, otherwise null if the usage is invalid. 2259 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2260 MachineRegisterInfo &MRI, 2261 MachineInstr *&Br, 2262 MachineBasicBlock *&UncondBrTarget) { 2263 Register CondDef = MI.getOperand(0).getReg(); 2264 if (!MRI.hasOneNonDBGUse(CondDef)) 2265 return nullptr; 2266 2267 MachineBasicBlock *Parent = MI.getParent(); 2268 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2269 if (UseMI.getParent() != Parent || 2270 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2271 return nullptr; 2272 2273 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2274 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2275 if (Next == Parent->end()) { 2276 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2277 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2278 return nullptr; 2279 UncondBrTarget = &*NextMBB; 2280 } else { 2281 if (Next->getOpcode() != AMDGPU::G_BR) 2282 return nullptr; 2283 Br = &*Next; 2284 UncondBrTarget = Br->getOperand(0).getMBB(); 2285 } 2286 2287 return &UseMI; 2288 } 2289 2290 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2291 MachineRegisterInfo &MRI, 2292 Register LiveIn, 2293 Register PhyReg) const { 2294 assert(PhyReg.isPhysical() && "Physical register expected"); 2295 2296 // Insert the live-in copy, if required, by defining destination virtual 2297 // register. 2298 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), a live-in copy from the physical to the
  // virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
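    // Masked arguments (such as the packed workitem IDs) are decoded by
    // shifting right by the mask's trailing zero count and then ANDing with
    // the shifted mask.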
2366 const LLT S32 = LLT::scalar(32); 2367 const unsigned Mask = Arg->getMask(); 2368 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2369 2370 Register AndMaskSrc = LiveIn; 2371 2372 if (Shift != 0) { 2373 auto ShiftAmt = B.buildConstant(S32, Shift); 2374 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2375 } 2376 2377 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2378 } else { 2379 B.buildCopy(DstReg, LiveIn); 2380 } 2381 2382 return true; 2383 } 2384 2385 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2386 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2387 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2388 B.setInstr(MI); 2389 2390 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2391 if (!Arg) 2392 return false; 2393 2394 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2395 return false; 2396 2397 MI.eraseFromParent(); 2398 return true; 2399 } 2400 2401 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2402 MachineRegisterInfo &MRI, 2403 MachineIRBuilder &B) const { 2404 B.setInstr(MI); 2405 Register Dst = MI.getOperand(0).getReg(); 2406 LLT DstTy = MRI.getType(Dst); 2407 LLT S16 = LLT::scalar(16); 2408 LLT S32 = LLT::scalar(32); 2409 LLT S64 = LLT::scalar(64); 2410 2411 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2412 return true; 2413 2414 if (DstTy == S16) 2415 return legalizeFDIV16(MI, MRI, B); 2416 if (DstTy == S32) 2417 return legalizeFDIV32(MI, MRI, B); 2418 if (DstTy == S64) 2419 return legalizeFDIV64(MI, MRI, B); 2420 2421 return false; 2422 } 2423 2424 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2425 const LLT S32 = LLT::scalar(32); 2426 2427 auto Cvt0 = B.buildUITOFP(S32, Src); 2428 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2429 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2430 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2431 return B.buildFPTOUI(S32, Mul).getReg(0); 2432 } 2433 2434 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2435 Register DstReg, 2436 Register Num, 2437 Register Den, 2438 bool IsRem) const { 2439 const LLT S1 = LLT::scalar(1); 2440 const LLT S32 = LLT::scalar(32); 2441 2442 // RCP = URECIP(Den) = 2^32 / Den + e 2443 // e is rounding error. 2444 auto RCP = buildDivRCP(B, Den); 2445 2446 // RCP_LO = mul(RCP, Den) 2447 auto RCP_LO = B.buildMul(S32, RCP, Den); 2448 2449 // RCP_HI = mulhu (RCP, Den) */ 2450 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2451 2452 // NEG_RCP_LO = -RCP_LO 2453 auto Zero = B.buildConstant(S32, 0); 2454 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2455 2456 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2457 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2458 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2459 2460 // Calculate the rounding error from the URECIP instruction 2461 // E = mulhu(ABS_RCP_LO, RCP) 2462 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2463 2464 // RCP_A_E = RCP + E 2465 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2466 2467 // RCP_S_E = RCP - E 2468 auto RCP_S_E = B.buildSub(S32, RCP, E); 2469 2470 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
    B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1,
B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2568 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2569 2570 // -(2**32) 2571 auto Mad2 = B.buildFMAD(S32, Trunc, 2572 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2573 2574 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2575 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2576 2577 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2578 } 2579 2580 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2581 MachineRegisterInfo &MRI, 2582 MachineIRBuilder &B) const { 2583 B.setInstr(MI); 2584 2585 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2586 const LLT S32 = LLT::scalar(32); 2587 const LLT S64 = LLT::scalar(64); 2588 const LLT S1 = LLT::scalar(1); 2589 Register Numer = MI.getOperand(1).getReg(); 2590 Register Denom = MI.getOperand(2).getReg(); 2591 Register RcpLo, RcpHi; 2592 2593 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2594 2595 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2596 2597 auto Zero64 = B.buildConstant(S64, 0); 2598 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2599 2600 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2601 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2602 2603 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2604 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2605 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2606 2607 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2608 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2609 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2610 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2611 2612 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2613 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2614 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2615 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2616 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2617 2618 auto Zero32 = B.buildConstant(S32, 0); 2619 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2620 auto Add2_HiC = 2621 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2622 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2623 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2624 2625 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2626 Register NumerLo = UnmergeNumer.getReg(0); 2627 Register NumerHi = UnmergeNumer.getReg(1); 2628 2629 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2630 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2631 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2632 Register Mul3_Lo = UnmergeMul3.getReg(0); 2633 Register Mul3_Hi = UnmergeMul3.getReg(1); 2634 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2635 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2636 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2637 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2638 2639 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2640 Register DenomLo = UnmergeDenom.getReg(0); 2641 Register DenomHi = UnmergeDenom.getReg(1); 2642 2643 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2644 auto C1 = B.buildSExt(S32, CmpHi); 2645 2646 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2647 auto C2 = B.buildSExt(S32, CmpLo); 2648 2649 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2650 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2651 2652 // TODO: Here and below portions of the code can be enclosed into if/endif. 
// Currently control flow is unconditional and we have 4 selects after
// potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeUDIV_UREM64(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);

  if (IsRem) {
    auto RSign = LHSign; // Remainder sign is the same as LHS
    UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
    B.buildSub(DstReg, UDivRem, RSign);
  } else {
    auto DSign = B.buildXor(S32, LHSign, RHSign);
    UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
    B.buildSub(DstReg, UDivRem, DSign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if
(MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2752 return legalizeSDIV_SREM32(MI, MRI, B); 2753 return false; 2754 } 2755 2756 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2757 MachineRegisterInfo &MRI, 2758 MachineIRBuilder &B) const { 2759 Register Res = MI.getOperand(0).getReg(); 2760 Register LHS = MI.getOperand(1).getReg(); 2761 Register RHS = MI.getOperand(2).getReg(); 2762 2763 uint16_t Flags = MI.getFlags(); 2764 2765 LLT ResTy = MRI.getType(Res); 2766 LLT S32 = LLT::scalar(32); 2767 LLT S64 = LLT::scalar(64); 2768 2769 const MachineFunction &MF = B.getMF(); 2770 bool Unsafe = 2771 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2772 2773 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2774 return false; 2775 2776 if (!Unsafe && ResTy == S32 && 2777 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2778 return false; 2779 2780 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2781 // 1 / x -> RCP(x) 2782 if (CLHS->isExactlyValue(1.0)) { 2783 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2784 .addUse(RHS) 2785 .setMIFlags(Flags); 2786 2787 MI.eraseFromParent(); 2788 return true; 2789 } 2790 2791 // -1 / x -> RCP( FNEG(x) ) 2792 if (CLHS->isExactlyValue(-1.0)) { 2793 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2794 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2795 .addUse(FNeg.getReg(0)) 2796 .setMIFlags(Flags); 2797 2798 MI.eraseFromParent(); 2799 return true; 2800 } 2801 } 2802 2803 // x / y -> x * (1.0 / y) 2804 if (Unsafe) { 2805 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2806 .addUse(RHS) 2807 .setMIFlags(Flags); 2808 B.buildFMul(Res, LHS, RCP, Flags); 2809 2810 MI.eraseFromParent(); 2811 return true; 2812 } 2813 2814 return false; 2815 } 2816 2817 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2818 MachineRegisterInfo &MRI, 2819 MachineIRBuilder &B) const { 2820 B.setInstr(MI); 2821 Register Res = MI.getOperand(0).getReg(); 2822 Register LHS = MI.getOperand(1).getReg(); 2823 Register RHS = MI.getOperand(2).getReg(); 2824 2825 uint16_t Flags = MI.getFlags(); 2826 2827 LLT S16 = LLT::scalar(16); 2828 LLT S32 = LLT::scalar(32); 2829 2830 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2831 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2832 2833 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2834 .addUse(RHSExt.getReg(0)) 2835 .setMIFlags(Flags); 2836 2837 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2838 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2839 2840 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2841 .addUse(RDst.getReg(0)) 2842 .addUse(RHS) 2843 .addUse(LHS) 2844 .setMIFlags(Flags); 2845 2846 MI.eraseFromParent(); 2847 return true; 2848 } 2849 2850 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2851 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2852 static void toggleSPDenormMode(bool Enable, 2853 MachineIRBuilder &B, 2854 const GCNSubtarget &ST, 2855 AMDGPU::SIModeRegisterDefaults Mode) { 2856 // Set SP denorm mode to this value. 2857 unsigned SPDenormMode = 2858 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2859 2860 if (ST.hasDenormModeInst()) { 2861 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
2862 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2863 2864 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2865 B.buildInstr(AMDGPU::S_DENORM_MODE) 2866 .addImm(NewDenormModeValue); 2867 2868 } else { 2869 // Select FP32 bit field in mode register. 2870 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2871 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2872 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2873 2874 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2875 .addImm(SPDenormMode) 2876 .addImm(SPDenormModeBitField); 2877 } 2878 } 2879 2880 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2881 MachineRegisterInfo &MRI, 2882 MachineIRBuilder &B) const { 2883 B.setInstr(MI); 2884 Register Res = MI.getOperand(0).getReg(); 2885 Register LHS = MI.getOperand(1).getReg(); 2886 Register RHS = MI.getOperand(2).getReg(); 2887 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2888 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2889 2890 uint16_t Flags = MI.getFlags(); 2891 2892 LLT S32 = LLT::scalar(32); 2893 LLT S1 = LLT::scalar(1); 2894 2895 auto One = B.buildFConstant(S32, 1.0f); 2896 2897 auto DenominatorScaled = 2898 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2899 .addUse(LHS) 2900 .addUse(RHS) 2901 .addImm(0) 2902 .setMIFlags(Flags); 2903 auto NumeratorScaled = 2904 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2905 .addUse(LHS) 2906 .addUse(RHS) 2907 .addImm(1) 2908 .setMIFlags(Flags); 2909 2910 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2911 .addUse(DenominatorScaled.getReg(0)) 2912 .setMIFlags(Flags); 2913 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2914 2915 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2916 // aren't modeled as reading it. 
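  // If FP32 denormals are flushed by default, temporarily enable them around
  // the refinement sequence below, presumably so the intermediate FMA results
  // feeding div_fmas are not flushed.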
2917 if (!Mode.allFP32Denormals()) 2918 toggleSPDenormMode(true, B, ST, Mode); 2919 2920 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2921 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2922 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2923 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2924 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2925 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2926 2927 if (!Mode.allFP32Denormals()) 2928 toggleSPDenormMode(false, B, ST, Mode); 2929 2930 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2931 .addUse(Fma4.getReg(0)) 2932 .addUse(Fma1.getReg(0)) 2933 .addUse(Fma3.getReg(0)) 2934 .addUse(NumeratorScaled.getReg(1)) 2935 .setMIFlags(Flags); 2936 2937 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2938 .addUse(Fmas.getReg(0)) 2939 .addUse(RHS) 2940 .addUse(LHS) 2941 .setMIFlags(Flags); 2942 2943 MI.eraseFromParent(); 2944 return true; 2945 } 2946 2947 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2948 MachineRegisterInfo &MRI, 2949 MachineIRBuilder &B) const { 2950 B.setInstr(MI); 2951 Register Res = MI.getOperand(0).getReg(); 2952 Register LHS = MI.getOperand(1).getReg(); 2953 Register RHS = MI.getOperand(2).getReg(); 2954 2955 uint16_t Flags = MI.getFlags(); 2956 2957 LLT S64 = LLT::scalar(64); 2958 LLT S1 = LLT::scalar(1); 2959 2960 auto One = B.buildFConstant(S64, 1.0); 2961 2962 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2963 .addUse(LHS) 2964 .addUse(RHS) 2965 .addImm(0) 2966 .setMIFlags(Flags); 2967 2968 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2969 2970 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2971 .addUse(DivScale0.getReg(0)) 2972 .setMIFlags(Flags); 2973 2974 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2975 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2976 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2977 2978 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2979 .addUse(LHS) 2980 .addUse(RHS) 2981 .addImm(1) 2982 .setMIFlags(Flags); 2983 2984 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2985 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2986 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2987 2988 Register Scale; 2989 if (!ST.hasUsableDivScaleConditionOutput()) { 2990 // Workaround a hardware bug on SI where the condition output from div_scale 2991 // is not usable. 
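    // Reconstruct the missing condition by comparing the high halves of the
    // numerator and denominator against the high halves of the div_scale
    // results to see which operand was scaled, then combine the two tests.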
2992 2993 LLT S32 = LLT::scalar(32); 2994 2995 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2996 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2997 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2998 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2999 3000 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3001 Scale1Unmerge.getReg(1)); 3002 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3003 Scale0Unmerge.getReg(1)); 3004 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3005 } else { 3006 Scale = DivScale1.getReg(1); 3007 } 3008 3009 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3010 .addUse(Fma4.getReg(0)) 3011 .addUse(Fma3.getReg(0)) 3012 .addUse(Mul.getReg(0)) 3013 .addUse(Scale) 3014 .setMIFlags(Flags); 3015 3016 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3017 .addUse(Fmas.getReg(0)) 3018 .addUse(RHS) 3019 .addUse(LHS) 3020 .setMIFlags(Flags); 3021 3022 MI.eraseFromParent(); 3023 return true; 3024 } 3025 3026 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3027 MachineRegisterInfo &MRI, 3028 MachineIRBuilder &B) const { 3029 B.setInstr(MI); 3030 Register Res = MI.getOperand(0).getReg(); 3031 Register LHS = MI.getOperand(2).getReg(); 3032 Register RHS = MI.getOperand(3).getReg(); 3033 uint16_t Flags = MI.getFlags(); 3034 3035 LLT S32 = LLT::scalar(32); 3036 LLT S1 = LLT::scalar(1); 3037 3038 auto Abs = B.buildFAbs(S32, RHS, Flags); 3039 const APFloat C0Val(1.0f); 3040 3041 auto C0 = B.buildConstant(S32, 0x6f800000); 3042 auto C1 = B.buildConstant(S32, 0x2f800000); 3043 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3044 3045 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3046 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3047 3048 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3049 3050 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3051 .addUse(Mul0.getReg(0)) 3052 .setMIFlags(Flags); 3053 3054 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3055 3056 B.buildFMul(Res, Sel, Mul1, Flags); 3057 3058 MI.eraseFromParent(); 3059 return true; 3060 } 3061 3062 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3063 MachineRegisterInfo &MRI, 3064 MachineIRBuilder &B) const { 3065 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3066 if (!MFI->isEntryFunction()) { 3067 return legalizePreloadedArgIntrin(MI, MRI, B, 3068 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3069 } 3070 3071 B.setInstr(MI); 3072 3073 uint64_t Offset = 3074 ST.getTargetLowering()->getImplicitParameterOffset( 3075 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3076 Register DstReg = MI.getOperand(0).getReg(); 3077 LLT DstTy = MRI.getType(DstReg); 3078 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3079 3080 const ArgDescriptor *Arg; 3081 const TargetRegisterClass *RC; 3082 std::tie(Arg, RC) 3083 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3084 if (!Arg) 3085 return false; 3086 3087 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3088 if (!loadInputValue(KernargPtrReg, B, Arg)) 3089 return false; 3090 3091 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3092 MI.eraseFromParent(); 3093 return true; 3094 } 3095 3096 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3097 MachineRegisterInfo &MRI, 3098 MachineIRBuilder &B, 3099 unsigned AddrSpace) const { 3100 B.setInstr(MI); 3101 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 3102 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3103 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3104 MI.eraseFromParent(); 3105 return true; 3106 } 3107 3108 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3109 // offset (the offset that is included in bounds checking and swizzling, to be 3110 // split between the instruction's voffset and immoffset fields) and soffset 3111 // (the offset that is excluded from bounds checking and swizzling, to go in 3112 // the instruction's soffset field). This function takes the first kind of 3113 // offset and figures out how to split it between voffset and immoffset. 3114 std::tuple<Register, unsigned, unsigned> 3115 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3116 Register OrigOffset) const { 3117 const unsigned MaxImm = 4095; 3118 Register BaseReg; 3119 unsigned TotalConstOffset; 3120 MachineInstr *OffsetDef; 3121 const LLT S32 = LLT::scalar(32); 3122 3123 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3124 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3125 3126 unsigned ImmOffset = TotalConstOffset; 3127 3128 // If the immediate value is too big for the immoffset field, put the value 3129 // and -4096 into the immoffset field so that the value that is copied/added 3130 // for the voffset field is a multiple of 4096, and it stands more chance 3131 // of being CSEd with the copy/add for another similar load/store. 3132 // However, do not do that rounding down to a multiple of 4096 if that is a 3133 // negative number, as it appears to be illegal to have a negative offset 3134 // in the vgpr, even if adding the immediate offset makes it positive. 3135 unsigned Overflow = ImmOffset & ~MaxImm; 3136 ImmOffset -= Overflow; 3137 if ((int32_t)Overflow < 0) { 3138 Overflow += ImmOffset; 3139 ImmOffset = 0; 3140 } 3141 3142 if (Overflow != 0) { 3143 if (!BaseReg) { 3144 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3145 } else { 3146 auto OverflowVal = B.buildConstant(S32, Overflow); 3147 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3148 } 3149 } 3150 3151 if (!BaseReg) 3152 BaseReg = B.buildConstant(S32, 0).getReg(0); 3153 3154 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3155 } 3156 3157 /// Handle register layout difference for f16 images for some subtargets. 3158 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3159 MachineRegisterInfo &MRI, 3160 Register Reg) const { 3161 if (!ST.hasUnpackedD16VMem()) 3162 return Reg; 3163 3164 const LLT S16 = LLT::scalar(16); 3165 const LLT S32 = LLT::scalar(32); 3166 LLT StoreVT = MRI.getType(Reg); 3167 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3168 3169 auto Unmerge = B.buildUnmerge(S16, Reg); 3170 3171 SmallVector<Register, 4> WideRegs; 3172 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3173 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3174 3175 int NumElts = StoreVT.getNumElements(); 3176 3177 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3178 } 3179 3180 Register AMDGPULegalizerInfo::fixStoreSourceType( 3181 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3182 MachineRegisterInfo *MRI = B.getMRI(); 3183 LLT Ty = MRI->getType(VData); 3184 3185 const LLT S16 = LLT::scalar(16); 3186 3187 // Fixup illegal register types for i8 stores. 
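  // 8 and 16-bit scalar stores are widened to 32 bits below; small s16
  // vectors may instead need the unpacked D16 layout from handleD16VData().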
3188 if (Ty == LLT::scalar(8) || Ty == S16) { 3189 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3190 return AnyExt; 3191 } 3192 3193 if (Ty.isVector()) { 3194 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3195 if (IsFormat) 3196 return handleD16VData(B, *MRI, VData); 3197 } 3198 } 3199 3200 return VData; 3201 } 3202 3203 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3204 MachineRegisterInfo &MRI, 3205 MachineIRBuilder &B, 3206 bool IsTyped, 3207 bool IsFormat) const { 3208 B.setInstr(MI); 3209 3210 Register VData = MI.getOperand(1).getReg(); 3211 LLT Ty = MRI.getType(VData); 3212 LLT EltTy = Ty.getScalarType(); 3213 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3214 const LLT S32 = LLT::scalar(32); 3215 3216 VData = fixStoreSourceType(B, VData, IsFormat); 3217 Register RSrc = MI.getOperand(2).getReg(); 3218 3219 MachineMemOperand *MMO = *MI.memoperands_begin(); 3220 const int MemSize = MMO->getSize(); 3221 3222 unsigned ImmOffset; 3223 unsigned TotalOffset; 3224 3225 // The typed intrinsics add an immediate after the registers. 3226 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3227 3228 // The struct intrinsic variants add one additional operand over raw. 3229 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3230 Register VIndex; 3231 int OpOffset = 0; 3232 if (HasVIndex) { 3233 VIndex = MI.getOperand(3).getReg(); 3234 OpOffset = 1; 3235 } 3236 3237 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3238 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3239 3240 unsigned Format = 0; 3241 if (IsTyped) { 3242 Format = MI.getOperand(5 + OpOffset).getImm(); 3243 ++OpOffset; 3244 } 3245 3246 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3247 3248 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3249 if (TotalOffset != 0) 3250 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3251 3252 unsigned Opc; 3253 if (IsTyped) { 3254 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3255 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3256 } else if (IsFormat) { 3257 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3258 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3259 } else { 3260 switch (MemSize) { 3261 case 1: 3262 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3263 break; 3264 case 2: 3265 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3266 break; 3267 default: 3268 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3269 break; 3270 } 3271 } 3272 3273 if (!VIndex) 3274 VIndex = B.buildConstant(S32, 0).getReg(0); 3275 3276 auto MIB = B.buildInstr(Opc) 3277 .addUse(VData) // vdata 3278 .addUse(RSrc) // rsrc 3279 .addUse(VIndex) // vindex 3280 .addUse(VOffset) // voffset 3281 .addUse(SOffset) // soffset 3282 .addImm(ImmOffset); // offset(imm) 3283 3284 if (IsTyped) 3285 MIB.addImm(Format); 3286 3287 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3288 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3289 .addMemOperand(MMO); 3290 3291 MI.eraseFromParent(); 3292 return true; 3293 } 3294 3295 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3296 MachineRegisterInfo &MRI, 3297 MachineIRBuilder &B, 3298 bool IsFormat, 3299 bool IsTyped) const { 3300 B.setInstr(MI); 3301 3302 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the load; narrow it back to the original
    // type.
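    // Illustrative example: an unpacked-d16 <2 x s16> load yields a
    // <2 x s32> LoadDstReg; the repack below unmerges it, truncates each
    // element to s16, and merges the pieces back into the original
    // <2 x s16> destination.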
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  B.setInstr(MI);
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData =
      MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Pack the s16 typed image address operands of \p MI into dword sized
/// (v2s16) registers, appending them to \p PackedAddrs.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int NumVAddrs,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
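/// For example (illustrative), with four s32 address components the first
/// vaddr operand becomes a single <4 x s32> G_BUILD_VECTOR and the remaining
/// three vaddr operands are replaced with $noreg.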
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ?
    -1 : getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect an error flag since TFE is on and dmask is 0. Force dmask to be
  // at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
            LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
            static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }


  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
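  // Illustrative example, assuming a dmask selecting three components: a d16
  // <3 x s16> load with TFE on a packed-d16 subtarget gets
  // RoundedTy = <4 x s16>, TFETy = <3 x s32> and RegTy = s32.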
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
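    // Continuing the illustrative packed-d16 TFE case above: each repacked
    // s32 data register is bitcast back to <2 x s16> on packed subtargets,
    // or truncated to s16 on unpacked subtargets, before being recombined
    // below.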
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled,
  // insert an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case
       Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}