1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the MachineLegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPULegalizerInfo.h" 15 16 #include "AMDGPU.h" 17 #include "AMDGPUGlobalISelUtils.h" 18 #include "AMDGPUTargetMachine.h" 19 #include "SIMachineFunctionInfo.h" 20 #include "llvm/ADT/ScopeExit.h" 21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 24 #include "llvm/CodeGen/TargetOpcodes.h" 25 #include "llvm/CodeGen/ValueTypes.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Type.h" 29 #include "llvm/Support/Debug.h" 30 31 #define DEBUG_TYPE "amdgpu-legalinfo" 32 33 using namespace llvm; 34 using namespace LegalizeActions; 35 using namespace LegalizeMutations; 36 using namespace LegalityPredicates; 37 using namespace MIPatternMatch; 38 39 // Round the number of elements to the next power of two elements 40 static LLT getPow2VectorType(LLT Ty) { 41 unsigned NElts = Ty.getNumElements(); 42 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 43 return Ty.changeNumElements(Pow2NElts); 44 } 45 46 // Round the number of bits to the next power of two bits 47 static LLT getPow2ScalarType(LLT Ty) { 48 unsigned Bits = Ty.getSizeInBits(); 49 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 50 return LLT::scalar(Pow2Bits); 51 } 52 53 static LegalityPredicate isMultiple32(unsigned TypeIdx, 54 unsigned MaxSize = 1024) { 55 return [=](const LegalityQuery &Query) { 56 const LLT Ty = Query.Types[TypeIdx]; 57 const LLT EltTy = Ty.getScalarType(); 58 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 59 }; 60 } 61 62 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 63 return [=](const LegalityQuery &Query) { 64 const LLT Ty = Query.Types[TypeIdx]; 65 return Ty.isVector() && 66 Ty.getNumElements() % 2 != 0 && 67 Ty.getElementType().getSizeInBits() < 32 && 68 Ty.getSizeInBits() % 32 != 0; 69 }; 70 } 71 72 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 73 return [=](const LegalityQuery &Query) { 74 const LLT Ty = Query.Types[TypeIdx]; 75 const LLT EltTy = Ty.getScalarType(); 76 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 77 }; 78 } 79 80 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 81 return [=](const LegalityQuery &Query) { 82 const LLT Ty = Query.Types[TypeIdx]; 83 const LLT EltTy = Ty.getElementType(); 84 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 85 }; 86 } 87 88 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 89 return [=](const LegalityQuery &Query) { 90 const LLT Ty = Query.Types[TypeIdx]; 91 const LLT EltTy = Ty.getElementType(); 92 unsigned Size = Ty.getSizeInBits(); 93 unsigned Pieces = (Size + 63) / 64; 94 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 95 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 96 }; 97 } 98 99 // Increase the number of vector
elements to reach the next multiple of 32-bit 100 // type. 101 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 102 return [=](const LegalityQuery &Query) { 103 const LLT Ty = Query.Types[TypeIdx]; 104 105 const LLT EltTy = Ty.getElementType(); 106 const int Size = Ty.getSizeInBits(); 107 const int EltSize = EltTy.getSizeInBits(); 108 const int NextMul32 = (Size + 31) / 32; 109 110 assert(EltSize < 32); 111 112 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 113 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 114 }; 115 } 116 117 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 118 return [=](const LegalityQuery &Query) { 119 const LLT QueryTy = Query.Types[TypeIdx]; 120 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 121 }; 122 } 123 124 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 125 return [=](const LegalityQuery &Query) { 126 const LLT QueryTy = Query.Types[TypeIdx]; 127 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 128 }; 129 } 130 131 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 132 return [=](const LegalityQuery &Query) { 133 const LLT QueryTy = Query.Types[TypeIdx]; 134 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 135 }; 136 } 137 138 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 139 // v2s16. 140 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 141 return [=](const LegalityQuery &Query) { 142 const LLT Ty = Query.Types[TypeIdx]; 143 if (Ty.isVector()) { 144 const int EltSize = Ty.getElementType().getSizeInBits(); 145 return EltSize == 32 || EltSize == 64 || 146 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 147 EltSize == 128 || EltSize == 256; 148 } 149 150 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 151 }; 152 } 153 154 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 155 return [=](const LegalityQuery &Query) { 156 const LLT QueryTy = Query.Types[TypeIdx]; 157 if (!QueryTy.isVector()) 158 return false; 159 const LLT EltTy = QueryTy.getElementType(); 160 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 161 }; 162 } 163 164 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 165 return [=](const LegalityQuery &Query) { 166 const LLT Ty = Query.Types[TypeIdx]; 167 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 168 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 169 }; 170 } 171 172 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 173 const GCNTargetMachine &TM) 174 : ST(ST_) { 175 using namespace TargetOpcode; 176 177 auto GetAddrSpacePtr = [&TM](unsigned AS) { 178 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 179 }; 180 181 const LLT S1 = LLT::scalar(1); 182 const LLT S16 = LLT::scalar(16); 183 const LLT S32 = LLT::scalar(32); 184 const LLT S64 = LLT::scalar(64); 185 const LLT S128 = LLT::scalar(128); 186 const LLT S256 = LLT::scalar(256); 187 const LLT S512 = LLT::scalar(512); 188 const LLT S1024 = LLT::scalar(1024); 189 190 const LLT V2S16 = LLT::vector(2, 16); 191 const LLT V4S16 = LLT::vector(4, 16); 192 193 const LLT V2S32 = LLT::vector(2, 32); 194 const LLT V3S32 = LLT::vector(3, 32); 195 const LLT V4S32 = LLT::vector(4, 32); 196 const LLT V5S32 = LLT::vector(5, 32); 197 const LLT V6S32 = LLT::vector(6, 32); 198 const LLT V7S32 = LLT::vector(7, 32); 199 const LLT V8S32 = LLT::vector(8, 32); 200 const LLT V9S32 = LLT::vector(9, 32); 201 const LLT V10S32 = 
LLT::vector(10, 32); 202 const LLT V11S32 = LLT::vector(11, 32); 203 const LLT V12S32 = LLT::vector(12, 32); 204 const LLT V13S32 = LLT::vector(13, 32); 205 const LLT V14S32 = LLT::vector(14, 32); 206 const LLT V15S32 = LLT::vector(15, 32); 207 const LLT V16S32 = LLT::vector(16, 32); 208 const LLT V32S32 = LLT::vector(32, 32); 209 210 const LLT V2S64 = LLT::vector(2, 64); 211 const LLT V3S64 = LLT::vector(3, 64); 212 const LLT V4S64 = LLT::vector(4, 64); 213 const LLT V5S64 = LLT::vector(5, 64); 214 const LLT V6S64 = LLT::vector(6, 64); 215 const LLT V7S64 = LLT::vector(7, 64); 216 const LLT V8S64 = LLT::vector(8, 64); 217 const LLT V16S64 = LLT::vector(16, 64); 218 219 std::initializer_list<LLT> AllS32Vectors = 220 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 221 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 222 std::initializer_list<LLT> AllS64Vectors = 223 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 224 225 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 226 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 227 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 228 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 229 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 230 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 231 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 232 233 const LLT CodePtr = FlatPtr; 234 235 const std::initializer_list<LLT> AddrSpaces64 = { 236 GlobalPtr, ConstantPtr, FlatPtr 237 }; 238 239 const std::initializer_list<LLT> AddrSpaces32 = { 240 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 241 }; 242 243 const std::initializer_list<LLT> FPTypesBase = { 244 S32, S64 245 }; 246 247 const std::initializer_list<LLT> FPTypes16 = { 248 S32, S64, S16 249 }; 250 251 const std::initializer_list<LLT> FPTypesPK16 = { 252 S32, S64, S16, V2S16 253 }; 254 255 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 256 257 setAction({G_BRCOND, S1}, Legal); // VCC branches 258 setAction({G_BRCOND, S32}, Legal); // SCC branches 259 260 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 261 // elements for v3s16 262 getActionDefinitionsBuilder(G_PHI) 263 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 264 .legalFor(AllS32Vectors) 265 .legalFor(AllS64Vectors) 266 .legalFor(AddrSpaces64) 267 .legalFor(AddrSpaces32) 268 .clampScalar(0, S32, S256) 269 .widenScalarToNextPow2(0, 32) 270 .clampMaxNumElements(0, S32, 16) 271 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 272 .legalIf(isPointer(0)); 273 274 if (ST.hasVOP3PInsts()) { 275 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 276 .legalFor({S32, S16, V2S16}) 277 .clampScalar(0, S16, S32) 278 .clampMaxNumElements(0, S16, 2) 279 .scalarize(0) 280 .widenScalarToNextPow2(0, 32); 281 } else if (ST.has16BitInsts()) { 282 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 283 .legalFor({S32, S16}) 284 .clampScalar(0, S16, S32) 285 .scalarize(0) 286 .widenScalarToNextPow2(0, 32); 287 } else { 288 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 289 .legalFor({S32}) 290 .clampScalar(0, S32, S32) 291 .scalarize(0); 292 } 293 294 // FIXME: Not really legal. Placeholder for custom lowering. 
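// There is no hardware integer divide; these are expanded by the custom
// lowering in legalizeUDIV_UREM / legalizeSDIV_SREM (dispatched from
// legalizeCustom below).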
295 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 296 .customFor({S32, S64}) 297 .clampScalar(0, S32, S64) 298 .widenScalarToNextPow2(0, 32) 299 .scalarize(0); 300 301 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 302 .legalFor({S32}) 303 .clampScalar(0, S32, S32) 304 .scalarize(0); 305 306 // Report legal for any types we can handle anywhere. For the cases only legal 307 // on the SALU, RegBankSelect will be able to re-legalize. 308 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 309 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 310 .clampScalar(0, S32, S64) 311 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 312 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 313 .widenScalarToNextPow2(0) 314 .scalarize(0); 315 316 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 317 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 318 .legalFor({{S32, S1}, {S32, S32}}) 319 .minScalar(0, S32) 320 // TODO: .scalarize(0) 321 .lower(); 322 323 getActionDefinitionsBuilder(G_BITCAST) 324 // Don't worry about the size constraint. 325 .legalIf(all(isRegisterType(0), isRegisterType(1))) 326 .lower(); 327 328 329 getActionDefinitionsBuilder(G_CONSTANT) 330 .legalFor({S1, S32, S64, S16, GlobalPtr, 331 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 332 .clampScalar(0, S32, S64) 333 .widenScalarToNextPow2(0) 334 .legalIf(isPointer(0)); 335 336 getActionDefinitionsBuilder(G_FCONSTANT) 337 .legalFor({S32, S64, S16}) 338 .clampScalar(0, S16, S64); 339 340 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 341 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 342 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 343 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 344 .clampScalarOrElt(0, S32, S1024) 345 .legalIf(isMultiple32(0)) 346 .widenScalarToNextPow2(0, 32) 347 .clampMaxNumElements(0, S32, 16); 348 349 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 350 351 // If the amount is divergent, we have to do a wave reduction to get the 352 // maximum value, so this is expanded during RegBankSelect. 
353 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 354 .legalFor({{PrivatePtr, S32}}); 355 356 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 357 .unsupportedFor({PrivatePtr}) 358 .custom(); 359 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 360 361 auto &FPOpActions = getActionDefinitionsBuilder( 362 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 363 .legalFor({S32, S64}); 364 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 365 .customFor({S32, S64}); 366 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 367 .customFor({S32, S64}); 368 369 if (ST.has16BitInsts()) { 370 if (ST.hasVOP3PInsts()) 371 FPOpActions.legalFor({S16, V2S16}); 372 else 373 FPOpActions.legalFor({S16}); 374 375 TrigActions.customFor({S16}); 376 FDIVActions.customFor({S16}); 377 } 378 379 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 380 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 381 382 if (ST.hasVOP3PInsts()) { 383 MinNumMaxNum.customFor(FPTypesPK16) 384 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 385 .clampMaxNumElements(0, S16, 2) 386 .clampScalar(0, S16, S64) 387 .scalarize(0); 388 } else if (ST.has16BitInsts()) { 389 MinNumMaxNum.customFor(FPTypes16) 390 .clampScalar(0, S16, S64) 391 .scalarize(0); 392 } else { 393 MinNumMaxNum.customFor(FPTypesBase) 394 .clampScalar(0, S32, S64) 395 .scalarize(0); 396 } 397 398 if (ST.hasVOP3PInsts()) 399 FPOpActions.clampMaxNumElements(0, S16, 2); 400 401 FPOpActions 402 .scalarize(0) 403 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 404 405 TrigActions 406 .scalarize(0) 407 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 408 409 FDIVActions 410 .scalarize(0) 411 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 412 413 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 414 .legalFor(FPTypesPK16) 415 .clampMaxNumElements(0, S16, 2) 416 .scalarize(0) 417 .clampScalar(0, S16, S64); 418 419 if (ST.has16BitInsts()) { 420 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 421 .legalFor({S32, S64, S16}) 422 .scalarize(0) 423 .clampScalar(0, S16, S64); 424 } else { 425 getActionDefinitionsBuilder(G_FSQRT) 426 .legalFor({S32, S64}) 427 .scalarize(0) 428 .clampScalar(0, S32, S64); 429 430 if (ST.hasFractBug()) { 431 getActionDefinitionsBuilder(G_FFLOOR) 432 .customFor({S64}) 433 .legalFor({S32, S64}) 434 .scalarize(0) 435 .clampScalar(0, S32, S64); 436 } else { 437 getActionDefinitionsBuilder(G_FFLOOR) 438 .legalFor({S32, S64}) 439 .scalarize(0) 440 .clampScalar(0, S32, S64); 441 } 442 } 443 444 getActionDefinitionsBuilder(G_FPTRUNC) 445 .legalFor({{S32, S64}, {S16, S32}}) 446 .scalarize(0) 447 .lower(); 448 449 getActionDefinitionsBuilder(G_FPEXT) 450 .legalFor({{S64, S32}, {S32, S16}}) 451 .lowerFor({{S64, S16}}) // FIXME: Implement 452 .scalarize(0); 453 454 getActionDefinitionsBuilder(G_FSUB) 455 // Use actual fsub instruction 456 .legalFor({S32}) 457 // Must use fadd + fneg 458 .lowerFor({S64, S16, V2S16}) 459 .scalarize(0) 460 .clampScalar(0, S32, S64); 461 462 // Whether this is legal depends on the floating point mode for the function. 463 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 464 if (ST.hasMadF16()) 465 FMad.customFor({S32, S16}); 466 else 467 FMad.customFor({S32}); 468 FMad.scalarize(0) 469 .lower(); 470 471 // TODO: Do we need to clamp maximum bitwidth? 472 getActionDefinitionsBuilder(G_TRUNC) 473 .legalIf(isScalar(0)) 474 .legalFor({{V2S16, V2S32}}) 475 .clampMaxNumElements(0, S16, 2) 476 // Avoid scalarizing in cases that should be truly illegal. 
In unresolvable 477 // situations (like an invalid implicit use), we don't want to infinite loop 478 // in the legalizer. 479 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 480 .alwaysLegal(); 481 482 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 483 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 484 {S32, S1}, {S64, S1}, {S16, S1}}) 485 .scalarize(0) 486 .clampScalar(0, S32, S64) 487 .widenScalarToNextPow2(1, 32); 488 489 // TODO: Split s1->s64 during regbankselect for VALU. 490 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 491 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 492 .lowerFor({{S32, S64}}) 493 .lowerIf(typeIs(1, S1)) 494 .customFor({{S64, S64}}); 495 if (ST.has16BitInsts()) 496 IToFP.legalFor({{S16, S16}}); 497 IToFP.clampScalar(1, S32, S64) 498 .scalarize(0) 499 .widenScalarToNextPow2(1); 500 501 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 502 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 503 .customFor({{S64, S64}}); 504 if (ST.has16BitInsts()) 505 FPToI.legalFor({{S16, S16}}); 506 else 507 FPToI.minScalar(1, S32); 508 509 FPToI.minScalar(0, S32) 510 .scalarize(0) 511 .lower(); 512 513 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 514 .scalarize(0) 515 .lower(); 516 517 if (ST.has16BitInsts()) { 518 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 519 .legalFor({S16, S32, S64}) 520 .clampScalar(0, S16, S64) 521 .scalarize(0); 522 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 523 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 524 .legalFor({S32, S64}) 525 .clampScalar(0, S32, S64) 526 .scalarize(0); 527 } else { 528 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 529 .legalFor({S32}) 530 .customFor({S64}) 531 .clampScalar(0, S32, S64) 532 .scalarize(0); 533 } 534 535 // FIXME: Clamp offset operand. 536 getActionDefinitionsBuilder(G_PTR_ADD) 537 .legalIf(isPointer(0)) 538 .scalarize(0); 539 540 getActionDefinitionsBuilder(G_PTRMASK) 541 .legalIf(typeInSet(1, {S64, S32})) 542 .minScalar(1, S32) 543 .maxScalarIf(sizeIs(0, 32), 1, S32) 544 .maxScalarIf(sizeIs(0, 64), 1, S64) 545 .scalarize(0); 546 547 auto &CmpBuilder = 548 getActionDefinitionsBuilder(G_ICMP) 549 // The compare output type differs based on the register bank of the output, 550 // so make both s1 and s32 legal. 551 // 552 // Scalar compares producing output in scc will be promoted to s32, as that 553 // is the allocatable register type that will be needed for the copy from 554 // scc. This will be promoted during RegBankSelect, and we assume something 555 // before that won't try to use s32 result types. 556 // 557 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 558 // bank. 559 .legalForCartesianProduct( 560 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 561 .legalForCartesianProduct( 562 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 563 if (ST.has16BitInsts()) { 564 CmpBuilder.legalFor({{S1, S16}}); 565 } 566 567 CmpBuilder 568 .widenScalarToNextPow2(1) 569 .clampScalar(1, S32, S64) 570 .scalarize(0) 571 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 572 573 getActionDefinitionsBuilder(G_FCMP) 574 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 575 .widenScalarToNextPow2(1) 576 .clampScalar(1, S32, S64) 577 .scalarize(0); 578 579 // FIXME: fpow has a selection pattern that should move to custom lowering. 
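// Only the base-2 exp/log operations exist in hardware; G_FEXP, G_FLOG,
// G_FLOG10 and G_FPOW are marked custom below so they can be expanded in
// terms of G_FEXP2/G_FLOG2 (see legalizeFlog, legalizeFExp and legalizeFPow).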
580 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 581 if (ST.has16BitInsts()) 582 Exp2Ops.legalFor({S32, S16}); 583 else 584 Exp2Ops.legalFor({S32}); 585 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 586 Exp2Ops.scalarize(0); 587 588 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 589 if (ST.has16BitInsts()) 590 ExpOps.customFor({{S32}, {S16}}); 591 else 592 ExpOps.customFor({S32}); 593 ExpOps.clampScalar(0, MinScalarFPTy, S32) 594 .scalarize(0); 595 596 // The 64-bit versions produce 32-bit results, but only on the SALU. 597 getActionDefinitionsBuilder(G_CTPOP) 598 .legalFor({{S32, S32}, {S32, S64}}) 599 .clampScalar(0, S32, S32) 600 .clampScalar(1, S32, S64) 601 .scalarize(0) 602 .widenScalarToNextPow2(0, 32) 603 .widenScalarToNextPow2(1, 32); 604 605 // The hardware instructions return a different result on 0 than the generic 606 // instructions expect. The hardware produces -1, but these produce the 607 // bitwidth. 608 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 609 .scalarize(0) 610 .clampScalar(0, S32, S32) 611 .clampScalar(1, S32, S64) 612 .widenScalarToNextPow2(0, 32) 613 .widenScalarToNextPow2(1, 32) 614 .lower(); 615 616 // The 64-bit versions produce 32-bit results, but only on the SALU. 617 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 618 .legalFor({{S32, S32}, {S32, S64}}) 619 .clampScalar(0, S32, S32) 620 .clampScalar(1, S32, S64) 621 .scalarize(0) 622 .widenScalarToNextPow2(0, 32) 623 .widenScalarToNextPow2(1, 32); 624 625 getActionDefinitionsBuilder(G_BITREVERSE) 626 .legalFor({S32}) 627 .clampScalar(0, S32, S32) 628 .scalarize(0); 629 630 if (ST.has16BitInsts()) { 631 getActionDefinitionsBuilder(G_BSWAP) 632 .legalFor({S16, S32, V2S16}) 633 .clampMaxNumElements(0, S16, 2) 634 // FIXME: Fixing non-power-of-2 before clamp is workaround for 635 // narrowScalar limitation. 636 .widenScalarToNextPow2(0) 637 .clampScalar(0, S16, S32) 638 .scalarize(0); 639 640 if (ST.hasVOP3PInsts()) { 641 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 642 .legalFor({S32, S16, V2S16}) 643 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 644 .clampMaxNumElements(0, S16, 2) 645 .minScalar(0, S16) 646 .widenScalarToNextPow2(0) 647 .scalarize(0) 648 .lower(); 649 } else { 650 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 651 .legalFor({S32, S16}) 652 .widenScalarToNextPow2(0) 653 .minScalar(0, S16) 654 .scalarize(0) 655 .lower(); 656 } 657 } else { 658 // TODO: Should have same legality without v_perm_b32 659 getActionDefinitionsBuilder(G_BSWAP) 660 .legalFor({S32}) 661 .lowerIf(scalarNarrowerThan(0, 32)) 662 // FIXME: Fixing non-power-of-2 before clamp is workaround for 663 // narrowScalar limitation. 
664 .widenScalarToNextPow2(0) 665 .maxScalar(0, S32) 666 .scalarize(0) 667 .lower(); 668 669 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 670 .legalFor({S32}) 671 .minScalar(0, S32) 672 .widenScalarToNextPow2(0) 673 .scalarize(0) 674 .lower(); 675 } 676 677 getActionDefinitionsBuilder(G_INTTOPTR) 678 // List the common cases 679 .legalForCartesianProduct(AddrSpaces64, {S64}) 680 .legalForCartesianProduct(AddrSpaces32, {S32}) 681 .scalarize(0) 682 // Accept any address space as long as the size matches 683 .legalIf(sameSize(0, 1)) 684 .widenScalarIf(smallerThan(1, 0), 685 [](const LegalityQuery &Query) { 686 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 687 }) 688 .narrowScalarIf(largerThan(1, 0), 689 [](const LegalityQuery &Query) { 690 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 691 }); 692 693 getActionDefinitionsBuilder(G_PTRTOINT) 694 // List the common cases 695 .legalForCartesianProduct(AddrSpaces64, {S64}) 696 .legalForCartesianProduct(AddrSpaces32, {S32}) 697 .scalarize(0) 698 // Accept any address space as long as the size matches 699 .legalIf(sameSize(0, 1)) 700 .widenScalarIf(smallerThan(0, 1), 701 [](const LegalityQuery &Query) { 702 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 703 }) 704 .narrowScalarIf( 705 largerThan(0, 1), 706 [](const LegalityQuery &Query) { 707 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 708 }); 709 710 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 711 .scalarize(0) 712 .custom(); 713 714 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 715 // handle some operations by just promoting the register during 716 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 717 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 718 switch (AS) { 719 // FIXME: Private element size. 720 case AMDGPUAS::PRIVATE_ADDRESS: 721 return 32; 722 // FIXME: Check subtarget 723 case AMDGPUAS::LOCAL_ADDRESS: 724 return ST.useDS128() ? 128 : 64; 725 726 // Treat constant and global as identical. SMRD loads are sometimes usable 727 // for global loads (ideally constant address space should be eliminated) 728 // depending on the context. Legality cannot be context dependent, but 729 // RegBankSelect can split the load as necessary depending on the pointer 730 // register bank/uniformity and if the memory is invariant or not written in 731 // a kernel. 732 case AMDGPUAS::CONSTANT_ADDRESS: 733 case AMDGPUAS::GLOBAL_ADDRESS: 734 return IsLoad ? 512 : 128; 735 default: 736 return 128; 737 } 738 }; 739 740 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 741 bool IsLoad) -> bool { 742 const LLT DstTy = Query.Types[0]; 743 744 // Split vector extloads. 745 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 746 unsigned Align = Query.MMODescrs[0].AlignInBits; 747 748 if (MemSize < DstTy.getSizeInBits()) 749 MemSize = std::max(MemSize, Align); 750 751 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 752 return true; 753 754 const LLT PtrTy = Query.Types[1]; 755 unsigned AS = PtrTy.getAddressSpace(); 756 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 757 return true; 758 759 // Catch weird sized loads that don't evenly divide into the access sizes 760 // TODO: May be able to widen depending on alignment etc. 
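// For example, a 96-bit access takes three dwords, which is only OK when the
// subtarget has dwordx3 load/stores; any other non-power-of-2 dword count has
// to be split.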
761 unsigned NumRegs = (MemSize + 31) / 32; 762 if (NumRegs == 3) { 763 if (!ST.hasDwordx3LoadStores()) 764 return true; 765 } else { 766 // If the alignment allows, these should have been widened. 767 if (!isPowerOf2_32(NumRegs)) 768 return true; 769 } 770 771 if (Align < MemSize) { 772 const SITargetLowering *TLI = ST.getTargetLowering(); 773 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 774 } 775 776 return false; 777 }; 778 779 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 780 unsigned Size = Query.Types[0].getSizeInBits(); 781 if (isPowerOf2_32(Size)) 782 return false; 783 784 if (Size == 96 && ST.hasDwordx3LoadStores()) 785 return false; 786 787 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 788 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 789 return false; 790 791 unsigned Align = Query.MMODescrs[0].AlignInBits; 792 unsigned RoundedSize = NextPowerOf2(Size); 793 return (Align >= RoundedSize); 794 }; 795 796 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 797 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 798 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 799 800 // TODO: Refine based on subtargets which support unaligned access or 128-bit 801 // LDS 802 // TODO: Unsupported flat for SI. 803 804 for (unsigned Op : {G_LOAD, G_STORE}) { 805 const bool IsStore = Op == G_STORE; 806 807 auto &Actions = getActionDefinitionsBuilder(Op); 808 // Whitelist the common cases. 809 // TODO: Loads to s16 on gfx9 810 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 811 {V2S32, GlobalPtr, 64, GlobalAlign32}, 812 {V4S32, GlobalPtr, 128, GlobalAlign32}, 813 {S128, GlobalPtr, 128, GlobalAlign32}, 814 {S64, GlobalPtr, 64, GlobalAlign32}, 815 {V2S64, GlobalPtr, 128, GlobalAlign32}, 816 {V2S16, GlobalPtr, 32, GlobalAlign32}, 817 {S32, GlobalPtr, 8, GlobalAlign8}, 818 {S32, GlobalPtr, 16, GlobalAlign16}, 819 820 {S32, LocalPtr, 32, 32}, 821 {S64, LocalPtr, 64, 32}, 822 {V2S32, LocalPtr, 64, 32}, 823 {S32, LocalPtr, 8, 8}, 824 {S32, LocalPtr, 16, 16}, 825 {V2S16, LocalPtr, 32, 32}, 826 827 {S32, PrivatePtr, 32, 32}, 828 {S32, PrivatePtr, 8, 8}, 829 {S32, PrivatePtr, 16, 16}, 830 {V2S16, PrivatePtr, 32, 32}, 831 832 {S32, FlatPtr, 32, GlobalAlign32}, 833 {S32, FlatPtr, 16, GlobalAlign16}, 834 {S32, FlatPtr, 8, GlobalAlign8}, 835 {V2S16, FlatPtr, 32, GlobalAlign32}, 836 837 {S32, ConstantPtr, 32, GlobalAlign32}, 838 {V2S32, ConstantPtr, 64, GlobalAlign32}, 839 {V4S32, ConstantPtr, 128, GlobalAlign32}, 840 {S64, ConstantPtr, 64, GlobalAlign32}, 841 {S128, ConstantPtr, 128, GlobalAlign32}, 842 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 843 Actions 844 .customIf(typeIs(1, Constant32Ptr)) 845 // Widen suitably aligned loads by loading extra elements. 
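// For example, a sufficiently aligned <3 x s32> load can be widened to
// <4 x s32> rather than being split.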
846 .moreElementsIf([=](const LegalityQuery &Query) { 847 const LLT Ty = Query.Types[0]; 848 return Op == G_LOAD && Ty.isVector() && 849 shouldWidenLoadResult(Query); 850 }, moreElementsToNextPow2(0)) 851 .widenScalarIf([=](const LegalityQuery &Query) { 852 const LLT Ty = Query.Types[0]; 853 return Op == G_LOAD && !Ty.isVector() && 854 shouldWidenLoadResult(Query); 855 }, widenScalarOrEltToNextPow2(0)) 856 .narrowScalarIf( 857 [=](const LegalityQuery &Query) -> bool { 858 return !Query.Types[0].isVector() && 859 needToSplitMemOp(Query, Op == G_LOAD); 860 }, 861 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 862 const LLT DstTy = Query.Types[0]; 863 const LLT PtrTy = Query.Types[1]; 864 865 const unsigned DstSize = DstTy.getSizeInBits(); 866 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 867 868 // Split extloads. 869 if (DstSize > MemSize) 870 return std::make_pair(0, LLT::scalar(MemSize)); 871 872 if (!isPowerOf2_32(DstSize)) { 873 // We're probably decomposing an odd sized store. Try to split 874 // to the widest type. TODO: Account for alignment. As-is it 875 // should be OK, since the new parts will be further legalized. 876 unsigned FloorSize = PowerOf2Floor(DstSize); 877 return std::make_pair(0, LLT::scalar(FloorSize)); 878 } 879 880 if (DstSize > 32 && (DstSize % 32 != 0)) { 881 // FIXME: Need a way to specify non-extload of larger size if 882 // suitably aligned. 883 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 884 } 885 886 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 887 Op == G_LOAD); 888 if (MemSize > MaxSize) 889 return std::make_pair(0, LLT::scalar(MaxSize)); 890 891 unsigned Align = Query.MMODescrs[0].AlignInBits; 892 return std::make_pair(0, LLT::scalar(Align)); 893 }) 894 .fewerElementsIf( 895 [=](const LegalityQuery &Query) -> bool { 896 return Query.Types[0].isVector() && 897 needToSplitMemOp(Query, Op == G_LOAD); 898 }, 899 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 900 const LLT DstTy = Query.Types[0]; 901 const LLT PtrTy = Query.Types[1]; 902 903 LLT EltTy = DstTy.getElementType(); 904 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 905 Op == G_LOAD); 906 907 // FIXME: Handle widened to power of 2 results better. This ends 908 // up scalarizing. 909 // FIXME: 3 element stores scalarized on SI 910 911 // Split if it's too large for the address space. 912 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 913 unsigned NumElts = DstTy.getNumElements(); 914 unsigned EltSize = EltTy.getSizeInBits(); 915 916 if (MaxSize % EltSize == 0) { 917 return std::make_pair( 918 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 919 } 920 921 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 922 923 // FIXME: Refine when odd breakdowns handled 924 // The scalars will need to be re-legalized. 925 if (NumPieces == 1 || NumPieces >= NumElts || 926 NumElts % NumPieces != 0) 927 return std::make_pair(0, EltTy); 928 929 return std::make_pair(0, 930 LLT::vector(NumElts / NumPieces, EltTy)); 931 } 932 933 // FIXME: We could probably handle weird extending loads better. 934 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 935 if (DstTy.getSizeInBits() > MemSize) 936 return std::make_pair(0, EltTy); 937 938 unsigned EltSize = EltTy.getSizeInBits(); 939 unsigned DstSize = DstTy.getSizeInBits(); 940 if (!isPowerOf2_32(DstSize)) { 941 // We're probably decomposing an odd sized store. Try to split 942 // to the widest type. TODO: Account for alignment. 
As-is it 943 // should be OK, since the new parts will be further legalized. 944 unsigned FloorSize = PowerOf2Floor(DstSize); 945 return std::make_pair( 946 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 947 } 948 949 // Need to split because of alignment. 950 unsigned Align = Query.MMODescrs[0].AlignInBits; 951 if (EltSize > Align && 952 (EltSize / Align < DstTy.getNumElements())) { 953 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 954 } 955 956 // May need relegalization for the scalars. 957 return std::make_pair(0, EltTy); 958 }) 959 .minScalar(0, S32); 960 961 if (IsStore) 962 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 963 964 // TODO: Need a bitcast lower option? 965 Actions 966 .legalIf([=](const LegalityQuery &Query) { 967 const LLT Ty0 = Query.Types[0]; 968 unsigned Size = Ty0.getSizeInBits(); 969 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 970 unsigned Align = Query.MMODescrs[0].AlignInBits; 971 972 // FIXME: Widening store from alignment not valid. 973 if (MemSize < Size) 974 MemSize = std::max(MemSize, Align); 975 976 // No extending vector loads. 977 if (Size > MemSize && Ty0.isVector()) 978 return false; 979 980 switch (MemSize) { 981 case 8: 982 case 16: 983 return Size == 32; 984 case 32: 985 case 64: 986 case 128: 987 return true; 988 case 96: 989 return ST.hasDwordx3LoadStores(); 990 case 256: 991 case 512: 992 return true; 993 default: 994 return false; 995 } 996 }) 997 .widenScalarToNextPow2(0) 998 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 999 } 1000 1001 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1002 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1003 {S32, GlobalPtr, 16, 2 * 8}, 1004 {S32, LocalPtr, 8, 8}, 1005 {S32, LocalPtr, 16, 16}, 1006 {S32, PrivatePtr, 8, 8}, 1007 {S32, PrivatePtr, 16, 16}, 1008 {S32, ConstantPtr, 8, 8}, 1009 {S32, ConstantPtr, 16, 2 * 8}}); 1010 if (ST.hasFlatAddressSpace()) { 1011 ExtLoads.legalForTypesWithMemDesc( 1012 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1013 } 1014 1015 ExtLoads.clampScalar(0, S32, S32) 1016 .widenScalarToNextPow2(0) 1017 .unsupportedIfMemSizeNotPow2() 1018 .lower(); 1019 1020 auto &Atomics = getActionDefinitionsBuilder( 1021 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1022 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1023 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1024 G_ATOMICRMW_UMIN}) 1025 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1026 {S64, GlobalPtr}, {S64, LocalPtr}}); 1027 if (ST.hasFlatAddressSpace()) { 1028 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1029 } 1030 1031 if (ST.hasLDSFPAtomics()) { 1032 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1033 .legalFor({{S32, LocalPtr}}); 1034 } 1035 1036 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1037 // demarshalling 1038 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1039 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1040 {S32, FlatPtr}, {S64, FlatPtr}}) 1041 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1042 {S32, RegionPtr}, {S64, RegionPtr}}); 1043 // TODO: Pointer types, any 32-bit or 64-bit vector 1044 1045 // Condition should be s32 for scalar, s1 for vector. 
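// As with G_ICMP above, an s32 condition corresponds to a value produced on
// the SALU (copied from SCC), while s1 is used for VCC conditions.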
1046 getActionDefinitionsBuilder(G_SELECT) 1047 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1048 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1049 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1050 .clampScalar(0, S16, S64) 1051 .scalarize(1) 1052 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1053 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1054 .clampMaxNumElements(0, S32, 2) 1055 .clampMaxNumElements(0, LocalPtr, 2) 1056 .clampMaxNumElements(0, PrivatePtr, 2) 1057 .scalarize(0) 1058 .widenScalarToNextPow2(0) 1059 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1060 1061 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1062 // be more flexible with the shift amount type. 1063 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1064 .legalFor({{S32, S32}, {S64, S32}}); 1065 if (ST.has16BitInsts()) { 1066 if (ST.hasVOP3PInsts()) { 1067 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1068 .clampMaxNumElements(0, S16, 2); 1069 } else 1070 Shifts.legalFor({{S16, S16}}); 1071 1072 // TODO: Support 16-bit shift amounts for all types 1073 Shifts.widenScalarIf( 1074 [=](const LegalityQuery &Query) { 1075 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1076 // 32-bit amount. 1077 const LLT ValTy = Query.Types[0]; 1078 const LLT AmountTy = Query.Types[1]; 1079 return ValTy.getSizeInBits() <= 16 && 1080 AmountTy.getSizeInBits() < 16; 1081 }, changeTo(1, S16)); 1082 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1083 Shifts.clampScalar(1, S32, S32); 1084 Shifts.clampScalar(0, S16, S64); 1085 Shifts.widenScalarToNextPow2(0, 16); 1086 } else { 1087 // Make sure we legalize the shift amount type first, as the general 1088 // expansion for the shifted type will produce much worse code if it hasn't 1089 // been truncated already. 1090 Shifts.clampScalar(1, S32, S32); 1091 Shifts.clampScalar(0, S32, S64); 1092 Shifts.widenScalarToNextPow2(0, 32); 1093 } 1094 Shifts.scalarize(0); 1095 1096 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1097 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1098 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1099 unsigned IdxTypeIdx = 2; 1100 1101 getActionDefinitionsBuilder(Op) 1102 .customIf([=](const LegalityQuery &Query) { 1103 const LLT EltTy = Query.Types[EltTypeIdx]; 1104 const LLT VecTy = Query.Types[VecTypeIdx]; 1105 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1106 return (EltTy.getSizeInBits() == 16 || 1107 EltTy.getSizeInBits() % 32 == 0) && 1108 VecTy.getSizeInBits() % 32 == 0 && 1109 VecTy.getSizeInBits() <= 1024 && 1110 IdxTy.getSizeInBits() == 32; 1111 }) 1112 .clampScalar(EltTypeIdx, S32, S64) 1113 .clampScalar(VecTypeIdx, S32, S64) 1114 .clampScalar(IdxTypeIdx, S32, S32); 1115 } 1116 1117 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1118 .unsupportedIf([=](const LegalityQuery &Query) { 1119 const LLT &EltTy = Query.Types[1].getElementType(); 1120 return Query.Types[0] != EltTy; 1121 }); 1122 1123 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1124 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1125 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1126 1127 // FIXME: Doesn't handle extract of illegal sizes. 1128 getActionDefinitionsBuilder(Op) 1129 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1130 // FIXME: Multiples of 16 should not be legal. 
1131 .legalIf([=](const LegalityQuery &Query) { 1132 const LLT BigTy = Query.Types[BigTyIdx]; 1133 const LLT LitTy = Query.Types[LitTyIdx]; 1134 return (BigTy.getSizeInBits() % 32 == 0) && 1135 (LitTy.getSizeInBits() % 16 == 0); 1136 }) 1137 .widenScalarIf( 1138 [=](const LegalityQuery &Query) { 1139 const LLT BigTy = Query.Types[BigTyIdx]; 1140 return (BigTy.getScalarSizeInBits() < 16); 1141 }, 1142 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1143 .widenScalarIf( 1144 [=](const LegalityQuery &Query) { 1145 const LLT LitTy = Query.Types[LitTyIdx]; 1146 return (LitTy.getScalarSizeInBits() < 16); 1147 }, 1148 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1149 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1150 .widenScalarToNextPow2(BigTyIdx, 32); 1151 1152 } 1153 1154 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1155 .legalForCartesianProduct(AllS32Vectors, {S32}) 1156 .legalForCartesianProduct(AllS64Vectors, {S64}) 1157 .clampNumElements(0, V16S32, V32S32) 1158 .clampNumElements(0, V2S64, V16S64) 1159 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1160 1161 if (ST.hasScalarPackInsts()) { 1162 BuildVector 1163 // FIXME: Should probably widen s1 vectors straight to s32 1164 .minScalarOrElt(0, S16) 1165 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1166 .minScalar(1, S32); 1167 1168 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1169 .legalFor({V2S16, S32}) 1170 .lower(); 1171 BuildVector.minScalarOrElt(0, S32); 1172 } else { 1173 BuildVector.customFor({V2S16, S16}); 1174 BuildVector.minScalarOrElt(0, S32); 1175 1176 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1177 .customFor({V2S16, S32}) 1178 .lower(); 1179 } 1180 1181 BuildVector.legalIf(isRegisterType(0)); 1182 1183 // FIXME: Clamp maximum size 1184 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1185 .legalIf(isRegisterType(0)); 1186 1187 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 1188 // pre-legalize. 1189 if (ST.hasVOP3PInsts()) { 1190 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1191 .customFor({V2S16, V2S16}) 1192 .lower(); 1193 } else 1194 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1195 1196 // Merge/Unmerge 1197 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1198 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1199 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1200 1201 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1202 const LLT Ty = Query.Types[TypeIdx]; 1203 if (Ty.isVector()) { 1204 const LLT &EltTy = Ty.getElementType(); 1205 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1206 return true; 1207 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1208 return true; 1209 } 1210 return false; 1211 }; 1212 1213 auto &Builder = getActionDefinitionsBuilder(Op) 1214 .lowerFor({{S16, V2S16}}) 1215 .lowerIf([=](const LegalityQuery &Query) { 1216 const LLT BigTy = Query.Types[BigTyIdx]; 1217 return BigTy.getSizeInBits() == 32; 1218 }) 1219 // Try to widen to s16 first for small types. 1220 // TODO: Only do this on targets with legal s16 shifts 1221 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1222 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1223 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1224 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1225 elementTypeIs(1, S16)), 1226 changeTo(1, V2S16)) 1227 // Clamp the little scalar to s8-s256 and make it a power of 2.
It's not 1228 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1229 // valid. 1230 .clampScalar(LitTyIdx, S32, S512) 1231 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1232 // Break up vectors with weird elements into scalars 1233 .fewerElementsIf( 1234 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1235 scalarize(0)) 1236 .fewerElementsIf( 1237 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1238 scalarize(1)) 1239 .clampScalar(BigTyIdx, S32, S1024); 1240 1241 if (Op == G_MERGE_VALUES) { 1242 Builder.widenScalarIf( 1243 // TODO: Use 16-bit shifts if legal for 8-bit values? 1244 [=](const LegalityQuery &Query) { 1245 const LLT Ty = Query.Types[LitTyIdx]; 1246 return Ty.getSizeInBits() < 32; 1247 }, 1248 changeTo(LitTyIdx, S32)); 1249 } 1250 1251 Builder.widenScalarIf( 1252 [=](const LegalityQuery &Query) { 1253 const LLT Ty = Query.Types[BigTyIdx]; 1254 return !isPowerOf2_32(Ty.getSizeInBits()) && 1255 Ty.getSizeInBits() % 16 != 0; 1256 }, 1257 [=](const LegalityQuery &Query) { 1258 // Pick the next power of 2, or a multiple of 64 over 128, 1259 // whichever is smaller. 1260 const LLT &Ty = Query.Types[BigTyIdx]; 1261 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1262 if (NewSizeInBits >= 256) { 1263 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1264 if (RoundedTo < NewSizeInBits) 1265 NewSizeInBits = RoundedTo; 1266 } 1267 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1268 }) 1269 .legalIf([=](const LegalityQuery &Query) { 1270 const LLT &BigTy = Query.Types[BigTyIdx]; 1271 const LLT &LitTy = Query.Types[LitTyIdx]; 1272 1273 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1274 return false; 1275 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1276 return false; 1277 1278 return BigTy.getSizeInBits() % 16 == 0 && 1279 LitTy.getSizeInBits() % 16 == 0 && 1280 BigTy.getSizeInBits() <= 1024; 1281 }) 1282 // Any vectors left are the wrong size. Scalarize them. 1283 .scalarize(0) 1284 .scalarize(1); 1285 } 1286 1287 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1288 // RegBankSelect. 1289 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1290 .legalFor({{S32}, {S64}}); 1291 1292 if (ST.hasVOP3PInsts()) { 1293 SextInReg.lowerFor({{V2S16}}) 1294 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1295 // get more vector shift opportunities, since we'll get those when 1296 // expanded. 1297 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1298 } else if (ST.has16BitInsts()) { 1299 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1300 } else { 1301 // Prefer to promote to s32 before lowering if we don't have 16-bit 1302 // shifts. This avoids a lot of intermediate truncate and extend operations.
1303 SextInReg.lowerFor({{S32}, {S64}}); 1304 } 1305 1306 SextInReg 1307 .scalarize(0) 1308 .clampScalar(0, S32, S64) 1309 .lower(); 1310 1311 getActionDefinitionsBuilder(G_FSHR) 1312 .legalFor({{S32, S32}}) 1313 .scalarize(0) 1314 .lower(); 1315 1316 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1317 .legalFor({S64}); 1318 1319 getActionDefinitionsBuilder({ 1320 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1321 G_FCOPYSIGN, 1322 1323 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1324 G_READ_REGISTER, 1325 G_WRITE_REGISTER, 1326 1327 G_SADDO, G_SSUBO, 1328 1329 // TODO: Implement 1330 G_FMINIMUM, G_FMAXIMUM, 1331 G_FSHL 1332 }).lower(); 1333 1334 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1335 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1336 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1337 .unsupported(); 1338 1339 computeTables(); 1340 verify(*ST.getInstrInfo()); 1341 } 1342 1343 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1344 MachineRegisterInfo &MRI, 1345 MachineIRBuilder &B, 1346 GISelChangeObserver &Observer) const { 1347 switch (MI.getOpcode()) { 1348 case TargetOpcode::G_ADDRSPACE_CAST: 1349 return legalizeAddrSpaceCast(MI, MRI, B); 1350 case TargetOpcode::G_FRINT: 1351 return legalizeFrint(MI, MRI, B); 1352 case TargetOpcode::G_FCEIL: 1353 return legalizeFceil(MI, MRI, B); 1354 case TargetOpcode::G_INTRINSIC_TRUNC: 1355 return legalizeIntrinsicTrunc(MI, MRI, B); 1356 case TargetOpcode::G_SITOFP: 1357 return legalizeITOFP(MI, MRI, B, true); 1358 case TargetOpcode::G_UITOFP: 1359 return legalizeITOFP(MI, MRI, B, false); 1360 case TargetOpcode::G_FPTOSI: 1361 return legalizeFPTOI(MI, MRI, B, true); 1362 case TargetOpcode::G_FPTOUI: 1363 return legalizeFPTOI(MI, MRI, B, false); 1364 case TargetOpcode::G_FMINNUM: 1365 case TargetOpcode::G_FMAXNUM: 1366 case TargetOpcode::G_FMINNUM_IEEE: 1367 case TargetOpcode::G_FMAXNUM_IEEE: 1368 return legalizeMinNumMaxNum(MI, MRI, B); 1369 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1370 return legalizeExtractVectorElt(MI, MRI, B); 1371 case TargetOpcode::G_INSERT_VECTOR_ELT: 1372 return legalizeInsertVectorElt(MI, MRI, B); 1373 case TargetOpcode::G_SHUFFLE_VECTOR: 1374 return legalizeShuffleVector(MI, MRI, B); 1375 case TargetOpcode::G_FSIN: 1376 case TargetOpcode::G_FCOS: 1377 return legalizeSinCos(MI, MRI, B); 1378 case TargetOpcode::G_GLOBAL_VALUE: 1379 return legalizeGlobalValue(MI, MRI, B); 1380 case TargetOpcode::G_LOAD: 1381 return legalizeLoad(MI, MRI, B, Observer); 1382 case TargetOpcode::G_FMAD: 1383 return legalizeFMad(MI, MRI, B); 1384 case TargetOpcode::G_FDIV: 1385 return legalizeFDIV(MI, MRI, B); 1386 case TargetOpcode::G_UDIV: 1387 case TargetOpcode::G_UREM: 1388 return legalizeUDIV_UREM(MI, MRI, B); 1389 case TargetOpcode::G_SDIV: 1390 case TargetOpcode::G_SREM: 1391 return legalizeSDIV_SREM(MI, MRI, B); 1392 case TargetOpcode::G_ATOMIC_CMPXCHG: 1393 return legalizeAtomicCmpXChg(MI, MRI, B); 1394 case TargetOpcode::G_FLOG: 1395 return legalizeFlog(MI, B, numbers::ln2f); 1396 case TargetOpcode::G_FLOG10: 1397 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1398 case TargetOpcode::G_FEXP: 1399 return legalizeFExp(MI, B); 1400 case TargetOpcode::G_FPOW: 1401 return legalizeFPow(MI, B); 1402 case TargetOpcode::G_FFLOOR: 1403 return legalizeFFloor(MI, MRI, B); 1404 case TargetOpcode::G_BUILD_VECTOR: 1405 return legalizeBuildVector(MI, MRI, B); 1406 default: 1407 return false; 1408 } 1409 1410 llvm_unreachable("expected switch to return"); 1411 } 1412 1413 Register 
AMDGPULegalizerInfo::getSegmentAperture( 1414 unsigned AS, 1415 MachineRegisterInfo &MRI, 1416 MachineIRBuilder &B) const { 1417 MachineFunction &MF = B.getMF(); 1418 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1419 const LLT S32 = LLT::scalar(32); 1420 1421 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1422 1423 if (ST.hasApertureRegs()) { 1424 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1425 // getreg. 1426 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1427 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1428 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1429 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1430 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1431 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1432 unsigned Encoding = 1433 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1434 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1435 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1436 1437 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1438 1439 B.buildInstr(AMDGPU::S_GETREG_B32) 1440 .addDef(GetReg) 1441 .addImm(Encoding); 1442 MRI.setType(GetReg, S32); 1443 1444 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1445 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1446 } 1447 1448 Register QueuePtr = MRI.createGenericVirtualRegister( 1449 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1450 1451 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1452 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1453 return Register(); 1454 1455 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1456 // private_segment_aperture_base_hi. 1457 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1458 1459 // TODO: can we be smarter about machine pointer info? 1460 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1461 MachineMemOperand *MMO = MF.getMachineMemOperand( 1462 PtrInfo, 1463 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1464 MachineMemOperand::MOInvariant, 1465 4, commonAlignment(Align(64), StructOffset)); 1466 1467 Register LoadAddr; 1468 1469 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1470 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1471 } 1472 1473 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1474 MachineInstr &MI, MachineRegisterInfo &MRI, 1475 MachineIRBuilder &B) const { 1476 MachineFunction &MF = B.getMF(); 1477 1478 B.setInstr(MI); 1479 1480 const LLT S32 = LLT::scalar(32); 1481 Register Dst = MI.getOperand(0).getReg(); 1482 Register Src = MI.getOperand(1).getReg(); 1483 1484 LLT DstTy = MRI.getType(Dst); 1485 LLT SrcTy = MRI.getType(Src); 1486 unsigned DestAS = DstTy.getAddressSpace(); 1487 unsigned SrcAS = SrcTy.getAddressSpace(); 1488 1489 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1490 // vector element. 1491 assert(!DstTy.isVector()); 1492 1493 const AMDGPUTargetMachine &TM 1494 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1495 1496 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1497 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1498 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1499 return true; 1500 } 1501 1502 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1503 // Truncate. 
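// A cast to the 32-bit constant address space just keeps the low 32 bits of
// the 64-bit source pointer.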
1504 B.buildExtract(Dst, Src, 0); 1505 MI.eraseFromParent(); 1506 return true; 1507 } 1508 1509 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1511 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1512 1513 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1514 // another. Merge operands are required to be the same type, but creating an 1515 // extra ptrtoint would be kind of pointless. 1516 auto HighAddr = B.buildConstant( 1517 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1518 B.buildMerge(Dst, {Src, HighAddr}); 1519 MI.eraseFromParent(); 1520 return true; 1521 } 1522 1523 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1524 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1525 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1526 unsigned NullVal = TM.getNullPointerValue(DestAS); 1527 1528 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1529 auto FlatNull = B.buildConstant(SrcTy, 0); 1530 1531 // Extract low 32-bits of the pointer. 1532 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1533 1534 auto CmpRes = 1535 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1536 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1537 1538 MI.eraseFromParent(); 1539 return true; 1540 } 1541 1542 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1543 return false; 1544 1545 if (!ST.hasFlatAddressSpace()) 1546 return false; 1547 1548 auto SegmentNull = 1549 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1550 auto FlatNull = 1551 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1552 1553 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1554 if (!ApertureReg.isValid()) 1555 return false; 1556 1557 auto CmpRes = 1558 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1559 1560 // Coerce the type of the low half of the result so we can use merge_values. 1561 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1562 1563 // TODO: Should we allow mismatched types but matching sizes in merges to 1564 // avoid the ptrtoint? 1565 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1566 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1567 1568 MI.eraseFromParent(); 1569 return true; 1570 } 1571 1572 bool AMDGPULegalizerInfo::legalizeFrint( 1573 MachineInstr &MI, MachineRegisterInfo &MRI, 1574 MachineIRBuilder &B) const { 1575 B.setInstr(MI); 1576 1577 Register Src = MI.getOperand(1).getReg(); 1578 LLT Ty = MRI.getType(Src); 1579 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1580 1581 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1582 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1583 1584 auto C1 = B.buildFConstant(Ty, C1Val); 1585 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1586 1587 // TODO: Should this propagate fast-math-flags? 
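// Adding and then subtracting 2^52 (with the sign of the input) rounds to an
// integer, since doubles of magnitude >= 2^52 have no fraction bits;
// e.g. 3.7 + 2^52 rounds to 2^52 + 4, and subtracting 2^52 leaves 4.0.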
1588 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1589 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1590 1591 auto C2 = B.buildFConstant(Ty, C2Val); 1592 auto Fabs = B.buildFAbs(Ty, Src); 1593 1594 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1595 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1596 return true; 1597 } 1598 1599 bool AMDGPULegalizerInfo::legalizeFceil( 1600 MachineInstr &MI, MachineRegisterInfo &MRI, 1601 MachineIRBuilder &B) const { 1602 B.setInstr(MI); 1603 1604 const LLT S1 = LLT::scalar(1); 1605 const LLT S64 = LLT::scalar(64); 1606 1607 Register Src = MI.getOperand(1).getReg(); 1608 assert(MRI.getType(Src) == S64); 1609 1610 // result = trunc(src) 1611 // if (src > 0.0 && src != result) 1612 // result += 1.0 1613 1614 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1615 1616 const auto Zero = B.buildFConstant(S64, 0.0); 1617 const auto One = B.buildFConstant(S64, 1.0); 1618 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1619 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1620 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1621 auto Add = B.buildSelect(S64, And, One, Zero); 1622 1623 // TODO: Should this propagate fast-math-flags? 1624 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1625 return true; 1626 } 1627 1628 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1629 MachineIRBuilder &B) { 1630 const unsigned FractBits = 52; 1631 const unsigned ExpBits = 11; 1632 LLT S32 = LLT::scalar(32); 1633 1634 auto Const0 = B.buildConstant(S32, FractBits - 32); 1635 auto Const1 = B.buildConstant(S32, ExpBits); 1636 1637 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1638 .addUse(Const0.getReg(0)) 1639 .addUse(Const1.getReg(0)); 1640 1641 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1642 } 1643 1644 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1645 MachineInstr &MI, MachineRegisterInfo &MRI, 1646 MachineIRBuilder &B) const { 1647 B.setInstr(MI); 1648 1649 const LLT S1 = LLT::scalar(1); 1650 const LLT S32 = LLT::scalar(32); 1651 const LLT S64 = LLT::scalar(64); 1652 1653 Register Src = MI.getOperand(1).getReg(); 1654 assert(MRI.getType(Src) == S64); 1655 1656 // TODO: Should this use extract since the low half is unused? 1657 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1658 Register Hi = Unmerge.getReg(1); 1659 1660 // Extract the upper half, since this is where we will find the sign and 1661 // exponent. 1662 auto Exp = extractF64Exponent(Hi, B); 1663 1664 const unsigned FractBits = 52; 1665 1666 // Extract the sign bit. 1667 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1668 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1669 1670 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1671 1672 const auto Zero32 = B.buildConstant(S32, 0); 1673 1674 // Extend back to 64-bits. 
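// SignBit64 is a signed zero with the sign of the input; it becomes the
// result below when the exponent is negative (|x| < 1 truncates to +/-0).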
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
1748 B.buildFPTOSI(S32, FloorMul) : 1749 B.buildFPTOUI(S32, FloorMul); 1750 auto Lo = B.buildFPTOUI(S32, Fma); 1751 1752 B.buildMerge(Dst, { Lo, Hi }); 1753 MI.eraseFromParent(); 1754 1755 return true; 1756 } 1757 1758 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1759 MachineInstr &MI, MachineRegisterInfo &MRI, 1760 MachineIRBuilder &B) const { 1761 MachineFunction &MF = B.getMF(); 1762 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1763 1764 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1765 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1766 1767 // With ieee_mode disabled, the instructions have the correct behavior 1768 // already for G_FMINNUM/G_FMAXNUM 1769 if (!MFI->getMode().IEEE) 1770 return !IsIEEEOp; 1771 1772 if (IsIEEEOp) 1773 return true; 1774 1775 MachineIRBuilder HelperBuilder(MI); 1776 GISelObserverWrapper DummyObserver; 1777 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1778 HelperBuilder.setInstr(MI); 1779 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1780 } 1781 1782 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1783 MachineInstr &MI, MachineRegisterInfo &MRI, 1784 MachineIRBuilder &B) const { 1785 // TODO: Should move some of this into LegalizerHelper. 1786 1787 // TODO: Promote dynamic indexing of s16 to s32 1788 1789 // FIXME: Artifact combiner probably should have replaced the truncated 1790 // constant before this, so we shouldn't need 1791 // getConstantVRegValWithLookThrough. 1792 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1793 MI.getOperand(2).getReg(), MRI); 1794 if (!IdxVal) // Dynamic case will be selected to register indexing. 1795 return true; 1796 1797 Register Dst = MI.getOperand(0).getReg(); 1798 Register Vec = MI.getOperand(1).getReg(); 1799 1800 LLT VecTy = MRI.getType(Vec); 1801 LLT EltTy = VecTy.getElementType(); 1802 assert(EltTy == MRI.getType(Dst)); 1803 1804 B.setInstr(MI); 1805 1806 if (IdxVal->Value < VecTy.getNumElements()) 1807 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1808 else 1809 B.buildUndef(Dst); 1810 1811 MI.eraseFromParent(); 1812 return true; 1813 } 1814 1815 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1816 MachineInstr &MI, MachineRegisterInfo &MRI, 1817 MachineIRBuilder &B) const { 1818 // TODO: Should move some of this into LegalizerHelper. 1819 1820 // TODO: Promote dynamic indexing of s16 to s32 1821 1822 // FIXME: Artifact combiner probably should have replaced the truncated 1823 // constant before this, so we shouldn't need 1824 // getConstantVRegValWithLookThrough. 1825 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1826 MI.getOperand(3).getReg(), MRI); 1827 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1828 return true; 1829 1830 Register Dst = MI.getOperand(0).getReg(); 1831 Register Vec = MI.getOperand(1).getReg(); 1832 Register Ins = MI.getOperand(2).getReg(); 1833 1834 LLT VecTy = MRI.getType(Vec); 1835 LLT EltTy = VecTy.getElementType(); 1836 assert(EltTy == MRI.getType(Ins)); 1837 1838 B.setInstr(MI); 1839 1840 if (IdxVal->Value < VecTy.getNumElements()) 1841 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1842 else 1843 B.buildUndef(Dst); 1844 1845 MI.eraseFromParent(); 1846 return true; 1847 } 1848 1849 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1850 MachineInstr &MI, MachineRegisterInfo &MRI, 1851 MachineIRBuilder &B) const { 1852 const LLT V2S16 = LLT::vector(2, 16); 1853 1854 Register Dst = MI.getOperand(0).getReg(); 1855 Register Src0 = MI.getOperand(1).getReg(); 1856 LLT DstTy = MRI.getType(Dst); 1857 LLT SrcTy = MRI.getType(Src0); 1858 1859 if (SrcTy == V2S16 && DstTy == V2S16 && 1860 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1861 return true; 1862 1863 MachineIRBuilder HelperBuilder(MI); 1864 GISelObserverWrapper DummyObserver; 1865 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1866 HelperBuilder.setInstr(MI); 1867 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1868 } 1869 1870 bool AMDGPULegalizerInfo::legalizeSinCos( 1871 MachineInstr &MI, MachineRegisterInfo &MRI, 1872 MachineIRBuilder &B) const { 1873 B.setInstr(MI); 1874 1875 Register DstReg = MI.getOperand(0).getReg(); 1876 Register SrcReg = MI.getOperand(1).getReg(); 1877 LLT Ty = MRI.getType(DstReg); 1878 unsigned Flags = MI.getFlags(); 1879 1880 Register TrigVal; 1881 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1882 if (ST.hasTrigReducedRange()) { 1883 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1884 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1885 .addUse(MulVal.getReg(0)) 1886 .setMIFlags(Flags).getReg(0); 1887 } else 1888 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1889 1890 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1891 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1892 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1893 .addUse(TrigVal) 1894 .setMIFlags(Flags); 1895 MI.eraseFromParent(); 1896 return true; 1897 } 1898 1899 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1900 Register DstReg, LLT PtrTy, 1901 MachineIRBuilder &B, const GlobalValue *GV, 1902 unsigned Offset, unsigned GAFlags) const { 1903 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1904 // to the following code sequence: 1905 // 1906 // For constant address space: 1907 // s_getpc_b64 s[0:1] 1908 // s_add_u32 s0, s0, $symbol 1909 // s_addc_u32 s1, s1, 0 1910 // 1911 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1912 // a fixup or relocation is emitted to replace $symbol with a literal 1913 // constant, which is a pc-relative offset from the encoding of the $symbol 1914 // operand to the global variable. 
1915 // 1916 // For global address space: 1917 // s_getpc_b64 s[0:1] 1918 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1919 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1920 // 1921 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1922 // fixups or relocations are emitted to replace $symbol@*@lo and 1923 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1924 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1925 // operand to the global variable. 1926 // 1927 // What we want here is an offset from the value returned by s_getpc 1928 // (which is the address of the s_add_u32 instruction) to the global 1929 // variable, but since the encoding of $symbol starts 4 bytes after the start 1930 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1931 // small. This requires us to add 4 to the global variable offset in order to 1932 // compute the correct address. 1933 1934 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1935 1936 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1937 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1938 1939 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1940 .addDef(PCReg); 1941 1942 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1943 if (GAFlags == SIInstrInfo::MO_NONE) 1944 MIB.addImm(0); 1945 else 1946 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1947 1948 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1949 1950 if (PtrTy.getSizeInBits() == 32) 1951 B.buildExtract(DstReg, PCReg, 0); 1952 return true; 1953 } 1954 1955 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1956 MachineInstr &MI, MachineRegisterInfo &MRI, 1957 MachineIRBuilder &B) const { 1958 Register DstReg = MI.getOperand(0).getReg(); 1959 LLT Ty = MRI.getType(DstReg); 1960 unsigned AS = Ty.getAddressSpace(); 1961 1962 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1963 MachineFunction &MF = B.getMF(); 1964 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1965 B.setInstr(MI); 1966 1967 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1968 if (!MFI->isEntryFunction()) { 1969 const Function &Fn = MF.getFunction(); 1970 DiagnosticInfoUnsupported BadLDSDecl( 1971 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1972 DS_Warning); 1973 Fn.getContext().diagnose(BadLDSDecl); 1974 1975 // We currently don't have a way to correctly allocate LDS objects that 1976 // aren't directly associated with a kernel. We do force inlining of 1977 // functions that use local objects. However, if these dead functions are 1978 // not eliminated, we don't want a compile time error. Just emit a warning 1979 // and a trap, since there should be no callable path here. 1980 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1981 B.buildUndef(DstReg); 1982 MI.eraseFromParent(); 1983 return true; 1984 } 1985 1986 // TODO: We could emit code to handle the initialization somewhere. 
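    // An LDS global with no initializer can be folded here to the static
    // offset assigned by allocateLDSGlobal, unless the target would rather
    // materialize the address with an absolute 32-bit relocation
    // (shouldUseLDSConstAddress), in which case the G_GLOBAL_VALUE is kept.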
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
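  // Keep G_FMAD only when denormals for the result type are flushed: the
  // MAD/MAC machine instructions flush denormal results, so with denormals
  // enabled we fall through to LegalizerHelper::lowerFMad below, which
  // expands the operation to fmul + fadd.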
2065 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2066 return true; 2067 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2068 return true; 2069 2070 MachineIRBuilder HelperBuilder(MI); 2071 GISelObserverWrapper DummyObserver; 2072 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2073 HelperBuilder.setInstr(MI); 2074 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2075 } 2076 2077 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2078 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2079 Register DstReg = MI.getOperand(0).getReg(); 2080 Register PtrReg = MI.getOperand(1).getReg(); 2081 Register CmpVal = MI.getOperand(2).getReg(); 2082 Register NewVal = MI.getOperand(3).getReg(); 2083 2084 assert(SITargetLowering::isFlatGlobalAddrSpace( 2085 MRI.getType(PtrReg).getAddressSpace()) && 2086 "this should not have been custom lowered"); 2087 2088 LLT ValTy = MRI.getType(CmpVal); 2089 LLT VecTy = LLT::vector(2, ValTy); 2090 2091 B.setInstr(MI); 2092 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2093 2094 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2095 .addDef(DstReg) 2096 .addUse(PtrReg) 2097 .addUse(PackedVal) 2098 .setMemRefs(MI.memoperands()); 2099 2100 MI.eraseFromParent(); 2101 return true; 2102 } 2103 2104 bool AMDGPULegalizerInfo::legalizeFlog( 2105 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2106 Register Dst = MI.getOperand(0).getReg(); 2107 Register Src = MI.getOperand(1).getReg(); 2108 LLT Ty = B.getMRI()->getType(Dst); 2109 unsigned Flags = MI.getFlags(); 2110 B.setInstr(MI); 2111 2112 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2113 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2114 2115 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2116 MI.eraseFromParent(); 2117 return true; 2118 } 2119 2120 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2121 MachineIRBuilder &B) const { 2122 Register Dst = MI.getOperand(0).getReg(); 2123 Register Src = MI.getOperand(1).getReg(); 2124 unsigned Flags = MI.getFlags(); 2125 LLT Ty = B.getMRI()->getType(Dst); 2126 B.setInstr(MI); 2127 2128 auto K = B.buildFConstant(Ty, numbers::log2e); 2129 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2130 B.buildFExp2(Dst, Mul, Flags); 2131 MI.eraseFromParent(); 2132 return true; 2133 } 2134 2135 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2136 MachineIRBuilder &B) const { 2137 Register Dst = MI.getOperand(0).getReg(); 2138 Register Src0 = MI.getOperand(1).getReg(); 2139 Register Src1 = MI.getOperand(2).getReg(); 2140 unsigned Flags = MI.getFlags(); 2141 LLT Ty = B.getMRI()->getType(Dst); 2142 B.setInstr(MI); 2143 const LLT S16 = LLT::scalar(16); 2144 const LLT S32 = LLT::scalar(32); 2145 2146 if (Ty == S32) { 2147 auto Log = B.buildFLog2(S32, Src0, Flags); 2148 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2149 .addUse(Log.getReg(0)) 2150 .addUse(Src1) 2151 .setMIFlags(Flags); 2152 B.buildFExp2(Dst, Mul, Flags); 2153 } else if (Ty == S16) { 2154 // There's no f16 fmul_legacy, so we need to convert for it. 
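    // Same pow(x, y) = exp2(y * log2(x)) expansion as the S32 path above,
    // except the multiply runs in S32 (both factors extended) and the
    // product is truncated back to S16 before the exp2.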
2155 auto Log = B.buildFLog2(S16, Src0, Flags); 2156 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2157 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2158 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2159 .addUse(Ext0.getReg(0)) 2160 .addUse(Ext1.getReg(0)) 2161 .setMIFlags(Flags); 2162 2163 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2164 } else 2165 return false; 2166 2167 MI.eraseFromParent(); 2168 return true; 2169 } 2170 2171 // Find a source register, ignoring any possible source modifiers. 2172 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2173 Register ModSrc = OrigSrc; 2174 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2175 ModSrc = SrcFNeg->getOperand(1).getReg(); 2176 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2177 ModSrc = SrcFAbs->getOperand(1).getReg(); 2178 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2179 ModSrc = SrcFAbs->getOperand(1).getReg(); 2180 return ModSrc; 2181 } 2182 2183 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2184 MachineRegisterInfo &MRI, 2185 MachineIRBuilder &B) const { 2186 B.setInstr(MI); 2187 2188 const LLT S1 = LLT::scalar(1); 2189 const LLT S64 = LLT::scalar(64); 2190 Register Dst = MI.getOperand(0).getReg(); 2191 Register OrigSrc = MI.getOperand(1).getReg(); 2192 unsigned Flags = MI.getFlags(); 2193 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2194 "this should not have been custom lowered"); 2195 2196 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2197 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2198 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2199 // V_FRACT bug is: 2200 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2201 // 2202 // Convert floor(x) to (x - fract(x)) 2203 2204 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2205 .addUse(OrigSrc) 2206 .setMIFlags(Flags); 2207 2208 // Give source modifier matching some assistance before obscuring a foldable 2209 // pattern. 2210 2211 // TODO: We can avoid the neg on the fract? The input sign to fract 2212 // shouldn't matter? 2213 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2214 2215 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2216 2217 Register Min = MRI.createGenericVirtualRegister(S64); 2218 2219 // We don't need to concern ourselves with the snan handling difference, so 2220 // use the one which will directly select. 2221 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2222 if (MFI->getMode().IEEE) 2223 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2224 else 2225 B.buildFMinNum(Min, Fract, Const, Flags); 2226 2227 Register CorrectedFract = Min; 2228 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2229 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2230 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2231 } 2232 2233 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2234 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2235 2236 MI.eraseFromParent(); 2237 return true; 2238 } 2239 2240 // Turn an illegal packed v2s16 build vector into bit operations. 2241 // TODO: This should probably be a bitcast action in LegalizerHelper. 
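// The lowering below avoids explicit shift/or bit operations: the two s16
// sources are merged into a single s32 value and the result is bitcast back
// to v2s16.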
2242 bool AMDGPULegalizerInfo::legalizeBuildVector( 2243 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2244 Register Dst = MI.getOperand(0).getReg(); 2245 const LLT S32 = LLT::scalar(32); 2246 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2247 2248 Register Src0 = MI.getOperand(1).getReg(); 2249 Register Src1 = MI.getOperand(2).getReg(); 2250 assert(MRI.getType(Src0) == LLT::scalar(16)); 2251 2252 B.setInstr(MI); 2253 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2254 B.buildBitcast(Dst, Merge); 2255 2256 MI.eraseFromParent(); 2257 return true; 2258 } 2259 2260 // Return the use branch instruction, otherwise null if the usage is invalid. 2261 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2262 MachineRegisterInfo &MRI, 2263 MachineInstr *&Br, 2264 MachineBasicBlock *&UncondBrTarget) { 2265 Register CondDef = MI.getOperand(0).getReg(); 2266 if (!MRI.hasOneNonDBGUse(CondDef)) 2267 return nullptr; 2268 2269 MachineBasicBlock *Parent = MI.getParent(); 2270 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2271 if (UseMI.getParent() != Parent || 2272 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2273 return nullptr; 2274 2275 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2276 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2277 if (Next == Parent->end()) { 2278 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2279 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2280 return nullptr; 2281 UncondBrTarget = &*NextMBB; 2282 } else { 2283 if (Next->getOpcode() != AMDGPU::G_BR) 2284 return nullptr; 2285 Br = &*Next; 2286 UncondBrTarget = Br->getOperand(0).getMBB(); 2287 } 2288 2289 return &UseMI; 2290 } 2291 2292 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2293 MachineRegisterInfo &MRI, 2294 Register LiveIn, 2295 Register PhyReg) const { 2296 assert(PhyReg.isPhysical() && "Physical register expected"); 2297 2298 // Insert the live-in copy, if required, by defining destination virtual 2299 // register. 2300 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
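  // Only emit the copy if the live-in virtual register has no definition
  // yet; it is placed at the top of the entry block and the original insert
  // point is restored afterwards.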
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), the live-in copy from physical register
  // to virtual register is not required here.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
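    // Masked arguments (e.g. the workitem IDs packed into a single register)
    // are decoded by shifting the live-in right by the mask's trailing zero
    // count and then masking with the shifted mask value.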
2368 const LLT S32 = LLT::scalar(32); 2369 const unsigned Mask = Arg->getMask(); 2370 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2371 2372 Register AndMaskSrc = LiveIn; 2373 2374 if (Shift != 0) { 2375 auto ShiftAmt = B.buildConstant(S32, Shift); 2376 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2377 } 2378 2379 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2380 } else { 2381 B.buildCopy(DstReg, LiveIn); 2382 } 2383 2384 return true; 2385 } 2386 2387 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2388 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2389 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2390 B.setInstr(MI); 2391 2392 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2393 if (!Arg) 2394 return false; 2395 2396 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2397 return false; 2398 2399 MI.eraseFromParent(); 2400 return true; 2401 } 2402 2403 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2404 MachineRegisterInfo &MRI, 2405 MachineIRBuilder &B) const { 2406 B.setInstr(MI); 2407 Register Dst = MI.getOperand(0).getReg(); 2408 LLT DstTy = MRI.getType(Dst); 2409 LLT S16 = LLT::scalar(16); 2410 LLT S32 = LLT::scalar(32); 2411 LLT S64 = LLT::scalar(64); 2412 2413 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2414 return true; 2415 2416 if (DstTy == S16) 2417 return legalizeFDIV16(MI, MRI, B); 2418 if (DstTy == S32) 2419 return legalizeFDIV32(MI, MRI, B); 2420 if (DstTy == S64) 2421 return legalizeFDIV64(MI, MRI, B); 2422 2423 return false; 2424 } 2425 2426 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2427 const LLT S32 = LLT::scalar(32); 2428 2429 auto Cvt0 = B.buildUITOFP(S32, Src); 2430 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2431 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2432 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2433 return B.buildFPTOUI(S32, Mul).getReg(0); 2434 } 2435 2436 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2437 Register DstReg, 2438 Register Num, 2439 Register Den, 2440 bool IsRem) const { 2441 const LLT S1 = LLT::scalar(1); 2442 const LLT S32 = LLT::scalar(32); 2443 2444 // RCP = URECIP(Den) = 2^32 / Den + e 2445 // e is rounding error. 2446 auto RCP = buildDivRCP(B, Den); 2447 2448 // RCP_LO = mul(RCP, Den) 2449 auto RCP_LO = B.buildMul(S32, RCP, Den); 2450 2451 // RCP_HI = mulhu (RCP, Den) */ 2452 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2453 2454 // NEG_RCP_LO = -RCP_LO 2455 auto Zero = B.buildConstant(S32, 0); 2456 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2457 2458 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2459 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2460 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2461 2462 // Calculate the rounding error from the URECIP instruction 2463 // E = mulhu(ABS_RCP_LO, RCP) 2464 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2465 2466 // RCP_A_E = RCP + E 2467 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2468 2469 // RCP_S_E = RCP - E 2470 auto RCP_S_E = B.buildSub(S32, RCP, E); 2471 2472 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
    B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
    B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
    B.buildFMul(S32, Mul1,
B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2570 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2571 2572 // -(2**32) 2573 auto Mad2 = B.buildFMAD(S32, Trunc, 2574 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2575 2576 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2577 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2578 2579 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2580 } 2581 2582 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2583 MachineRegisterInfo &MRI, 2584 MachineIRBuilder &B) const { 2585 B.setInstr(MI); 2586 2587 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2588 const LLT S32 = LLT::scalar(32); 2589 const LLT S64 = LLT::scalar(64); 2590 const LLT S1 = LLT::scalar(1); 2591 Register Numer = MI.getOperand(1).getReg(); 2592 Register Denom = MI.getOperand(2).getReg(); 2593 Register RcpLo, RcpHi; 2594 2595 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2596 2597 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2598 2599 auto Zero64 = B.buildConstant(S64, 0); 2600 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2601 2602 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2603 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2604 2605 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2606 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2607 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2608 2609 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2610 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2611 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2612 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2613 2614 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2615 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2616 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2617 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2618 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2619 2620 auto Zero32 = B.buildConstant(S32, 0); 2621 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2622 auto Add2_HiC = 2623 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2624 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2625 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2626 2627 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2628 Register NumerLo = UnmergeNumer.getReg(0); 2629 Register NumerHi = UnmergeNumer.getReg(1); 2630 2631 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2632 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2633 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2634 Register Mul3_Lo = UnmergeMul3.getReg(0); 2635 Register Mul3_Hi = UnmergeMul3.getReg(1); 2636 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2637 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2638 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2639 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2640 2641 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2642 Register DenomLo = UnmergeDenom.getReg(0); 2643 Register DenomHi = UnmergeDenom.getReg(1); 2644 2645 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2646 auto C1 = B.buildSExt(S32, CmpHi); 2647 2648 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2649 auto C2 = B.buildSExt(S32, CmpLo); 2650 2651 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2652 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2653 2654 // TODO: Here and below portions of the code can be enclosed into if/endif. 
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
    S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
      B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
      B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeUDIV_UREM64(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);

  if (IsRem) {
    auto RSign = LHSign; // Remainder sign is the same as LHS
    UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
    B.buildSub(DstReg, UDivRem, RSign);
  } else {
    auto DSign = B.buildXor(S32, LHSign, RHSign);
    UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
    B.buildSub(DstReg, UDivRem, DSign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if
(MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2754 return legalizeSDIV_SREM32(MI, MRI, B); 2755 return false; 2756 } 2757 2758 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2759 MachineRegisterInfo &MRI, 2760 MachineIRBuilder &B) const { 2761 Register Res = MI.getOperand(0).getReg(); 2762 Register LHS = MI.getOperand(1).getReg(); 2763 Register RHS = MI.getOperand(2).getReg(); 2764 2765 uint16_t Flags = MI.getFlags(); 2766 2767 LLT ResTy = MRI.getType(Res); 2768 LLT S32 = LLT::scalar(32); 2769 LLT S64 = LLT::scalar(64); 2770 2771 const MachineFunction &MF = B.getMF(); 2772 bool Unsafe = 2773 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2774 2775 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2776 return false; 2777 2778 if (!Unsafe && ResTy == S32 && 2779 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2780 return false; 2781 2782 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2783 // 1 / x -> RCP(x) 2784 if (CLHS->isExactlyValue(1.0)) { 2785 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2786 .addUse(RHS) 2787 .setMIFlags(Flags); 2788 2789 MI.eraseFromParent(); 2790 return true; 2791 } 2792 2793 // -1 / x -> RCP( FNEG(x) ) 2794 if (CLHS->isExactlyValue(-1.0)) { 2795 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2796 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2797 .addUse(FNeg.getReg(0)) 2798 .setMIFlags(Flags); 2799 2800 MI.eraseFromParent(); 2801 return true; 2802 } 2803 } 2804 2805 // x / y -> x * (1.0 / y) 2806 if (Unsafe) { 2807 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2808 .addUse(RHS) 2809 .setMIFlags(Flags); 2810 B.buildFMul(Res, LHS, RCP, Flags); 2811 2812 MI.eraseFromParent(); 2813 return true; 2814 } 2815 2816 return false; 2817 } 2818 2819 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2820 MachineRegisterInfo &MRI, 2821 MachineIRBuilder &B) const { 2822 B.setInstr(MI); 2823 Register Res = MI.getOperand(0).getReg(); 2824 Register LHS = MI.getOperand(1).getReg(); 2825 Register RHS = MI.getOperand(2).getReg(); 2826 2827 uint16_t Flags = MI.getFlags(); 2828 2829 LLT S16 = LLT::scalar(16); 2830 LLT S32 = LLT::scalar(32); 2831 2832 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2833 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2834 2835 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2836 .addUse(RHSExt.getReg(0)) 2837 .setMIFlags(Flags); 2838 2839 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2840 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2841 2842 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2843 .addUse(RDst.getReg(0)) 2844 .addUse(RHS) 2845 .addUse(LHS) 2846 .setMIFlags(Flags); 2847 2848 MI.eraseFromParent(); 2849 return true; 2850 } 2851 2852 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2853 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2854 static void toggleSPDenormMode(bool Enable, 2855 MachineIRBuilder &B, 2856 const GCNSubtarget &ST, 2857 AMDGPU::SIModeRegisterDefaults Mode) { 2858 // Set SP denorm mode to this value. 2859 unsigned SPDenormMode = 2860 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2861 2862 if (ST.hasDenormModeInst()) { 2863 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
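    // In the FP_DENORM field, bits [1:0] control FP32 denormals and bits
    // [3:2] control FP64/FP16, which is why the DP default is shifted left
    // by 2 when forming the S_DENORM_MODE immediate.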
2864 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2865 2866 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2867 B.buildInstr(AMDGPU::S_DENORM_MODE) 2868 .addImm(NewDenormModeValue); 2869 2870 } else { 2871 // Select FP32 bit field in mode register. 2872 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2873 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2874 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2875 2876 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2877 .addImm(SPDenormMode) 2878 .addImm(SPDenormModeBitField); 2879 } 2880 } 2881 2882 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2883 MachineRegisterInfo &MRI, 2884 MachineIRBuilder &B) const { 2885 B.setInstr(MI); 2886 Register Res = MI.getOperand(0).getReg(); 2887 Register LHS = MI.getOperand(1).getReg(); 2888 Register RHS = MI.getOperand(2).getReg(); 2889 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2890 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2891 2892 uint16_t Flags = MI.getFlags(); 2893 2894 LLT S32 = LLT::scalar(32); 2895 LLT S1 = LLT::scalar(1); 2896 2897 auto One = B.buildFConstant(S32, 1.0f); 2898 2899 auto DenominatorScaled = 2900 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2901 .addUse(LHS) 2902 .addUse(RHS) 2903 .addImm(0) 2904 .setMIFlags(Flags); 2905 auto NumeratorScaled = 2906 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2907 .addUse(LHS) 2908 .addUse(RHS) 2909 .addImm(1) 2910 .setMIFlags(Flags); 2911 2912 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2913 .addUse(DenominatorScaled.getReg(0)) 2914 .setMIFlags(Flags); 2915 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2916 2917 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2918 // aren't modeled as reading it. 
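  // The sequence below follows the usual div_scale/rcp refinement: two FMAs
  // improve the reciprocal, two more refine the quotient, and div_fmas plus
  // div_fixup apply the final scale and special-case handling. FP32 denormal
  // flushing is temporarily turned off around it (when the function's mode
  // flushes denormals) because the intermediate FMAs rely on denormal values.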
2919 if (!Mode.allFP32Denormals()) 2920 toggleSPDenormMode(true, B, ST, Mode); 2921 2922 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2923 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2924 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2925 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2926 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2927 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2928 2929 if (!Mode.allFP32Denormals()) 2930 toggleSPDenormMode(false, B, ST, Mode); 2931 2932 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2933 .addUse(Fma4.getReg(0)) 2934 .addUse(Fma1.getReg(0)) 2935 .addUse(Fma3.getReg(0)) 2936 .addUse(NumeratorScaled.getReg(1)) 2937 .setMIFlags(Flags); 2938 2939 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2940 .addUse(Fmas.getReg(0)) 2941 .addUse(RHS) 2942 .addUse(LHS) 2943 .setMIFlags(Flags); 2944 2945 MI.eraseFromParent(); 2946 return true; 2947 } 2948 2949 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2950 MachineRegisterInfo &MRI, 2951 MachineIRBuilder &B) const { 2952 B.setInstr(MI); 2953 Register Res = MI.getOperand(0).getReg(); 2954 Register LHS = MI.getOperand(1).getReg(); 2955 Register RHS = MI.getOperand(2).getReg(); 2956 2957 uint16_t Flags = MI.getFlags(); 2958 2959 LLT S64 = LLT::scalar(64); 2960 LLT S1 = LLT::scalar(1); 2961 2962 auto One = B.buildFConstant(S64, 1.0); 2963 2964 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2965 .addUse(LHS) 2966 .addUse(RHS) 2967 .addImm(0) 2968 .setMIFlags(Flags); 2969 2970 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2971 2972 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2973 .addUse(DivScale0.getReg(0)) 2974 .setMIFlags(Flags); 2975 2976 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2977 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2978 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2979 2980 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2981 .addUse(LHS) 2982 .addUse(RHS) 2983 .addImm(1) 2984 .setMIFlags(Flags); 2985 2986 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2987 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2988 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2989 2990 Register Scale; 2991 if (!ST.hasUsableDivScaleConditionOutput()) { 2992 // Workaround a hardware bug on SI where the condition output from div_scale 2993 // is not usable. 
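    // Reconstruct the condition manually: compare the high 32 bits of each
    // source against the corresponding div_scale result to see which operand
    // was scaled, and xor the two compares to form the flag div_fmas expects.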
2994 2995 LLT S32 = LLT::scalar(32); 2996 2997 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2998 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2999 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3000 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3001 3002 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3003 Scale1Unmerge.getReg(1)); 3004 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3005 Scale0Unmerge.getReg(1)); 3006 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3007 } else { 3008 Scale = DivScale1.getReg(1); 3009 } 3010 3011 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3012 .addUse(Fma4.getReg(0)) 3013 .addUse(Fma3.getReg(0)) 3014 .addUse(Mul.getReg(0)) 3015 .addUse(Scale) 3016 .setMIFlags(Flags); 3017 3018 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3019 .addUse(Fmas.getReg(0)) 3020 .addUse(RHS) 3021 .addUse(LHS) 3022 .setMIFlags(Flags); 3023 3024 MI.eraseFromParent(); 3025 return true; 3026 } 3027 3028 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3029 MachineRegisterInfo &MRI, 3030 MachineIRBuilder &B) const { 3031 B.setInstr(MI); 3032 Register Res = MI.getOperand(0).getReg(); 3033 Register LHS = MI.getOperand(2).getReg(); 3034 Register RHS = MI.getOperand(3).getReg(); 3035 uint16_t Flags = MI.getFlags(); 3036 3037 LLT S32 = LLT::scalar(32); 3038 LLT S1 = LLT::scalar(1); 3039 3040 auto Abs = B.buildFAbs(S32, RHS, Flags); 3041 const APFloat C0Val(1.0f); 3042 3043 auto C0 = B.buildConstant(S32, 0x6f800000); 3044 auto C1 = B.buildConstant(S32, 0x2f800000); 3045 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3046 3047 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3048 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3049 3050 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3051 3052 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3053 .addUse(Mul0.getReg(0)) 3054 .setMIFlags(Flags); 3055 3056 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3057 3058 B.buildFMul(Res, Sel, Mul1, Flags); 3059 3060 MI.eraseFromParent(); 3061 return true; 3062 } 3063 3064 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3065 MachineRegisterInfo &MRI, 3066 MachineIRBuilder &B) const { 3067 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3068 if (!MFI->isEntryFunction()) { 3069 return legalizePreloadedArgIntrin(MI, MRI, B, 3070 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3071 } 3072 3073 B.setInstr(MI); 3074 3075 uint64_t Offset = 3076 ST.getTargetLowering()->getImplicitParameterOffset( 3077 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3078 Register DstReg = MI.getOperand(0).getReg(); 3079 LLT DstTy = MRI.getType(DstReg); 3080 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3081 3082 const ArgDescriptor *Arg; 3083 const TargetRegisterClass *RC; 3084 std::tie(Arg, RC) 3085 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3086 if (!Arg) 3087 return false; 3088 3089 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3090 if (!loadInputValue(KernargPtrReg, B, Arg)) 3091 return false; 3092 3093 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3094 MI.eraseFromParent(); 3095 return true; 3096 } 3097 3098 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3099 MachineRegisterInfo &MRI, 3100 MachineIRBuilder &B, 3101 unsigned AddrSpace) const { 3102 B.setInstr(MI); 3103 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 3104 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3105 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3106 MI.eraseFromParent(); 3107 return true; 3108 } 3109 3110 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3111 // offset (the offset that is included in bounds checking and swizzling, to be 3112 // split between the instruction's voffset and immoffset fields) and soffset 3113 // (the offset that is excluded from bounds checking and swizzling, to go in 3114 // the instruction's soffset field). This function takes the first kind of 3115 // offset and figures out how to split it between voffset and immoffset. 3116 std::tuple<Register, unsigned, unsigned> 3117 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3118 Register OrigOffset) const { 3119 const unsigned MaxImm = 4095; 3120 Register BaseReg; 3121 unsigned TotalConstOffset; 3122 MachineInstr *OffsetDef; 3123 const LLT S32 = LLT::scalar(32); 3124 3125 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3126 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3127 3128 unsigned ImmOffset = TotalConstOffset; 3129 3130 // If the immediate value is too big for the immoffset field, put the value 3131 // and -4096 into the immoffset field so that the value that is copied/added 3132 // for the voffset field is a multiple of 4096, and it stands more chance 3133 // of being CSEd with the copy/add for another similar load/store. 3134 // However, do not do that rounding down to a multiple of 4096 if that is a 3135 // negative number, as it appears to be illegal to have a negative offset 3136 // in the vgpr, even if adding the immediate offset makes it positive. 3137 unsigned Overflow = ImmOffset & ~MaxImm; 3138 ImmOffset -= Overflow; 3139 if ((int32_t)Overflow < 0) { 3140 Overflow += ImmOffset; 3141 ImmOffset = 0; 3142 } 3143 3144 if (Overflow != 0) { 3145 if (!BaseReg) { 3146 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3147 } else { 3148 auto OverflowVal = B.buildConstant(S32, Overflow); 3149 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3150 } 3151 } 3152 3153 if (!BaseReg) 3154 BaseReg = B.buildConstant(S32, 0).getReg(0); 3155 3156 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3157 } 3158 3159 /// Handle register layout difference for f16 images for some subtargets. 3160 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3161 MachineRegisterInfo &MRI, 3162 Register Reg) const { 3163 if (!ST.hasUnpackedD16VMem()) 3164 return Reg; 3165 3166 const LLT S16 = LLT::scalar(16); 3167 const LLT S32 = LLT::scalar(32); 3168 LLT StoreVT = MRI.getType(Reg); 3169 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3170 3171 auto Unmerge = B.buildUnmerge(S16, Reg); 3172 3173 SmallVector<Register, 4> WideRegs; 3174 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3175 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3176 3177 int NumElts = StoreVT.getNumElements(); 3178 3179 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3180 } 3181 3182 Register AMDGPULegalizerInfo::fixStoreSourceType( 3183 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3184 MachineRegisterInfo *MRI = B.getMRI(); 3185 LLT Ty = MRI->getType(VData); 3186 3187 const LLT S16 = LLT::scalar(16); 3188 3189 // Fixup illegal register types for i8 stores. 
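  // Sub-dword store data is any-extended to 32 bits; the byte and short
  // buffer store instructions only use the low bits of the 32-bit source.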
3190 if (Ty == LLT::scalar(8) || Ty == S16) { 3191 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3192 return AnyExt; 3193 } 3194 3195 if (Ty.isVector()) { 3196 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3197 if (IsFormat) 3198 return handleD16VData(B, *MRI, VData); 3199 } 3200 } 3201 3202 return VData; 3203 } 3204 3205 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3206 MachineRegisterInfo &MRI, 3207 MachineIRBuilder &B, 3208 bool IsTyped, 3209 bool IsFormat) const { 3210 B.setInstr(MI); 3211 3212 Register VData = MI.getOperand(1).getReg(); 3213 LLT Ty = MRI.getType(VData); 3214 LLT EltTy = Ty.getScalarType(); 3215 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3216 const LLT S32 = LLT::scalar(32); 3217 3218 VData = fixStoreSourceType(B, VData, IsFormat); 3219 Register RSrc = MI.getOperand(2).getReg(); 3220 3221 MachineMemOperand *MMO = *MI.memoperands_begin(); 3222 const int MemSize = MMO->getSize(); 3223 3224 unsigned ImmOffset; 3225 unsigned TotalOffset; 3226 3227 // The typed intrinsics add an immediate after the registers. 3228 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3229 3230 // The struct intrinsic variants add one additional operand over raw. 3231 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3232 Register VIndex; 3233 int OpOffset = 0; 3234 if (HasVIndex) { 3235 VIndex = MI.getOperand(3).getReg(); 3236 OpOffset = 1; 3237 } 3238 3239 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3240 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3241 3242 unsigned Format = 0; 3243 if (IsTyped) { 3244 Format = MI.getOperand(5 + OpOffset).getImm(); 3245 ++OpOffset; 3246 } 3247 3248 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3249 3250 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3251 if (TotalOffset != 0) 3252 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3253 3254 unsigned Opc; 3255 if (IsTyped) { 3256 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3257 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3258 } else if (IsFormat) { 3259 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3260 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3261 } else { 3262 switch (MemSize) { 3263 case 1: 3264 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3265 break; 3266 case 2: 3267 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3268 break; 3269 default: 3270 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3271 break; 3272 } 3273 } 3274 3275 if (!VIndex) 3276 VIndex = B.buildConstant(S32, 0).getReg(0); 3277 3278 auto MIB = B.buildInstr(Opc) 3279 .addUse(VData) // vdata 3280 .addUse(RSrc) // rsrc 3281 .addUse(VIndex) // vindex 3282 .addUse(VOffset) // voffset 3283 .addUse(SOffset) // soffset 3284 .addImm(ImmOffset); // offset(imm) 3285 3286 if (IsTyped) 3287 MIB.addImm(Format); 3288 3289 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3290 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3291 .addMemOperand(MMO); 3292 3293 MI.eraseFromParent(); 3294 return true; 3295 } 3296 3297 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3298 MachineRegisterInfo &MRI, 3299 MachineIRBuilder &B, 3300 bool IsFormat, 3301 bool IsTyped) const { 3302 B.setInstr(MI); 3303 3304 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
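  // The memory operand's size is used below both to pick the
  // ubyte/ushort/dword load opcode and to rebase the MMO when part of the
  // offset is folded into the instruction's immediate offset field.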
3305 MachineMemOperand *MMO = *MI.memoperands_begin(); 3306 const int MemSize = MMO->getSize(); 3307 const LLT S32 = LLT::scalar(32); 3308 3309 Register Dst = MI.getOperand(0).getReg(); 3310 Register RSrc = MI.getOperand(2).getReg(); 3311 3312 // The typed intrinsics add an immediate after the registers. 3313 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3314 3315 // The struct intrinsic variants add one additional operand over raw. 3316 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3317 Register VIndex; 3318 int OpOffset = 0; 3319 if (HasVIndex) { 3320 VIndex = MI.getOperand(3).getReg(); 3321 OpOffset = 1; 3322 } 3323 3324 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3325 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3326 3327 unsigned Format = 0; 3328 if (IsTyped) { 3329 Format = MI.getOperand(5 + OpOffset).getImm(); 3330 ++OpOffset; 3331 } 3332 3333 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3334 unsigned ImmOffset; 3335 unsigned TotalOffset; 3336 3337 LLT Ty = MRI.getType(Dst); 3338 LLT EltTy = Ty.getScalarType(); 3339 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3340 const bool Unpacked = ST.hasUnpackedD16VMem(); 3341 3342 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3343 if (TotalOffset != 0) 3344 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3345 3346 unsigned Opc; 3347 3348 if (IsTyped) { 3349 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3350 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3351 } else if (IsFormat) { 3352 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3353 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3354 } else { 3355 switch (MemSize) { 3356 case 1: 3357 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3358 break; 3359 case 2: 3360 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3361 break; 3362 default: 3363 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3364 break; 3365 } 3366 } 3367 3368 Register LoadDstReg; 3369 3370 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3371 LLT UnpackedTy = Ty.changeElementSize(32); 3372 3373 if (IsExtLoad) 3374 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3375 else if (Unpacked && IsD16 && Ty.isVector()) 3376 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3377 else 3378 LoadDstReg = Dst; 3379 3380 if (!VIndex) 3381 VIndex = B.buildConstant(S32, 0).getReg(0); 3382 3383 auto MIB = B.buildInstr(Opc) 3384 .addDef(LoadDstReg) // vdata 3385 .addUse(RSrc) // rsrc 3386 .addUse(VIndex) // vindex 3387 .addUse(VOffset) // voffset 3388 .addUse(SOffset) // soffset 3389 .addImm(ImmOffset); // offset(imm) 3390 3391 if (IsTyped) 3392 MIB.addImm(Format); 3393 3394 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3395 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3396 .addMemOperand(MMO); 3397 3398 if (LoadDstReg != Dst) { 3399 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3400 3401 // The load result was widened above; narrow or repack it to the original type.
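// For example (a sketch of two common cases, not an exhaustive list): a d16
// load of a plain s16 goes through an s32 LoadDstReg and is truncated below,
// while a d16 <4 x s16> load on an unpacked subtarget produces a <4 x s32>
// LoadDstReg that is unmerged, truncated per element, and re-merged.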
3402 if (IsExtLoad) 3403 B.buildTrunc(Dst, LoadDstReg); 3404 else { 3405 // Repack to original 16-bit vector result 3406 // FIXME: G_TRUNC should work, but legalization currently fails 3407 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3408 SmallVector<Register, 4> Repack; 3409 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3410 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3411 B.buildMerge(Dst, Repack); 3412 } 3413 } 3414 3415 MI.eraseFromParent(); 3416 return true; 3417 } 3418 3419 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3420 MachineIRBuilder &B, 3421 bool IsInc) const { 3422 B.setInstr(MI); 3423 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3424 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3425 B.buildInstr(Opc) 3426 .addDef(MI.getOperand(0).getReg()) 3427 .addUse(MI.getOperand(2).getReg()) 3428 .addUse(MI.getOperand(3).getReg()) 3429 .cloneMemRefs(MI); 3430 MI.eraseFromParent(); 3431 return true; 3432 } 3433 3434 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3435 switch (IntrID) { 3436 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3437 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3438 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3439 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3440 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3441 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3442 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3443 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3444 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3445 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3446 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3447 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3448 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3449 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3450 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3451 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3452 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3453 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3454 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3455 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3456 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3457 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3458 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3459 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3460 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3461 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3462 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3463 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3464 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3465 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3466 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3467 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3468 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3469 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3470 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3471 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3472 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3473 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3474 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3475 default: 3476 llvm_unreachable("unhandled atomic opcode"); 3477 } 3478 } 3479 3480 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3481 MachineIRBuilder &B, 3482 Intrinsic::ID IID) const { 3483 B.setInstr(MI); 3484 3485 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3486 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3487 3488 Register Dst = MI.getOperand(0).getReg(); 3489 Register VData = 
MI.getOperand(2).getReg(); 3490 3491 Register CmpVal; 3492 int OpOffset = 0; 3493 3494 if (IsCmpSwap) { 3495 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3496 ++OpOffset; 3497 } 3498 3499 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3500 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3501 3502 // The struct intrinsic variants add one additional operand over raw. 3503 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3504 Register VIndex; 3505 if (HasVIndex) { 3506 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3507 ++OpOffset; 3508 } 3509 3510 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3511 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3512 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3513 3514 MachineMemOperand *MMO = *MI.memoperands_begin(); 3515 3516 unsigned ImmOffset; 3517 unsigned TotalOffset; 3518 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3519 if (TotalOffset != 0) 3520 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3521 3522 if (!VIndex) 3523 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3524 3525 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3526 .addDef(Dst) 3527 .addUse(VData); // vdata 3528 3529 if (IsCmpSwap) 3530 MIB.addReg(CmpVal); 3531 3532 MIB.addUse(RSrc) // rsrc 3533 .addUse(VIndex) // vindex 3534 .addUse(VOffset) // voffset 3535 .addUse(SOffset) // soffset 3536 .addImm(ImmOffset) // offset(imm) 3537 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3538 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3539 .addMemOperand(MMO); 3540 3541 MI.eraseFromParent(); 3542 return true; 3543 } 3544 3545 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3546 /// vector with s16 typed elements. 3547 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3548 SmallVectorImpl<Register> &PackedAddrs, 3549 int AddrIdx, int DimIdx, int NumVAddrs, 3550 int NumGradients) { 3551 const LLT S16 = LLT::scalar(16); 3552 const LLT V2S16 = LLT::vector(2, 16); 3553 3554 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3555 MachineOperand &SrcOp = MI.getOperand(I); 3556 if (!SrcOp.isReg()) 3557 continue; // _L to _LZ may have eliminated this. 3558 3559 Register AddrReg = SrcOp.getReg(); 3560 3561 if (I < DimIdx) { 3562 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3563 PackedAddrs.push_back(AddrReg); 3564 } else { 3565 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3566 // derivatives dx/dh and dx/dv are packed with undef. 3567 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3568 ((NumGradients / 2) % 2 == 1 && 3569 (I == DimIdx + (NumGradients / 2) - 1 || 3570 I == DimIdx + NumGradients - 1)) || 3571 // Check for _L to _LZ optimization 3572 !MI.getOperand(I + 1).isReg()) { 3573 PackedAddrs.push_back( 3574 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3575 .getReg(0)); 3576 } else { 3577 PackedAddrs.push_back( 3578 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3579 .getReg(0)); 3580 ++I; 3581 } 3582 } 3583 } 3584 } 3585 3586 /// Convert from separate vaddr components to a single vector address register, 3587 /// and replace the remaining operands with $noreg. 
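/// For illustration (assuming five s32 address components %a0..%a4 and NSA not
/// in use), the components are gathered into one build_vector padded with undef
/// up to a supported register size, roughly:
///   %vaddr:_(<8 x s32>) = G_BUILD_VECTOR %a0, %a1, %a2, %a3, %a4, %undef, %undef, %undef
/// and the now-unused trailing vaddr operands are replaced with $noreg.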
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3589 int DimIdx, int NumVAddrs) { 3590 const LLT S32 = LLT::scalar(32); 3591 3592 SmallVector<Register, 8> AddrRegs; 3593 for (int I = 0; I != NumVAddrs; ++I) { 3594 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3595 if (SrcOp.isReg()) { 3596 AddrRegs.push_back(SrcOp.getReg()); 3597 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3598 } 3599 } 3600 3601 int NumAddrRegs = AddrRegs.size(); 3602 if (NumAddrRegs != 1) { 3603 // Round up to 8 elements for v5-v7 3604 // FIXME: Missing intermediate sized register classes and instructions. 3605 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3606 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3607 auto Undef = B.buildUndef(S32); 3608 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3609 NumAddrRegs = RoundedNumRegs; 3610 } 3611 3612 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3613 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3614 } 3615 3616 for (int I = 1; I != NumVAddrs; ++I) { 3617 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3618 if (SrcOp.isReg()) 3619 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3620 } 3621 } 3622 3623 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3624 /// 3625 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 3626 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3627 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3628 /// registers. 3629 /// 3630 /// We don't want to directly select image instructions just yet, but also want 3631 /// to expose all register repacking to the legalizer/combiners. We also don't 3632 /// want a selected instruction entering RegBankSelect. In order to avoid 3633 /// defining a multitude of intermediate image instructions, directly hack on 3634 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding 3635 /// now-unnecessary arguments with $noreg. 3636 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3637 MachineInstr &MI, MachineIRBuilder &B, 3638 GISelChangeObserver &Observer, 3639 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3640 B.setInstr(MI); 3641 3642 const int NumDefs = MI.getNumExplicitDefs(); 3643 bool IsTFE = NumDefs == 2; 3644 // We are only processing the operands of d16 image operations on subtargets 3645 // that use the unpacked register layout, or need to repack the TFE result. 3646 3647 // TODO: Do we need to guard against already legalized intrinsics? 3648 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3649 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3650 3651 MachineRegisterInfo *MRI = B.getMRI(); 3652 const LLT S32 = LLT::scalar(32); 3653 const LLT S16 = LLT::scalar(16); 3654 const LLT V2S16 = LLT::vector(2, 16); 3655 3656 // Index of first address argument 3657 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3658 3659 // Check for 16-bit addresses and pack if true. 3660 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3661 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3662 const bool IsA16 = AddrTy == S16; 3663 3664 int NumVAddrs, NumGradients; 3665 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3666 const int DMaskIdx = BaseOpcode->Atomic ?
-1 : 3667 getDMaskIdx(BaseOpcode, NumDefs); 3668 unsigned DMask = 0; 3669 3670 int DMaskLanes = 0; 3671 if (!BaseOpcode->Atomic) { 3672 DMask = MI.getOperand(DMaskIdx).getImm(); 3673 if (BaseOpcode->Gather4) { 3674 DMaskLanes = 4; 3675 } else if (DMask != 0) { 3676 DMaskLanes = countPopulation(DMask); 3677 } else if (!IsTFE && !BaseOpcode->Store) { 3678 // If dmask is 0, this is a no-op load. This can be eliminated. 3679 B.buildUndef(MI.getOperand(0)); 3680 MI.eraseFromParent(); 3681 return true; 3682 } 3683 } 3684 3685 Observer.changingInstr(MI); 3686 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3687 3688 unsigned NewOpcode = NumDefs == 0 ? 3689 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3690 3691 // Track that we legalized this 3692 MI.setDesc(B.getTII().get(NewOpcode)); 3693 3694 // We expect to get an error flag since TFE is on and dmask is 0. Force 3695 // dmask to be at least 1, otherwise the instruction will fail. 3696 if (IsTFE && DMask == 0) { 3697 DMask = 0x1; 3698 DMaskLanes = 1; 3699 MI.getOperand(DMaskIdx).setImm(DMask); 3700 } 3701 3702 if (BaseOpcode->Atomic) { 3703 Register VData0 = MI.getOperand(2).getReg(); 3704 LLT Ty = MRI->getType(VData0); 3705 3706 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3707 if (Ty.isVector()) 3708 return false; 3709 3710 if (BaseOpcode->AtomicX2) { 3711 Register VData1 = MI.getOperand(3).getReg(); 3712 // The two values are packed in one register. 3713 LLT PackedTy = LLT::vector(2, Ty); 3714 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3715 MI.getOperand(2).setReg(Concat.getReg(0)); 3716 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3717 } 3718 } 3719 3720 int CorrectedNumVAddrs = NumVAddrs; 3721 3722 // Optimize _L to _LZ when _L is zero 3723 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3724 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3725 const ConstantFP *ConstantLod; 3726 const int LodIdx = AddrIdx + NumVAddrs - 1; 3727 3728 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3729 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3730 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3731 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3732 LZMappingInfo->LZ, ImageDimIntr->Dim); 3733 3734 // The starting indexes should remain in the same place. 3735 --NumVAddrs; 3736 --CorrectedNumVAddrs; 3737 3738 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3739 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3740 MI.RemoveOperand(LodIdx); 3741 } 3742 } 3743 } 3744 3745 // Optimize _mip away when 'lod' is zero 3746 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3747 int64_t ConstantLod; 3748 const int LodIdx = AddrIdx + NumVAddrs - 1; 3749 3750 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3751 if (ConstantLod == 0) { 3752 // TODO: Change the intrinsic opcode and remove the operand instead of 3753 // replacing it with 0, as is done for the _L to _LZ handling above. 3754 MI.getOperand(LodIdx).ChangeToImmediate(0); 3755 --CorrectedNumVAddrs; 3756 } 3757 } 3758 } 3759 3760 // If the register allocator cannot place the address registers contiguously 3761 // without introducing moves, then using the non-sequential address encoding 3762 // is always preferable, since it saves VALU instructions and is usually a 3763 // wash in terms of code size or even better.
// 3765 // However, we currently have no way of hinting to the register allocator 3766 // that MIMG addresses should be placed contiguously when it is possible to 3767 // do so, so force non-NSA for the common 2-address case as a heuristic. 3768 // 3769 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3770 // allocation when possible. 3771 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3772 3773 // Rewrite the addressing register layout before doing anything else. 3774 if (IsA16) { 3775 // FIXME: this feature is missing from gfx10. When that is fixed, this check 3776 // should be introduced. 3777 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3778 return false; 3779 3780 if (NumVAddrs > 1) { 3781 SmallVector<Register, 4> PackedRegs; 3782 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3783 NumGradients); 3784 3785 if (!UseNSA && PackedRegs.size() > 1) { 3786 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3787 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3788 PackedRegs[0] = Concat.getReg(0); 3789 PackedRegs.resize(1); 3790 } 3791 3792 const int NumPacked = PackedRegs.size(); 3793 for (int I = 0; I != NumVAddrs; ++I) { 3794 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3795 if (!SrcOp.isReg()) { 3796 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3797 continue; 3798 } 3799 3800 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3801 3802 if (I < NumPacked) 3803 SrcOp.setReg(PackedRegs[I]); 3804 else 3805 SrcOp.setReg(AMDGPU::NoRegister); 3806 } 3807 } 3808 } else if (!UseNSA && NumVAddrs > 1) { 3809 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3810 } 3811 3812 3813 if (BaseOpcode->Store) { // No TFE for stores? 3814 // TODO: Handle dmask trim 3815 Register VData = MI.getOperand(1).getReg(); 3816 LLT Ty = MRI->getType(VData); 3817 if (!Ty.isVector() || Ty.getElementType() != S16) 3818 return true; 3819 3820 B.setInstr(MI); 3821 3822 Register RepackedReg = handleD16VData(B, *MRI, VData); 3823 if (RepackedReg != VData) { 3824 MI.getOperand(1).setReg(RepackedReg); 3825 } 3826 3827 return true; 3828 } 3829 3830 Register DstReg = MI.getOperand(0).getReg(); 3831 LLT Ty = MRI->getType(DstReg); 3832 const LLT EltTy = Ty.getScalarType(); 3833 const bool IsD16 = Ty.getScalarType() == S16; 3834 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3835 3836 // Confirm that the return type is large enough for the dmask specified 3837 if (NumElts < DMaskLanes) 3838 return false; 3839 3840 if (NumElts > 4 || DMaskLanes > 4) 3841 return false; 3842 3843 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3844 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3845 3846 // The raw, dword-aligned data component of the load. The only legal cases 3847 // where this matters should be when using the packed D16 format, for 3848 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 3849 LLT RoundedTy; 3850 3851 // S32 vector to cover all data, plus the TFE result element. 3852 LLT TFETy; 3853 3854 // Register type to use for each loaded component. Will be S32 or V2S16.
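// As a worked example for the packed d16 path with a 3-lane dmask and TFE
// enabled: AdjustedTy is <3 x s16>, which rounds up to RoundedTy = <4 x s16>,
// giving TFETy = <3 x s32> (two data dwords plus the TFE dword) and RegTy = s32.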
LLT RegTy; 3856 3857 if (IsD16 && ST.hasUnpackedD16VMem()) { 3858 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3859 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3860 RegTy = S32; 3861 } else { 3862 unsigned EltSize = EltTy.getSizeInBits(); 3863 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3864 unsigned RoundedSize = 32 * RoundedElts; 3865 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3866 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3867 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3868 } 3869 3870 // The return type does not need adjustment. 3871 // TODO: Should we change s16 case to s32 or <2 x s16>? 3872 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3873 return true; 3874 3875 Register Dst1Reg; 3876 3877 // Insert after the instruction. 3878 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3879 3880 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3881 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3882 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 3883 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 3884 3885 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 3886 3887 MI.getOperand(0).setReg(NewResultReg); 3888 3889 // In the IR, TFE is supposed to be used with a 2-element struct return 3890 // type. The instruction really returns these two values in one contiguous 3891 // register, with one additional dword beyond the loaded data. Rewrite the 3892 // return type to use a single register result. 3893 3894 if (IsTFE) { 3895 Dst1Reg = MI.getOperand(1).getReg(); 3896 if (MRI->getType(Dst1Reg) != S32) 3897 return false; 3898 3899 // TODO: Make sure the TFE operand bit is set. 3900 MI.RemoveOperand(1); 3901 3902 // Handle the easy case that requires no repack instructions. 3903 if (Ty == S32) { 3904 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 3905 return true; 3906 } 3907 } 3908 3909 // Now figure out how to copy the new result register back into the old 3910 // result. 3911 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 3912 3913 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 3914 3915 if (ResultNumRegs == 1) { 3916 assert(!IsTFE); 3917 ResultRegs[0] = NewResultReg; 3918 } else { 3919 // We have to repack into a new vector of some kind. 3920 for (int I = 0; I != NumDataRegs; ++I) 3921 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 3922 B.buildUnmerge(ResultRegs, NewResultReg); 3923 3924 // Drop the final TFE element to get the data part. The TFE result is 3925 // directly written to the right place already. 3926 if (IsTFE) 3927 ResultRegs.resize(NumDataRegs); 3928 } 3929 3930 // For an s16 scalar result, the load produced an s32 value; truncate it back 3931 // to s16 regardless of packed vs. unpacked. 3932 if (IsD16 && !Ty.isVector()) { 3933 B.buildTrunc(DstReg, ResultRegs[0]); 3934 return true; 3935 } 3936 3937 // Avoid a build/concat_vector of 1 entry. 3938 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 3939 B.buildBitcast(DstReg, ResultRegs[0]); 3940 return true; 3941 } 3942 3943 assert(Ty.isVector()); 3944 3945 if (IsD16) { 3946 // For packed D16 results with TFE enabled, all the data components are 3947 // S32. Cast back to the expected type. 3948 // 3949 // TODO: We don't really need to use s32 elements for the load. We would only 3950 // need one cast for the TFE result if a multiple of v2s16 was used.
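// Continuing the example above: with TFE each unmerged data component is an
// s32, so on packed subtargets it is bitcast back to <2 x s16> below, while
// unpacked subtargets instead truncate each s32 component to s16.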
3951 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3952 for (Register &Reg : ResultRegs) 3953 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3954 } else if (ST.hasUnpackedD16VMem()) { 3955 for (Register &Reg : ResultRegs) 3956 Reg = B.buildTrunc(S16, Reg).getReg(0); 3957 } 3958 } 3959 3960 auto padWithUndef = [&](LLT Ty, int NumElts) { 3961 if (NumElts == 0) 3962 return; 3963 Register Undef = B.buildUndef(Ty).getReg(0); 3964 for (int I = 0; I != NumElts; ++I) 3965 ResultRegs.push_back(Undef); 3966 }; 3967 3968 // Pad out any elements eliminated due to the dmask. 3969 LLT ResTy = MRI->getType(ResultRegs[0]); 3970 if (!ResTy.isVector()) { 3971 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3972 B.buildBuildVector(DstReg, ResultRegs); 3973 return true; 3974 } 3975 3976 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 3977 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 3978 3979 // Deal with the one annoying legal case. 3980 const LLT V3S16 = LLT::vector(3, 16); 3981 if (Ty == V3S16) { 3982 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 3983 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 3984 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 3985 return true; 3986 } 3987 3988 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 3989 B.buildConcatVectors(DstReg, ResultRegs); 3990 return true; 3991 } 3992 3993 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3994 MachineInstr &MI, MachineIRBuilder &B, 3995 GISelChangeObserver &Observer) const { 3996 Register Dst = MI.getOperand(0).getReg(); 3997 LLT Ty = B.getMRI()->getType(Dst); 3998 unsigned Size = Ty.getSizeInBits(); 3999 MachineFunction &MF = B.getMF(); 4000 4001 Observer.changingInstr(MI); 4002 4003 // FIXME: We don't really need this intermediate instruction. The intrinsic 4004 // should be fixed to have a memory operand. Since it's readnone, we're not 4005 // allowed to add one. 4006 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4007 MI.RemoveOperand(1); // Remove intrinsic ID 4008 4009 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4010 // TODO: Should this use datalayout alignment? 4011 const unsigned MemSize = (Size + 7) / 8; 4012 const Align MemAlign(4); 4013 MachineMemOperand *MMO = MF.getMachineMemOperand( 4014 MachinePointerInfo(), 4015 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4016 MachineMemOperand::MOInvariant, 4017 MemSize, MemAlign); 4018 MI.addMemOperand(MF, MMO); 4019 4020 // There are no 96-bit result scalar loads, but widening to 128-bit should 4021 // always be legal. We may need to restore this to a 96-bit result if it turns 4022 // out this needs to be converted to a vector load during RegBankSelect. 4023 if (!isPowerOf2_32(Size)) { 4024 LegalizerHelper Helper(MF, *this, Observer, B); 4025 B.setInstr(MI); 4026 4027 if (Ty.isVector()) 4028 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4029 else 4030 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4031 } 4032 4033 Observer.changedInstr(MI); 4034 return true; 4035 } 4036 4037 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4038 MachineRegisterInfo &MRI, 4039 MachineIRBuilder &B) const { 4040 B.setInstr(MI); 4041 4042 // Is non-HSA path or trap-handler disabled? 
If so, insert an s_endpgm instruction. 4043 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4044 !ST.isTrapHandlerEnabled()) { 4045 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4046 } else { 4047 // Pass the queue pointer to the trap handler as input, and insert the trap instruction. 4048 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4049 const ArgDescriptor *Arg = 4050 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4051 if (!Arg) 4052 return false; 4053 MachineRegisterInfo &MRI = *B.getMRI(); 4054 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4055 Register LiveIn = getLiveInRegister( 4056 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4057 /*InsertLiveInCopy=*/false); 4058 if (!loadInputValue(LiveIn, B, Arg)) 4059 return false; 4060 B.buildCopy(SGPR01, LiveIn); 4061 B.buildInstr(AMDGPU::S_TRAP) 4062 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4063 .addReg(SGPR01, RegState::Implicit); 4064 } 4065 4066 MI.eraseFromParent(); 4067 return true; 4068 } 4069 4070 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4071 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4072 B.setInstr(MI); 4073 4074 // Is the non-HSA path in use or the trap handler disabled? If so, report a 4075 // warning accordingly. 4076 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4077 !ST.isTrapHandlerEnabled()) { 4078 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4079 "debugtrap handler not supported", 4080 MI.getDebugLoc(), DS_Warning); 4081 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4082 Ctx.diagnose(NoTrap); 4083 } else { 4084 // Insert the debug-trap instruction. 4085 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4086 } 4087 4088 MI.eraseFromParent(); 4089 return true; 4090 } 4091 4092 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 4093 MachineIRBuilder &B, 4094 GISelChangeObserver &Observer) const { 4095 MachineRegisterInfo &MRI = *B.getMRI(); 4096 4097 // Replace the G_BRCOND use with the exec-manipulation and branch pseudos. 4098 auto IntrID = MI.getIntrinsicID(); 4099 switch (IntrID) { 4100 case Intrinsic::amdgcn_if: 4101 case Intrinsic::amdgcn_else: { 4102 MachineInstr *Br = nullptr; 4103 MachineBasicBlock *UncondBrTarget = nullptr; 4104 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4105 const SIRegisterInfo *TRI 4106 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4107 4108 B.setInstr(*BrCond); 4109 Register Def = MI.getOperand(1).getReg(); 4110 Register Use = MI.getOperand(3).getReg(); 4111 4112 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4113 if (IntrID == Intrinsic::amdgcn_if) { 4114 B.buildInstr(AMDGPU::SI_IF) 4115 .addDef(Def) 4116 .addUse(Use) 4117 .addMBB(UncondBrTarget); 4118 } else { 4119 B.buildInstr(AMDGPU::SI_ELSE) 4120 .addDef(Def) 4121 .addUse(Use) 4122 .addMBB(UncondBrTarget) 4123 .addImm(0); 4124 } 4125 4126 if (Br) { 4127 Br->getOperand(0).setMBB(CondBrTarget); 4128 } else { 4129 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4130 // since we're swapping branch targets it needs to be reinserted.
4131 // FIXME: IRTranslator should probably not do this 4132 B.buildBr(*CondBrTarget); 4133 } 4134 4135 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4136 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4137 MI.eraseFromParent(); 4138 BrCond->eraseFromParent(); 4139 return true; 4140 } 4141 4142 return false; 4143 } 4144 case Intrinsic::amdgcn_loop: { 4145 MachineInstr *Br = nullptr; 4146 MachineBasicBlock *UncondBrTarget = nullptr; 4147 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4148 const SIRegisterInfo *TRI 4149 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4150 4151 B.setInstr(*BrCond); 4152 4153 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4154 Register Reg = MI.getOperand(2).getReg(); 4155 B.buildInstr(AMDGPU::SI_LOOP) 4156 .addUse(Reg) 4157 .addMBB(UncondBrTarget); 4158 4159 if (Br) 4160 Br->getOperand(0).setMBB(CondBrTarget); 4161 else 4162 B.buildBr(*CondBrTarget); 4163 4164 MI.eraseFromParent(); 4165 BrCond->eraseFromParent(); 4166 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4167 return true; 4168 } 4169 4170 return false; 4171 } 4172 case Intrinsic::amdgcn_kernarg_segment_ptr: 4173 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4174 B.setInstr(MI); 4175 // This only makes sense to call in a kernel, so just lower to null. 4176 B.buildConstant(MI.getOperand(0).getReg(), 0); 4177 MI.eraseFromParent(); 4178 return true; 4179 } 4180 4181 return legalizePreloadedArgIntrin( 4182 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4183 case Intrinsic::amdgcn_implicitarg_ptr: 4184 return legalizeImplicitArgPtr(MI, MRI, B); 4185 case Intrinsic::amdgcn_workitem_id_x: 4186 return legalizePreloadedArgIntrin(MI, MRI, B, 4187 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4188 case Intrinsic::amdgcn_workitem_id_y: 4189 return legalizePreloadedArgIntrin(MI, MRI, B, 4190 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4191 case Intrinsic::amdgcn_workitem_id_z: 4192 return legalizePreloadedArgIntrin(MI, MRI, B, 4193 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4194 case Intrinsic::amdgcn_workgroup_id_x: 4195 return legalizePreloadedArgIntrin(MI, MRI, B, 4196 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4197 case Intrinsic::amdgcn_workgroup_id_y: 4198 return legalizePreloadedArgIntrin(MI, MRI, B, 4199 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4200 case Intrinsic::amdgcn_workgroup_id_z: 4201 return legalizePreloadedArgIntrin(MI, MRI, B, 4202 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4203 case Intrinsic::amdgcn_dispatch_ptr: 4204 return legalizePreloadedArgIntrin(MI, MRI, B, 4205 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4206 case Intrinsic::amdgcn_queue_ptr: 4207 return legalizePreloadedArgIntrin(MI, MRI, B, 4208 AMDGPUFunctionArgInfo::QUEUE_PTR); 4209 case Intrinsic::amdgcn_implicit_buffer_ptr: 4210 return legalizePreloadedArgIntrin( 4211 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4212 case Intrinsic::amdgcn_dispatch_id: 4213 return legalizePreloadedArgIntrin(MI, MRI, B, 4214 AMDGPUFunctionArgInfo::DISPATCH_ID); 4215 case Intrinsic::amdgcn_fdiv_fast: 4216 return legalizeFDIVFastIntrin(MI, MRI, B); 4217 case Intrinsic::amdgcn_is_shared: 4218 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4219 case Intrinsic::amdgcn_is_private: 4220 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4221 case Intrinsic::amdgcn_wavefrontsize: { 4222 B.setInstr(MI); 4223 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4224 MI.eraseFromParent(); 4225 return true; 4226 } 4227 case 
Intrinsic::amdgcn_s_buffer_load: 4228 return legalizeSBufferLoad(MI, B, Observer); 4229 case Intrinsic::amdgcn_raw_buffer_store: 4230 case Intrinsic::amdgcn_struct_buffer_store: 4231 return legalizeBufferStore(MI, MRI, B, false, false); 4232 case Intrinsic::amdgcn_raw_buffer_store_format: 4233 case Intrinsic::amdgcn_struct_buffer_store_format: 4234 return legalizeBufferStore(MI, MRI, B, false, true); 4235 case Intrinsic::amdgcn_raw_tbuffer_store: 4236 case Intrinsic::amdgcn_struct_tbuffer_store: 4237 return legalizeBufferStore(MI, MRI, B, true, true); 4238 case Intrinsic::amdgcn_raw_buffer_load: 4239 case Intrinsic::amdgcn_struct_buffer_load: 4240 return legalizeBufferLoad(MI, MRI, B, false, false); 4241 case Intrinsic::amdgcn_raw_buffer_load_format: 4242 case Intrinsic::amdgcn_struct_buffer_load_format: 4243 return legalizeBufferLoad(MI, MRI, B, true, false); 4244 case Intrinsic::amdgcn_raw_tbuffer_load: 4245 case Intrinsic::amdgcn_struct_tbuffer_load: 4246 return legalizeBufferLoad(MI, MRI, B, true, true); 4247 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4248 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4249 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4250 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4251 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4252 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4253 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4254 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4255 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4256 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4257 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4258 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4259 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4260 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4261 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4262 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4263 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4264 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4265 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4266 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4267 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4268 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4269 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4270 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4271 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4272 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4273 return legalizeBufferAtomic(MI, B, IntrID); 4274 case Intrinsic::amdgcn_atomic_inc: 4275 return legalizeAtomicIncDec(MI, B, true); 4276 case Intrinsic::amdgcn_atomic_dec: 4277 return legalizeAtomicIncDec(MI, B, false); 4278 case Intrinsic::trap: 4279 return legalizeTrapIntrinsic(MI, MRI, B); 4280 case Intrinsic::debugtrap: 4281 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4282 default: { 4283 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4284 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4285 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4286 return true; 4287 } 4288 } 4289 4290 return true; 4291 } 4292