//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}
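
// Example of how the predicates and mutations above compose in the rules built
// below: moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) pads e.g.
// <3 x s16> out to <4 x s16>, and fewerEltsToSize64Vector splits e.g.
// <4 x s32> into 64-bit sized <2 x s32> pieces ((128 + 63) / 64 = 2 pieces,
// (4 + 1) / 2 = 2 elements each).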
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
                                         const GCNTargetMachine &TM) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);
  const LLT V8S16 = LLT::vector(8, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));


  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);
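
  // For the carry ops, type index 0 is the 32-bit result and type index 1 is
  // the 1-bit carry-out (which is also the carry-in for G_UADDE, G_SADDE,
  // G_USUBE and G_SSUBE).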
  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FCOPYSIGN)
    .legalForCartesianProduct({S16, S32, S64}, {S16, S32, S64})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);
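
  // s64 -> s64 integer to FP conversions are custom lowered (see
  // legalizeITOFP below): each 32-bit half is converted separately and the
  // results are recombined with ldexp.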
  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);


  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }
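
  // Load/store rules are keyed off the memory size: register types wider than
  // both 32 bits and the memory access are narrowed to s32, 96-bit vector
  // accesses are reduced to 64-bit <2 x s32> pieces on subtargets older than
  // SEA_ISLANDS (see the hasLoadX3 note below), and sub-32-bit memory accesses
  // are only legal with a 32-bit register type.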
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=, &ST](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               ST.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS;
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=, &ST](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          // XXX hasLoadX3
          return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);

        case 256:
        case 512:
          // TODO: constant loads
        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
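  // Natively, shifts operate on 32- or 64-bit values with a 32-bit shift
  // amount; subtargets with 16-bit instructions also get 16-bit (and, with
  // VOP3P, packed v2s16) variants.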
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &VecTy = Query.Types[VecTypeIdx];
          const LLT &IdxTy = Query.Types[IdxTypeIdx];
          return VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  // TODO: vectors of pointers
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    // FIXME: Sort of a hack to make progress on other legalizations.
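    // Accept any element size of 32 bits or fewer, plus 64-bit elements, so
    // that other operations can be legalized first.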
    .legalIf([=](const LegalityQuery &Query) {
      return Query.Types[0].getScalarSizeInBits() <= 32 ||
             Query.Types[0].getScalarSizeInBits() == 64;
    });

  // TODO: Support any combination of v2s32
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalFor({{V4S32, V2S32},
               {V8S32, V2S32},
               {V8S32, V4S32},
               {V4S64, V2S64},
               {V4S16, V2S16},
               {V8S16, V2S16},
               {V8S16, V4S16},
               {LLT::vector(4, LocalPtr), LLT::vector(2, LocalPtr)},
               {LLT::vector(4, PrivatePtr), LLT::vector(2, PrivatePtr)}});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

unsigned AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    unsigned ApertureReg = MRI.createGenericVirtualRegister(S32);
    unsigned GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  unsigned QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ?
      0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  unsigned LoadResult = MRI.createGenericVirtualRegister(S32);
  unsigned LoadAddr = AMDGPU::NoRegister;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    unsigned PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  unsigned ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  unsigned BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  unsigned SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
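  // The flat pointer is the 32-bit segment offset in the low half, with the
  // segment aperture (the high half of the segment's flat base address) in
  // the high half.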
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

// Lower G_FRINT for f64: adding and then subtracting copysign(2^52, x) rounds
// x to an integer in the current rounding mode, since doubles with magnitude
// >= 2^52 have no fractional bits; larger inputs are passed through unchanged.
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  unsigned Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  unsigned Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

// Extract the unbiased exponent from the high 32 bits (Hi) of an f64 value.
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  unsigned Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  unsigned Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
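  // The sign is bit 31 of the high word; it is carried into the result so
  // that inputs which truncate to zero (exponent < 0) keep their sign.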
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}