1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPULegalizerInfo.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "SIMachineFunctionInfo.h" 18 19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 20 #include "llvm/CodeGen/TargetOpcodes.h" 21 #include "llvm/CodeGen/ValueTypes.h" 22 #include "llvm/IR/DerivedTypes.h" 23 #include "llvm/IR/Type.h" 24 #include "llvm/Support/Debug.h" 25 26 using namespace llvm; 27 using namespace LegalizeActions; 28 using namespace LegalizeMutations; 29 using namespace LegalityPredicates; 30 31 32 static LegalityPredicate isMultiple32(unsigned TypeIdx, 33 unsigned MaxSize = 512) { 34 return [=](const LegalityQuery &Query) { 35 const LLT Ty = Query.Types[TypeIdx]; 36 const LLT EltTy = Ty.getScalarType(); 37 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 38 }; 39 } 40 41 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 42 return [=](const LegalityQuery &Query) { 43 const LLT Ty = Query.Types[TypeIdx]; 44 return Ty.isVector() && 45 Ty.getNumElements() % 2 != 0 && 46 Ty.getElementType().getSizeInBits() < 32; 47 }; 48 } 49 50 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 51 return [=](const LegalityQuery &Query) { 52 const LLT Ty = Query.Types[TypeIdx]; 53 const LLT EltTy = Ty.getElementType(); 54 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 55 }; 56 } 

// Mutation: split the vector at TypeIdx into pieces of at most 64 bits and
// return the per-piece type. The element count is divided with rounding up,
// so for types whose size is not an even multiple of 64 the pieces may not
// exactly tile the original vector -- assumes the legalizer handles any
// leftover elements; TODO confirm for non-power-of-2 vectors.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Predicate: the type at TypeIdx is a vector wider than Size bits in total.
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

// Predicate: the type at TypeIdx is a vector with an odd element count.
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Build the legalization rule tables for the given subtarget. Rules are
// registered per generic opcode; the predicates/mutations above encode the
// recurring AMDGPU constraints (sizes that are multiples of 32 bits, a
// 512-bit register-tuple upper bound, 16-bit ops gated on has16BitInsts,
// packed ops gated on hasVOP3PInsts).
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
                                         const GCNTargetMachine &TM) {
  using namespace TargetOpcode;

  // Make an LLT pointer type for AS, sized per the target's pointer width
  // for that address space.
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);
  const LLT V8S16 = LLT::vector(8, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  // 64-bit vs. 32-bit pointer address spaces, used for cartesian-product
  // rules below.
  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .legalIf(isPointer(0));


  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  // S16 FP constants are only legal when 16-bit instructions exist.
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // Shared rule set for the plain FP arithmetic ops; 16-bit and packed
  // variants are appended below based on subtarget features.
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND})
    .legalFor({S32, S64})
    .scalarize(0);


  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);


  // Relative-size predicates used to equalize the integer and pointer
  // operand sizes of G_INTTOPTR / G_PTRTOINT below.
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  // Addrspacecast is only meaningful with a flat address space; handled in
  // legalizeAddrSpaceCast below.
  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    // Extending loads wider than 32 bits are narrowed to 32-bit pieces.
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    // Pre-CI has no 96-bit (dwordx3) memory instructions; split vectors.
    .fewerElementsIf([=, &ST](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               ST.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS;
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=, &ST](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          // XXX hasLoadX3
          return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);

        case 256:
        case 512:
          // TODO: constant loads
        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  // G_EXTRACT_VECTOR_ELT has the vector as operand 1 and the element as the
  // result; G_INSERT_VECTOR_ELT has them the other way round.
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &VecTy = Query.Types[VecTypeIdx];
          const LLT &IdxTy = Query.Types[IdxTypeIdx];
          return VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  // Element type must match the vector's element type exactly.
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  // G_EXTRACT has the big (source) type as operand 1; G_INSERT as operand 0.
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx));
  }

  // TODO: vectors of pointers
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    // FIXME: Sort of a hack to make progress on other legalizations.
    .legalIf([=](const LegalityQuery &Query) {
      return Query.Types[0].getScalarSizeInBits() <= 32 ||
             Query.Types[0].getScalarSizeInBits() == 64;
    });

  // TODO: Support any combination of v2s32
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalFor({{V4S32, V2S32},
               {V8S32, V2S32},
               {V8S32, V4S32},
               {V4S64, V2S64},
               {V4S16, V2S16},
               {V8S16, V2S16},
               {V8S16, V4S16},
               {LLT::vector(4, LocalPtr), LLT::vector(2, LocalPtr)},
               {LLT::vector(4, PrivatePtr), LLT::vector(2, PrivatePtr)}});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    // True when the type at TypeIdx is a vector whose element size is not a
    // power of 2 in the range [8, 64] bits.
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

// Dispatch for opcodes marked .custom() above. Currently only
// G_ADDRSPACE_CAST; any other opcode reports failure to the legalizer.
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

// Returns a virtual register holding the 32-bit aperture (high half of the
// 64-bit flat address) for the local or private address space: read from the
// hardware aperture registers via s_getreg when available, otherwise loaded
// from the queue descriptor.
unsigned AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Encode hwreg(id, offset, width) for the s_getreg immediate.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    unsigned ShiftAmt = MRI.createGenericVirtualRegister(S32);
    unsigned ApertureReg = MRI.createGenericVirtualRegister(S32);
    unsigned GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // The getreg result holds the base in its low bits; shift it into the
    // high half of a 32-bit aperture value.
    MIRBuilder.buildConstant(ShiftAmt, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt);

    return ApertureReg;
  }

  unsigned QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  unsigned LoadResult = MRI.createGenericVirtualRegister(S32);
  unsigned LoadAddr = AMDGPU::NoRegister;

  // Load the 32-bit aperture from queue_ptr + StructOffset.
  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

// Expand G_ADDRSPACE_CAST. No-op casts become G_BITCAST. flat -> local/private
// truncates to the low 32 bits (mapping flat null to the segment null value);
// local/private -> flat rebuilds the 64-bit pointer from the low 32 bits plus
// the segment aperture (mapping segment null to flat null).
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    unsigned SegmentNullReg = MRI.createGenericVirtualRegister(DstTy);
    unsigned FlatNullReg = MRI.createGenericVirtualRegister(SrcTy);

    MIRBuilder.buildConstant(SegmentNullReg, NullVal);
    MIRBuilder.buildConstant(FlatNullReg, 0);

    unsigned PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    // Select the segment null value when the flat pointer is null.
    unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNullReg);
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNullReg);

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  unsigned FlatNullReg = MRI.createGenericVirtualRegister(DstTy);
  unsigned SegmentNullReg = MRI.createGenericVirtualRegister(SrcTy);
  MIRBuilder.buildConstant(SegmentNullReg, TM.getNullPointerValue(SrcAS));
  MIRBuilder.buildConstant(FlatNullReg, TM.getNullPointerValue(DestAS));

  unsigned ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNullReg);

  unsigned BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  unsigned SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNullReg);

  MI.eraseFromParent();
  return true;
}