1 //===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run 10 // both before and after the DAG is legalized. 11 // 12 // This pass is not a substitute for the LLVM IR instcombine pass. This pass is 13 // primarily intended to handle simplification opportunities that are implicit 14 // in the LLVM IR and exposed by the various codegen lowering phases. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/ADT/APFloat.h" 19 #include "llvm/ADT/APInt.h" 20 #include "llvm/ADT/ArrayRef.h" 21 #include "llvm/ADT/DenseMap.h" 22 #include "llvm/ADT/IntervalMap.h" 23 #include "llvm/ADT/None.h" 24 #include "llvm/ADT/Optional.h" 25 #include "llvm/ADT/STLExtras.h" 26 #include "llvm/ADT/SetVector.h" 27 #include "llvm/ADT/SmallBitVector.h" 28 #include "llvm/ADT/SmallPtrSet.h" 29 #include "llvm/ADT/SmallSet.h" 30 #include "llvm/ADT/SmallVector.h" 31 #include "llvm/ADT/Statistic.h" 32 #include "llvm/Analysis/AliasAnalysis.h" 33 #include "llvm/Analysis/MemoryLocation.h" 34 #include "llvm/Analysis/TargetLibraryInfo.h" 35 #include "llvm/Analysis/VectorUtils.h" 36 #include "llvm/CodeGen/DAGCombine.h" 37 #include "llvm/CodeGen/ISDOpcodes.h" 38 #include "llvm/CodeGen/MachineFrameInfo.h" 39 #include "llvm/CodeGen/MachineFunction.h" 40 #include "llvm/CodeGen/MachineMemOperand.h" 41 #include "llvm/CodeGen/RuntimeLibcalls.h" 42 #include "llvm/CodeGen/SelectionDAG.h" 43 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" 44 #include "llvm/CodeGen/SelectionDAGNodes.h" 45 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 46 #include "llvm/CodeGen/TargetLowering.h" 47 #include "llvm/CodeGen/TargetRegisterInfo.h" 48 #include "llvm/CodeGen/TargetSubtargetInfo.h" 49 #include "llvm/CodeGen/ValueTypes.h" 50 #include "llvm/IR/Attributes.h" 51 #include "llvm/IR/Constant.h" 52 #include "llvm/IR/DataLayout.h" 53 #include "llvm/IR/DerivedTypes.h" 54 #include "llvm/IR/Function.h" 55 #include "llvm/IR/LLVMContext.h" 56 #include "llvm/IR/Metadata.h" 57 #include "llvm/Support/Casting.h" 58 #include "llvm/Support/CodeGen.h" 59 #include "llvm/Support/CommandLine.h" 60 #include "llvm/Support/Compiler.h" 61 #include "llvm/Support/Debug.h" 62 #include "llvm/Support/ErrorHandling.h" 63 #include "llvm/Support/KnownBits.h" 64 #include "llvm/Support/MachineValueType.h" 65 #include "llvm/Support/MathExtras.h" 66 #include "llvm/Support/raw_ostream.h" 67 #include "llvm/Target/TargetMachine.h" 68 #include "llvm/Target/TargetOptions.h" 69 #include <algorithm> 70 #include <cassert> 71 #include <cstdint> 72 #include <functional> 73 #include <iterator> 74 #include <string> 75 #include <tuple> 76 #include <utility> 77 78 using namespace llvm; 79 80 #define DEBUG_TYPE "dagcombine" 81 82 STATISTIC(NodesCombined , "Number of dag nodes combined"); 83 STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created"); 84 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created"); 85 STATISTIC(OpsNarrowed , "Number of load/op/store narrowed"); 86 STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int"); 87 STATISTIC(SlicedLoads, "Number of load sliced"); 88 
STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops"); 89 90 static cl::opt<bool> 91 CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, 92 cl::desc("Enable DAG combiner's use of IR alias analysis")); 93 94 static cl::opt<bool> 95 UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), 96 cl::desc("Enable DAG combiner's use of TBAA")); 97 98 #ifndef NDEBUG 99 static cl::opt<std::string> 100 CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden, 101 cl::desc("Only use DAG-combiner alias analysis in this" 102 " function")); 103 #endif 104 105 /// Hidden option to stress test load slicing, i.e., when this option 106 /// is enabled, load slicing bypasses most of its profitability guards. 107 static cl::opt<bool> 108 StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, 109 cl::desc("Bypass the profitability model of load slicing"), 110 cl::init(false)); 111 112 static cl::opt<bool> 113 MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), 114 cl::desc("DAG combiner may split indexing from loads")); 115 116 static cl::opt<bool> 117 EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true), 118 cl::desc("DAG combiner enable merging multiple stores " 119 "into a wider store")); 120 121 static cl::opt<unsigned> TokenFactorInlineLimit( 122 "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048), 123 cl::desc("Limit the number of operands to inline for Token Factors")); 124 125 static cl::opt<unsigned> StoreMergeDependenceLimit( 126 "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10), 127 cl::desc("Limit the number of times for the same StoreNode and RootNode " 128 "to bail out in store merging dependence check")); 129 130 static cl::opt<bool> EnableReduceLoadOpStoreWidth( 131 "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true), 132 cl::desc("DAG combiner enable reducing the width of load/op/store " 133 "sequence")); 134 135 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore( 136 "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true), 137 cl::desc("DAG combiner enable load/<replace bytes>/store with " 138 "a narrower store")); 139 140 namespace { 141 142 class DAGCombiner { 143 SelectionDAG &DAG; 144 const TargetLowering &TLI; 145 const SelectionDAGTargetInfo *STI; 146 CombineLevel Level; 147 CodeGenOpt::Level OptLevel; 148 bool LegalDAG = false; 149 bool LegalOperations = false; 150 bool LegalTypes = false; 151 bool ForCodeSize; 152 bool DisableGenericCombines; 153 154 /// Worklist of all of the nodes that need to be simplified. 155 /// 156 /// This must behave as a stack -- new nodes to process are pushed onto the 157 /// back and when processing we pop off of the back. 158 /// 159 /// The worklist will not contain duplicates but may contain null entries 160 /// due to nodes being deleted from the underlying DAG. 161 SmallVector<SDNode *, 64> Worklist; 162 163 /// Mapping from an SDNode to its position on the worklist. 164 /// 165 /// This is used to find and remove nodes from the worklist (by nulling 166 /// them) when they are deleted from the underlying DAG. It relies on 167 /// stable indices of nodes within the worklist. 168 DenseMap<SDNode *, unsigned> WorklistMap; 169 /// This records all nodes attempted to be added to the worklist since we 170 /// considered a new worklist entry. As we do not add duplicate nodes 171 /// to the worklist, this is different from the tail of the worklist.
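/// Entries whose use count has dropped to zero are deleted by
/// clearAddedDanglingWorklistEntries() before the next worklist node is
/// visited.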
172 SmallSetVector<SDNode *, 32> PruningList; 173 174 /// Set of nodes which have been combined (at least once). 175 /// 176 /// This is used to allow us to reliably add any operands of a DAG node 177 /// which have not yet been combined to the worklist. 178 SmallPtrSet<SDNode *, 32> CombinedNodes; 179 180 /// Map from candidate StoreNode to the pair of RootNode and count. 181 /// The count is used to track how many times we have seen the StoreNode 182 /// with the same RootNode bail out in dependence check. If we have seen 183 /// the bail out for the same pair many times over a limit, we won't 184 /// consider the StoreNode with the same RootNode as store merging 185 /// candidate again. 186 DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap; 187 188 // AA - Used for DAG load/store alias analysis. 189 AliasAnalysis *AA; 190 191 /// When an instruction is simplified, add all users of the instruction to 192 /// the work lists because they might get more simplified now. 193 void AddUsersToWorklist(SDNode *N) { 194 for (SDNode *Node : N->uses()) 195 AddToWorklist(Node); 196 } 197 198 /// Convenient shorthand to add a node and all of its users to the worklist. 199 void AddToWorklistWithUsers(SDNode *N) { 200 AddUsersToWorklist(N); 201 AddToWorklist(N); 202 } 203 204 // Prune potentially dangling nodes. This is called after 205 // any visit to a node, but should also be called during a visit after any 206 // failed combine which may have created a DAG node. 207 void clearAddedDanglingWorklistEntries() { 208 // Check any nodes added to the worklist to see if they are prunable. 209 while (!PruningList.empty()) { 210 auto *N = PruningList.pop_back_val(); 211 if (N->use_empty()) 212 recursivelyDeleteUnusedNodes(N); 213 } 214 } 215 216 SDNode *getNextWorklistEntry() { 217 // Before we do any work, remove nodes that are not in use. 218 clearAddedDanglingWorklistEntries(); 219 SDNode *N = nullptr; 220 // The Worklist holds the SDNodes in order, but it may contain null 221 // entries. 222 while (!N && !Worklist.empty()) { 223 N = Worklist.pop_back_val(); 224 } 225 226 if (N) { 227 bool GoodWorklistEntry = WorklistMap.erase(N); 228 (void)GoodWorklistEntry; 229 assert(GoodWorklistEntry && 230 "Found a worklist entry without a corresponding map entry!"); 231 } 232 return N; 233 } 234 235 /// Call the node-specific routine that folds each particular type of node. 236 SDValue visit(SDNode *N); 237 238 public: 239 DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) 240 : DAG(D), TLI(D.getTargetLoweringInfo()), 241 STI(D.getSubtarget().getSelectionDAGInfo()), 242 Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) { 243 ForCodeSize = DAG.shouldOptForSize(); 244 DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel); 245 246 MaximumLegalStoreInBits = 0; 247 // We use the minimum store size here, since that's all we can guarantee 248 // for the scalable vector types. 249 for (MVT VT : MVT::all_valuetypes()) 250 if (EVT(VT).isSimple() && VT != MVT::Other && 251 TLI.isTypeLegal(EVT(VT)) && 252 VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits) 253 MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize(); 254 } 255 256 void ConsiderForPruning(SDNode *N) { 257 // Mark this for potential pruning. 258 PruningList.insert(N); 259 } 260 261 /// Add to the worklist making sure its instance is at the back (next to be 262 /// processed).
263 void AddToWorklist(SDNode *N) { 264 assert(N->getOpcode() != ISD::DELETED_NODE && 265 "Deleted Node added to Worklist"); 266 267 // Skip handle nodes as they can't usefully be combined and confuse the 268 // zero-use deletion strategy. 269 if (N->getOpcode() == ISD::HANDLENODE) 270 return; 271 272 ConsiderForPruning(N); 273 274 if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) 275 Worklist.push_back(N); 276 } 277 278 /// Remove all instances of N from the worklist. 279 void removeFromWorklist(SDNode *N) { 280 CombinedNodes.erase(N); 281 PruningList.remove(N); 282 StoreRootCountMap.erase(N); 283 284 auto It = WorklistMap.find(N); 285 if (It == WorklistMap.end()) 286 return; // Not in the worklist. 287 288 // Null out the entry rather than erasing it to avoid a linear operation. 289 Worklist[It->second] = nullptr; 290 WorklistMap.erase(It); 291 } 292 293 void deleteAndRecombine(SDNode *N); 294 bool recursivelyDeleteUnusedNodes(SDNode *N); 295 296 /// Replaces all uses of the results of one DAG node with new values. 297 SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, 298 bool AddTo = true); 299 300 /// Replaces all uses of the results of one DAG node with new values. 301 SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) { 302 return CombineTo(N, &Res, 1, AddTo); 303 } 304 305 /// Replaces all uses of the results of one DAG node with new values. 306 SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, 307 bool AddTo = true) { 308 SDValue To[] = { Res0, Res1 }; 309 return CombineTo(N, To, 2, AddTo); 310 } 311 312 void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); 313 314 private: 315 unsigned MaximumLegalStoreInBits; 316 317 /// Check the specified integer node value to see if it can be simplified or 318 /// if things it uses can be simplified by bit propagation. 319 /// If so, return true. 320 bool SimplifyDemandedBits(SDValue Op) { 321 unsigned BitWidth = Op.getScalarValueSizeInBits(); 322 APInt DemandedBits = APInt::getAllOnesValue(BitWidth); 323 return SimplifyDemandedBits(Op, DemandedBits); 324 } 325 326 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) { 327 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); 328 KnownBits Known; 329 if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false)) 330 return false; 331 332 // Revisit the node. 333 AddToWorklist(Op.getNode()); 334 335 CommitTargetLoweringOpt(TLO); 336 return true; 337 } 338 339 /// Check the specified vector node value to see if it can be simplified or 340 /// if things it uses can be simplified as it only uses some of the 341 /// elements. If so, return true. 342 bool SimplifyDemandedVectorElts(SDValue Op) { 343 // TODO: For now just pretend it cannot be simplified. 
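// A scalable vector has no compile-time-known element count, so a
// fixed-width DemandedElts mask cannot be built for it below.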
344 if (Op.getValueType().isScalableVector()) 345 return false; 346 347 unsigned NumElts = Op.getValueType().getVectorNumElements(); 348 APInt DemandedElts = APInt::getAllOnesValue(NumElts); 349 return SimplifyDemandedVectorElts(Op, DemandedElts); 350 } 351 352 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, 353 const APInt &DemandedElts, 354 bool AssumeSingleUse = false); 355 bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts, 356 bool AssumeSingleUse = false); 357 358 bool CombineToPreIndexedLoadStore(SDNode *N); 359 bool CombineToPostIndexedLoadStore(SDNode *N); 360 SDValue SplitIndexingFromLoad(LoadSDNode *LD); 361 bool SliceUpLoad(SDNode *N); 362 363 // Scalars have size 0 to distinguish from singleton vectors. 364 SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD); 365 bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); 366 bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val); 367 368 /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed 369 /// load. 370 /// 371 /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. 372 /// \param InVecVT type of the input vector to EVE with bitcasts resolved. 373 /// \param EltNo index of the vector element to load. 374 /// \param OriginalLoad load that EVE came from to be replaced. 375 /// \returns EVE on success SDValue() on failure. 376 SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, 377 SDValue EltNo, 378 LoadSDNode *OriginalLoad); 379 void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); 380 SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); 381 SDValue SExtPromoteOperand(SDValue Op, EVT PVT); 382 SDValue ZExtPromoteOperand(SDValue Op, EVT PVT); 383 SDValue PromoteIntBinOp(SDValue Op); 384 SDValue PromoteIntShiftOp(SDValue Op); 385 SDValue PromoteExtend(SDValue Op); 386 bool PromoteLoad(SDValue Op); 387 388 /// Call the node-specific routine that knows how to fold each 389 /// particular type of node. If that doesn't do anything, try the 390 /// target-specific DAG combines. 391 SDValue combine(SDNode *N); 392 393 // Visitation implementation - Implement dag node combining for different 394 // node types. The semantics are as follows: 395 // Return Value: 396 // SDValue.getNode() == 0 - No change was made 397 // SDValue.getNode() == N - N was replaced, is dead and has been handled. 398 // otherwise - N should be replaced by the returned Operand. 
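// For example, returning SDValue() from a visit routine leaves N untouched,
// while returning a different node causes Run() to replace all uses of N
// with it and to delete N once it becomes dead.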
399 // 400 SDValue visitTokenFactor(SDNode *N); 401 SDValue visitMERGE_VALUES(SDNode *N); 402 SDValue visitADD(SDNode *N); 403 SDValue visitADDLike(SDNode *N); 404 SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); 405 SDValue visitSUB(SDNode *N); 406 SDValue visitADDSAT(SDNode *N); 407 SDValue visitSUBSAT(SDNode *N); 408 SDValue visitADDC(SDNode *N); 409 SDValue visitADDO(SDNode *N); 410 SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N); 411 SDValue visitSUBC(SDNode *N); 412 SDValue visitSUBO(SDNode *N); 413 SDValue visitADDE(SDNode *N); 414 SDValue visitADDCARRY(SDNode *N); 415 SDValue visitSADDO_CARRY(SDNode *N); 416 SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); 417 SDValue visitSUBE(SDNode *N); 418 SDValue visitSUBCARRY(SDNode *N); 419 SDValue visitSSUBO_CARRY(SDNode *N); 420 SDValue visitMUL(SDNode *N); 421 SDValue visitMULFIX(SDNode *N); 422 SDValue useDivRem(SDNode *N); 423 SDValue visitSDIV(SDNode *N); 424 SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N); 425 SDValue visitUDIV(SDNode *N); 426 SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N); 427 SDValue visitREM(SDNode *N); 428 SDValue visitMULHU(SDNode *N); 429 SDValue visitMULHS(SDNode *N); 430 SDValue visitSMUL_LOHI(SDNode *N); 431 SDValue visitUMUL_LOHI(SDNode *N); 432 SDValue visitMULO(SDNode *N); 433 SDValue visitIMINMAX(SDNode *N); 434 SDValue visitAND(SDNode *N); 435 SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N); 436 SDValue visitOR(SDNode *N); 437 SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N); 438 SDValue visitXOR(SDNode *N); 439 SDValue SimplifyVBinOp(SDNode *N); 440 SDValue visitSHL(SDNode *N); 441 SDValue visitSRA(SDNode *N); 442 SDValue visitSRL(SDNode *N); 443 SDValue visitFunnelShift(SDNode *N); 444 SDValue visitRotate(SDNode *N); 445 SDValue visitABS(SDNode *N); 446 SDValue visitBSWAP(SDNode *N); 447 SDValue visitBITREVERSE(SDNode *N); 448 SDValue visitCTLZ(SDNode *N); 449 SDValue visitCTLZ_ZERO_UNDEF(SDNode *N); 450 SDValue visitCTTZ(SDNode *N); 451 SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); 452 SDValue visitCTPOP(SDNode *N); 453 SDValue visitSELECT(SDNode *N); 454 SDValue visitVSELECT(SDNode *N); 455 SDValue visitSELECT_CC(SDNode *N); 456 SDValue visitSETCC(SDNode *N); 457 SDValue visitSETCCCARRY(SDNode *N); 458 SDValue visitSIGN_EXTEND(SDNode *N); 459 SDValue visitZERO_EXTEND(SDNode *N); 460 SDValue visitANY_EXTEND(SDNode *N); 461 SDValue visitAssertExt(SDNode *N); 462 SDValue visitAssertAlign(SDNode *N); 463 SDValue visitSIGN_EXTEND_INREG(SDNode *N); 464 SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); 465 SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); 466 SDValue visitTRUNCATE(SDNode *N); 467 SDValue visitBITCAST(SDNode *N); 468 SDValue visitFREEZE(SDNode *N); 469 SDValue visitBUILD_PAIR(SDNode *N); 470 SDValue visitFADD(SDNode *N); 471 SDValue visitSTRICT_FADD(SDNode *N); 472 SDValue visitFSUB(SDNode *N); 473 SDValue visitFMUL(SDNode *N); 474 SDValue visitFMA(SDNode *N); 475 SDValue visitFDIV(SDNode *N); 476 SDValue visitFREM(SDNode *N); 477 SDValue visitFSQRT(SDNode *N); 478 SDValue visitFCOPYSIGN(SDNode *N); 479 SDValue visitFPOW(SDNode *N); 480 SDValue visitSINT_TO_FP(SDNode *N); 481 SDValue visitUINT_TO_FP(SDNode *N); 482 SDValue visitFP_TO_SINT(SDNode *N); 483 SDValue visitFP_TO_UINT(SDNode *N); 484 SDValue visitFP_ROUND(SDNode *N); 485 SDValue visitFP_EXTEND(SDNode *N); 486 SDValue visitFNEG(SDNode *N); 487 SDValue visitFABS(SDNode *N); 488 SDValue visitFCEIL(SDNode *N); 489 SDValue 
visitFTRUNC(SDNode *N); 490 SDValue visitFFLOOR(SDNode *N); 491 SDValue visitFMINNUM(SDNode *N); 492 SDValue visitFMAXNUM(SDNode *N); 493 SDValue visitFMINIMUM(SDNode *N); 494 SDValue visitFMAXIMUM(SDNode *N); 495 SDValue visitBRCOND(SDNode *N); 496 SDValue visitBR_CC(SDNode *N); 497 SDValue visitLOAD(SDNode *N); 498 499 SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); 500 SDValue replaceStoreOfFPConstant(StoreSDNode *ST); 501 502 SDValue visitSTORE(SDNode *N); 503 SDValue visitLIFETIME_END(SDNode *N); 504 SDValue visitINSERT_VECTOR_ELT(SDNode *N); 505 SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); 506 SDValue visitBUILD_VECTOR(SDNode *N); 507 SDValue visitCONCAT_VECTORS(SDNode *N); 508 SDValue visitEXTRACT_SUBVECTOR(SDNode *N); 509 SDValue visitVECTOR_SHUFFLE(SDNode *N); 510 SDValue visitSCALAR_TO_VECTOR(SDNode *N); 511 SDValue visitINSERT_SUBVECTOR(SDNode *N); 512 SDValue visitMLOAD(SDNode *N); 513 SDValue visitMSTORE(SDNode *N); 514 SDValue visitMGATHER(SDNode *N); 515 SDValue visitMSCATTER(SDNode *N); 516 SDValue visitFP_TO_FP16(SDNode *N); 517 SDValue visitFP16_TO_FP(SDNode *N); 518 SDValue visitVECREDUCE(SDNode *N); 519 520 SDValue visitFADDForFMACombine(SDNode *N); 521 SDValue visitFSUBForFMACombine(SDNode *N); 522 SDValue visitFMULForFMADistributiveCombine(SDNode *N); 523 524 SDValue XformToShuffleWithZero(SDNode *N); 525 bool reassociationCanBreakAddressingModePattern(unsigned Opc, 526 const SDLoc &DL, SDValue N0, 527 SDValue N1); 528 SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0, 529 SDValue N1); 530 SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, 531 SDValue N1, SDNodeFlags Flags); 532 533 SDValue visitShiftByConstant(SDNode *N); 534 535 SDValue foldSelectOfConstants(SDNode *N); 536 SDValue foldVSelectOfConstants(SDNode *N); 537 SDValue foldBinOpIntoSelect(SDNode *BO); 538 bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); 539 SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N); 540 SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); 541 SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, 542 SDValue N2, SDValue N3, ISD::CondCode CC, 543 bool NotExtCompare = false); 544 SDValue convertSelectOfFPConstantsToLoadOffset( 545 const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, 546 ISD::CondCode CC); 547 SDValue foldSignChangeInBitcast(SDNode *N); 548 SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, 549 SDValue N2, SDValue N3, ISD::CondCode CC); 550 SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, 551 const SDLoc &DL); 552 SDValue foldSubToUSubSat(EVT DstVT, SDNode *N); 553 SDValue unfoldMaskedMerge(SDNode *N); 554 SDValue unfoldExtremeBitClearingToShifts(SDNode *N); 555 SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, 556 const SDLoc &DL, bool foldBooleans); 557 SDValue rebuildSetCC(SDValue N); 558 559 bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, 560 SDValue &CC, bool MatchStrict = false) const; 561 bool isOneUseSetCC(SDValue N) const; 562 563 SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, 564 unsigned HiOp); 565 SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); 566 SDValue CombineExtLoad(SDNode *N); 567 SDValue CombineZExtLogicopShiftLoad(SDNode *N); 568 SDValue combineRepeatedFPDivisors(SDNode *N); 569 SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); 570 SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); 571 SDValue BuildSDIV(SDNode *N); 572 
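// Division-by-constant lowering helpers: BuildSDIV/BuildUDIV typically expand
// the division into a multiply by a "magic" constant plus shifts, while
// BuildSDIVPow2 covers the power-of-two divisor case without a multiply.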
SDValue BuildSDIVPow2(SDNode *N); 573 SDValue BuildUDIV(SDNode *N); 574 SDValue BuildLogBase2(SDValue V, const SDLoc &DL); 575 SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags); 576 SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); 577 SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); 578 SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); 579 SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations, 580 SDNodeFlags Flags, bool Reciprocal); 581 SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations, 582 SDNodeFlags Flags, bool Reciprocal); 583 SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, 584 bool DemandHighBits = true); 585 SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); 586 SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, 587 SDValue InnerPos, SDValue InnerNeg, 588 unsigned PosOpcode, unsigned NegOpcode, 589 const SDLoc &DL); 590 SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg, 591 SDValue InnerPos, SDValue InnerNeg, 592 unsigned PosOpcode, unsigned NegOpcode, 593 const SDLoc &DL); 594 SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); 595 SDValue MatchLoadCombine(SDNode *N); 596 SDValue mergeTruncStores(StoreSDNode *N); 597 SDValue ReduceLoadWidth(SDNode *N); 598 SDValue ReduceLoadOpStoreWidth(SDNode *N); 599 SDValue splitMergedValStore(StoreSDNode *ST); 600 SDValue TransformFPLoadStorePair(SDNode *N); 601 SDValue convertBuildVecZextToZext(SDNode *N); 602 SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); 603 SDValue reduceBuildVecTruncToBitCast(SDNode *N); 604 SDValue reduceBuildVecToShuffle(SDNode *N); 605 SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, 606 ArrayRef<int> VectorMask, SDValue VecIn1, 607 SDValue VecIn2, unsigned LeftIdx, 608 bool DidSplitVec); 609 SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast); 610 611 /// Walk up chain skipping non-aliasing memory nodes, 612 /// looking for aliasing nodes and adding them to the Aliases vector. 613 void GatherAllAliases(SDNode *N, SDValue OriginalChain, 614 SmallVectorImpl<SDValue> &Aliases); 615 616 /// Return true if there is any possibility that the two addresses overlap. 617 bool isAlias(SDNode *Op0, SDNode *Op1) const; 618 619 /// Walk up chain skipping non-aliasing memory nodes, looking for a better 620 /// chain (aliasing node). 621 SDValue FindBetterChain(SDNode *N, SDValue Chain); 622 623 /// Try to replace a store and any possibly adjacent stores on 624 /// consecutive chains with better chains. Return true only if St is 625 /// replaced. 626 /// 627 /// Notice that other chains may still be replaced even if the function 628 /// returns false. 629 bool findBetterNeighborChains(StoreSDNode *St); 630 631 // Helper for findBetterNeighborChains. Walk up the store chain and add 632 // additional chained stores that do not overlap and can be parallelized. 633 bool parallelizeChainedStores(StoreSDNode *St); 634 635 /// Holds a pointer to an LSBaseSDNode as well as information on where it 636 /// is located in a sequence of memory operations connected by a chain. 637 struct MemOpLink { 638 // Ptr to the mem node. 639 LSBaseSDNode *MemNode; 640 641 // Offset from the base ptr. 642 int64_t OffsetFromBase; 643 644 MemOpLink(LSBaseSDNode *N, int64_t Offset) 645 : MemNode(N), OffsetFromBase(Offset) {} 646 }; 647 648 // Classify the origin of a stored value.
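// Store merging only knows how to recreate these sources when forming a
// wider store: constant data, extracted vector elements, and loaded values.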
649 enum class StoreSource { Unknown, Constant, Extract, Load }; 650 StoreSource getStoreSource(SDValue StoreVal) { 651 switch (StoreVal.getOpcode()) { 652 case ISD::Constant: 653 case ISD::ConstantFP: 654 return StoreSource::Constant; 655 case ISD::EXTRACT_VECTOR_ELT: 656 case ISD::EXTRACT_SUBVECTOR: 657 return StoreSource::Extract; 658 case ISD::LOAD: 659 return StoreSource::Load; 660 default: 661 return StoreSource::Unknown; 662 } 663 } 664 665 /// This is a helper function for visitMUL to check the profitability 666 /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). 667 /// MulNode is the original multiply, AddNode is (add x, c1), 668 /// and ConstNode is c2. 669 bool isMulAddWithConstProfitable(SDNode *MulNode, 670 SDValue &AddNode, 671 SDValue &ConstNode); 672 673 /// This is a helper function for visitAND and visitZERO_EXTEND. Returns 674 /// true if the (and (load x) c) pattern matches an extload. ExtVT returns 675 /// the type of the loaded value to be extended. 676 bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, 677 EVT LoadResultTy, EVT &ExtVT); 678 679 /// Helper function to calculate whether the given Load/Store can have its 680 /// width reduced to ExtVT. 681 bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType, 682 EVT &MemVT, unsigned ShAmt = 0); 683 684 /// Used by BackwardsPropagateMask to find suitable loads. 685 bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads, 686 SmallPtrSetImpl<SDNode*> &NodesWithConsts, 687 ConstantSDNode *Mask, SDNode *&NodeToMask); 688 /// Attempt to propagate a given AND node back to load leaves so that they 689 /// can be combined into narrow loads. 690 bool BackwardsPropagateMask(SDNode *N); 691 692 /// Helper function for mergeConsecutiveStores which merges the component 693 /// store chains. 694 SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, 695 unsigned NumStores); 696 697 /// This is a helper function for mergeConsecutiveStores. When the source 698 /// elements of the consecutive stores are all constants or all extracted 699 /// vector elements, try to merge them into one larger store introducing 700 /// bitcasts if necessary. \return True if a merged store was created. 701 bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, 702 EVT MemVT, unsigned NumStores, 703 bool IsConstantSrc, bool UseVector, 704 bool UseTrunc); 705 706 /// This is a helper function for mergeConsecutiveStores. Stores that 707 /// potentially may be merged with St are placed in StoreNodes. RootNode is 708 /// a chain predecessor to all store candidates. 709 void getStoreMergeCandidates(StoreSDNode *St, 710 SmallVectorImpl<MemOpLink> &StoreNodes, 711 SDNode *&Root); 712 713 /// Helper function for mergeConsecutiveStores. Checks if candidate stores 714 /// have indirect dependency through their operands. RootNode is the 715 /// predecessor to all stores calculated by getStoreMergeCandidates and is 716 /// used to prune the dependency check. \return True if safe to merge. 717 bool checkMergeStoreCandidatesForDependencies( 718 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores, 719 SDNode *RootNode); 720 721 /// This is a helper function for mergeConsecutiveStores. Given a list of 722 /// store candidates, find the first N that are consecutive in memory. 723 /// Returns 0 if there are not at least 2 consecutive stores to try merging. 
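/// "Consecutive" means that, after sorting the candidates by offset, each
/// store's offset is exactly ElementSizeBytes past the previous one.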
724 unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes, 725 int64_t ElementSizeBytes) const; 726 727 /// This is a helper function for mergeConsecutiveStores. It is used for 728 /// store chains that are composed entirely of constant values. 729 bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes, 730 unsigned NumConsecutiveStores, 731 EVT MemVT, SDNode *Root, bool AllowVectors); 732 733 /// This is a helper function for mergeConsecutiveStores. It is used for 734 /// store chains that are composed entirely of extracted vector elements. 735 /// When extracting multiple vector elements, try to store them in one 736 /// vector store rather than a sequence of scalar stores. 737 bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes, 738 unsigned NumConsecutiveStores, EVT MemVT, 739 SDNode *Root); 740 741 /// This is a helper function for mergeConsecutiveStores. It is used for 742 /// store chains that are composed entirely of loaded values. 743 bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, 744 unsigned NumConsecutiveStores, EVT MemVT, 745 SDNode *Root, bool AllowVectors, 746 bool IsNonTemporalStore, bool IsNonTemporalLoad); 747 748 /// Merge consecutive store operations into a wide store. 749 /// This optimization uses wide integers or vectors when possible. 750 /// \return true if stores were merged. 751 bool mergeConsecutiveStores(StoreSDNode *St); 752 753 /// Try to transform a truncation where C is a constant: 754 /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) 755 /// 756 /// \p N needs to be a truncation and its first operand an AND. Other 757 /// requirements are checked by the function (e.g. that trunc is 758 /// single-use) and if missed an empty SDValue is returned. 759 SDValue distributeTruncateThroughAnd(SDNode *N); 760 761 /// Helper function to determine whether the target supports operation 762 /// given by \p Opcode for type \p VT, that is, whether the operation 763 /// is legal or custom before legalizing operations, and whether is 764 /// legal (but not custom) after legalization. 765 bool hasOperation(unsigned Opcode, EVT VT) { 766 return TLI.isOperationLegalOrCustom(Opcode, VT, LegalOperations); 767 } 768 769 public: 770 /// Runs the dag combiner on all nodes in the work list 771 void Run(CombineLevel AtLevel); 772 773 SelectionDAG &getDAG() const { return DAG; } 774 775 /// Returns a type large enough to hold any valid shift amount - before type 776 /// legalization these can be huge. 777 EVT getShiftAmountTy(EVT LHSTy) { 778 assert(LHSTy.isInteger() && "Shift amount is not an integer type!"); 779 return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes); 780 } 781 782 /// This method returns true if we are running before type legalization or 783 /// if the specified VT is legal. 784 bool isTypeLegal(const EVT &VT) { 785 if (!LegalTypes) return true; 786 return TLI.isTypeLegal(VT); 787 } 788 789 /// Convenience wrapper around TargetLowering::getSetCCResultType 790 EVT getSetCCResultType(EVT VT) const { 791 return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 792 } 793 794 void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, 795 SDValue OrigLoad, SDValue ExtLoad, 796 ISD::NodeType ExtType); 797 }; 798 799 /// This class is a DAGUpdateListener that removes any deleted 800 /// nodes from the worklist. 
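/// An instance is created around each batch of use replacements so that
/// pointers to deleted nodes never linger on the worklist.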
801 class WorklistRemover : public SelectionDAG::DAGUpdateListener { 802 DAGCombiner &DC; 803 804 public: 805 explicit WorklistRemover(DAGCombiner &dc) 806 : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} 807 808 void NodeDeleted(SDNode *N, SDNode *E) override { 809 DC.removeFromWorklist(N); 810 } 811 }; 812 813 class WorklistInserter : public SelectionDAG::DAGUpdateListener { 814 DAGCombiner &DC; 815 816 public: 817 explicit WorklistInserter(DAGCombiner &dc) 818 : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} 819 820 // FIXME: Ideally we could add N to the worklist, but this causes exponential 821 // compile time costs in large DAGs, e.g. Halide. 822 void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); } 823 }; 824 825 } // end anonymous namespace 826 827 //===----------------------------------------------------------------------===// 828 // TargetLowering::DAGCombinerInfo implementation 829 //===----------------------------------------------------------------------===// 830 831 void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) { 832 ((DAGCombiner*)DC)->AddToWorklist(N); 833 } 834 835 SDValue TargetLowering::DAGCombinerInfo:: 836 CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) { 837 return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo); 838 } 839 840 SDValue TargetLowering::DAGCombinerInfo:: 841 CombineTo(SDNode *N, SDValue Res, bool AddTo) { 842 return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo); 843 } 844 845 SDValue TargetLowering::DAGCombinerInfo:: 846 CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { 847 return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); 848 } 849 850 bool TargetLowering::DAGCombinerInfo:: 851 recursivelyDeleteUnusedNodes(SDNode *N) { 852 return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N); 853 } 854 855 void TargetLowering::DAGCombinerInfo:: 856 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { 857 return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); 858 } 859 860 //===----------------------------------------------------------------------===// 861 // Helper Functions 862 //===----------------------------------------------------------------------===// 863 864 void DAGCombiner::deleteAndRecombine(SDNode *N) { 865 removeFromWorklist(N); 866 867 // If the operands of this node are only used by the node, they will now be 868 // dead. Make sure to re-visit them and recursively delete dead nodes. 869 for (const SDValue &Op : N->ops()) 870 // For an operand generating multiple values, one of the values may 871 // become dead allowing further simplification (e.g. split index 872 // arithmetic from an indexed load). 873 if (Op->hasOneUse() || Op->getNumValues() > 1) 874 AddToWorklist(Op.getNode()); 875 876 DAG.DeleteNode(N); 877 } 878 879 // APInts must be the same size for most operations, this helper 880 // function zero extends the shorter of the pair so that they match. 881 // We provide an Offset so that we can create bitwidths that won't overflow. 882 static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) { 883 unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth()); 884 LHS = LHS.zextOrSelf(Bits); 885 RHS = RHS.zextOrSelf(Bits); 886 } 887 888 // Return true if this node is a setcc, or is a select_cc 889 // that selects between the target values used for true and false, making it 890 // equivalent to a setcc. 
Also, set the incoming LHS, RHS, and CC references to 891 // the appropriate nodes based on the type of node we are checking. This 892 // simplifies life a bit for the callers. 893 bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, 894 SDValue &CC, bool MatchStrict) const { 895 if (N.getOpcode() == ISD::SETCC) { 896 LHS = N.getOperand(0); 897 RHS = N.getOperand(1); 898 CC = N.getOperand(2); 899 return true; 900 } 901 902 if (MatchStrict && 903 (N.getOpcode() == ISD::STRICT_FSETCC || 904 N.getOpcode() == ISD::STRICT_FSETCCS)) { 905 LHS = N.getOperand(1); 906 RHS = N.getOperand(2); 907 CC = N.getOperand(3); 908 return true; 909 } 910 911 if (N.getOpcode() != ISD::SELECT_CC || 912 !TLI.isConstTrueVal(N.getOperand(2).getNode()) || 913 !TLI.isConstFalseVal(N.getOperand(3).getNode())) 914 return false; 915 916 if (TLI.getBooleanContents(N.getValueType()) == 917 TargetLowering::UndefinedBooleanContent) 918 return false; 919 920 LHS = N.getOperand(0); 921 RHS = N.getOperand(1); 922 CC = N.getOperand(4); 923 return true; 924 } 925 926 /// Return true if this is a SetCC-equivalent operation with only one use. 927 /// If this is true, it allows the users to invert the operation for free when 928 /// it is profitable to do so. 929 bool DAGCombiner::isOneUseSetCC(SDValue N) const { 930 SDValue N0, N1, N2; 931 if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) 932 return true; 933 return false; 934 } 935 936 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) { 937 if (!ScalarTy.isSimple()) 938 return false; 939 940 uint64_t MaskForTy = 0ULL; 941 switch (ScalarTy.getSimpleVT().SimpleTy) { 942 case MVT::i8: 943 MaskForTy = 0xFFULL; 944 break; 945 case MVT::i16: 946 MaskForTy = 0xFFFFULL; 947 break; 948 case MVT::i32: 949 MaskForTy = 0xFFFFFFFFULL; 950 break; 951 default: 952 return false; 953 break; 954 } 955 956 APInt Val; 957 if (ISD::isConstantSplatVector(N, Val)) 958 return Val.getLimitedValue() == MaskForTy; 959 960 return false; 961 } 962 963 // Determines if it is a constant integer or a splat/build vector of constant 964 // integers (and undefs). 965 // Do not permit build vector implicit truncation. 966 static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { 967 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N)) 968 return !(Const->isOpaque() && NoOpaques); 969 if (N.getOpcode() != ISD::BUILD_VECTOR && N.getOpcode() != ISD::SPLAT_VECTOR) 970 return false; 971 unsigned BitWidth = N.getScalarValueSizeInBits(); 972 for (const SDValue &Op : N->op_values()) { 973 if (Op.isUndef()) 974 continue; 975 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op); 976 if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth || 977 (Const->isOpaque() && NoOpaques)) 978 return false; 979 } 980 return true; 981 } 982 983 // Determines if a BUILD_VECTOR is composed of all constants, possibly mixed with 984 // undefs. 985 static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) { 986 if (V.getOpcode() != ISD::BUILD_VECTOR) 987 return false; 988 return isConstantOrConstantVector(V, NoOpaques) || 989 ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()); 990 } 991 992 // Determine if this is an indexed load with an opaque target constant index.
993 static bool canSplitIdx(LoadSDNode *LD) { 994 return MaySplitLoadIndex && 995 (LD->getOperand(2).getOpcode() != ISD::TargetConstant || 996 !cast<ConstantSDNode>(LD->getOperand(2))->isOpaque()); 997 } 998 999 bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, 1000 const SDLoc &DL, 1001 SDValue N0, 1002 SDValue N1) { 1003 // Currently this only tries to ensure we don't undo the GEP splits done by 1004 // CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this, 1005 // we check if the following transformation would be problematic: 1006 // (load/store (add, (add, x, offset1), offset2)) -> 1007 // (load/store (add, x, offset1+offset2)). 1008 1009 if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD) 1010 return false; 1011 1012 if (N0.hasOneUse()) 1013 return false; 1014 1015 auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 1016 auto *C2 = dyn_cast<ConstantSDNode>(N1); 1017 if (!C1 || !C2) 1018 return false; 1019 1020 const APInt &C1APIntVal = C1->getAPIntValue(); 1021 const APInt &C2APIntVal = C2->getAPIntValue(); 1022 if (C1APIntVal.getBitWidth() > 64 || C2APIntVal.getBitWidth() > 64) 1023 return false; 1024 1025 const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal; 1026 if (CombinedValueIntVal.getBitWidth() > 64) 1027 return false; 1028 const int64_t CombinedValue = CombinedValueIntVal.getSExtValue(); 1029 1030 for (SDNode *Node : N0->uses()) { 1031 auto LoadStore = dyn_cast<MemSDNode>(Node); 1032 if (LoadStore) { 1033 // Is x[offset2] already not a legal addressing mode? If so then 1034 // reassociating the constants breaks nothing (we test offset2 because 1035 // that's the one we hope to fold into the load or store). 1036 TargetLoweringBase::AddrMode AM; 1037 AM.HasBaseReg = true; 1038 AM.BaseOffs = C2APIntVal.getSExtValue(); 1039 EVT VT = LoadStore->getMemoryVT(); 1040 unsigned AS = LoadStore->getAddressSpace(); 1041 Type *AccessTy = VT.getTypeForEVT(*DAG.getContext()); 1042 if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS)) 1043 continue; 1044 1045 // Would x[offset1+offset2] still be a legal addressing mode? 1046 AM.BaseOffs = CombinedValue; 1047 if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS)) 1048 return true; 1049 } 1050 } 1051 1052 return false; 1053 } 1054 1055 // Helper for DAGCombiner::reassociateOps. Try to reassociate an expression 1056 // such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc. 1057 SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, 1058 SDValue N0, SDValue N1) { 1059 EVT VT = N0.getValueType(); 1060 1061 if (N0.getOpcode() != Opc) 1062 return SDValue(); 1063 1064 if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { 1065 if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { 1066 // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) 1067 if (SDValue OpNode = 1068 DAG.FoldConstantArithmetic(Opc, DL, VT, {N0.getOperand(1), N1})) 1069 return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); 1070 return SDValue(); 1071 } 1072 if (N0.hasOneUse()) { 1073 // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) 1074 // iff (op x, c1) has one use 1075 SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1); 1076 if (!OpNode.getNode()) 1077 return SDValue(); 1078 return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1)); 1079 } 1080 } 1081 return SDValue(); 1082 } 1083 1084 // Try to reassociate commutative binops. 
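// e.g. (op (op x, c1), c2) -> (op x, (op c1, c2)) when the constants fold,
// or (op (op x, c1), y) -> (op (op x, y), c1) when (op x, c1) has one use;
// both operand orders are tried via reassociateOpsCommutative.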
1085 SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, 1086 SDValue N1, SDNodeFlags Flags) { 1087 assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative."); 1088 1089 // Floating-point reassociation is not allowed without loose FP math. 1090 if (N0.getValueType().isFloatingPoint() || 1091 N1.getValueType().isFloatingPoint()) 1092 if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros()) 1093 return SDValue(); 1094 1095 if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1)) 1096 return Combined; 1097 if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0)) 1098 return Combined; 1099 return SDValue(); 1100 } 1101 1102 SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, 1103 bool AddTo) { 1104 assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); 1105 ++NodesCombined; 1106 LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: "; 1107 To[0].getNode()->dump(&DAG); 1108 dbgs() << " and " << NumTo - 1 << " other values\n"); 1109 for (unsigned i = 0, e = NumTo; i != e; ++i) 1110 assert((!To[i].getNode() || 1111 N->getValueType(i) == To[i].getValueType()) && 1112 "Cannot combine value to value of different type!"); 1113 1114 WorklistRemover DeadNodes(*this); 1115 DAG.ReplaceAllUsesWith(N, To); 1116 if (AddTo) { 1117 // Push the new nodes and any users onto the worklist 1118 for (unsigned i = 0, e = NumTo; i != e; ++i) { 1119 if (To[i].getNode()) { 1120 AddToWorklist(To[i].getNode()); 1121 AddUsersToWorklist(To[i].getNode()); 1122 } 1123 } 1124 } 1125 1126 // Finally, if the node is now dead, remove it from the graph. The node 1127 // may not be dead if the replacement process recursively simplified to 1128 // something else needing this node. 1129 if (N->use_empty()) 1130 deleteAndRecombine(N); 1131 return SDValue(N, 0); 1132 } 1133 1134 void DAGCombiner:: 1135 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { 1136 // Replace the old value with the new one. 1137 ++NodesCombined; 1138 LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); 1139 dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); 1140 dbgs() << '\n'); 1141 1142 // Replace all uses. If any nodes become isomorphic to other nodes and 1143 // are deleted, make sure to remove them from our worklist. 1144 WorklistRemover DeadNodes(*this); 1145 DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); 1146 1147 // Push the new node and any (possibly new) users onto the worklist. 1148 AddToWorklistWithUsers(TLO.New.getNode()); 1149 1150 // Finally, if the node is now dead, remove it from the graph. The node 1151 // may not be dead if the replacement process recursively simplified to 1152 // something else needing this node. 1153 if (TLO.Old.getNode()->use_empty()) 1154 deleteAndRecombine(TLO.Old.getNode()); 1155 } 1156 1157 /// Check the specified integer node value to see if it can be simplified or if 1158 /// things it uses can be simplified by bit propagation. If so, return true. 1159 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, 1160 const APInt &DemandedElts, 1161 bool AssumeSingleUse) { 1162 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); 1163 KnownBits Known; 1164 if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0, 1165 AssumeSingleUse)) 1166 return false; 1167 1168 // Revisit the node. 
1169 AddToWorklist(Op.getNode()); 1170 1171 CommitTargetLoweringOpt(TLO); 1172 return true; 1173 } 1174 1175 /// Check the specified vector node value to see if it can be simplified or 1176 /// if things it uses can be simplified as it only uses some of the elements. 1177 /// If so, return true. 1178 bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, 1179 const APInt &DemandedElts, 1180 bool AssumeSingleUse) { 1181 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); 1182 APInt KnownUndef, KnownZero; 1183 if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, 1184 TLO, 0, AssumeSingleUse)) 1185 return false; 1186 1187 // Revisit the node. 1188 AddToWorklist(Op.getNode()); 1189 1190 CommitTargetLoweringOpt(TLO); 1191 return true; 1192 } 1193 1194 void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { 1195 SDLoc DL(Load); 1196 EVT VT = Load->getValueType(0); 1197 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0)); 1198 1199 LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: "; 1200 Trunc.getNode()->dump(&DAG); dbgs() << '\n'); 1201 WorklistRemover DeadNodes(*this); 1202 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); 1203 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); 1204 deleteAndRecombine(Load); 1205 AddToWorklist(Trunc.getNode()); 1206 } 1207 1208 SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { 1209 Replace = false; 1210 SDLoc DL(Op); 1211 if (ISD::isUNINDEXEDLoad(Op.getNode())) { 1212 LoadSDNode *LD = cast<LoadSDNode>(Op); 1213 EVT MemVT = LD->getMemoryVT(); 1214 ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD 1215 : LD->getExtensionType(); 1216 Replace = true; 1217 return DAG.getExtLoad(ExtType, DL, PVT, 1218 LD->getChain(), LD->getBasePtr(), 1219 MemVT, LD->getMemOperand()); 1220 } 1221 1222 unsigned Opc = Op.getOpcode(); 1223 switch (Opc) { 1224 default: break; 1225 case ISD::AssertSext: 1226 if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT)) 1227 return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1)); 1228 break; 1229 case ISD::AssertZext: 1230 if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT)) 1231 return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1)); 1232 break; 1233 case ISD::Constant: { 1234 unsigned ExtOpc = 1235 Op.getValueType().isByteSized() ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1236 return DAG.getNode(ExtOpc, DL, PVT, Op); 1237 } 1238 } 1239 1240 if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT)) 1241 return SDValue(); 1242 return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op); 1243 } 1244 1245 SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) { 1246 if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT)) 1247 return SDValue(); 1248 EVT OldVT = Op.getValueType(); 1249 SDLoc DL(Op); 1250 bool Replace = false; 1251 SDValue NewOp = PromoteOperand(Op, PVT, Replace); 1252 if (!NewOp.getNode()) 1253 return SDValue(); 1254 AddToWorklist(NewOp.getNode()); 1255 1256 if (Replace) 1257 ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); 1258 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp, 1259 DAG.getValueType(OldVT)); 1260 } 1261 1262 SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { 1263 EVT OldVT = Op.getValueType(); 1264 SDLoc DL(Op); 1265 bool Replace = false; 1266 SDValue NewOp = PromoteOperand(Op, PVT, Replace); 1267 if (!NewOp.getNode()) 1268 return SDValue(); 1269 AddToWorklist(NewOp.getNode()); 1270 1271 if (Replace) 1272 ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); 1273 return DAG.getZeroExtendInReg(NewOp, DL, OldVT); 1274 } 1275 1276 /// Promote the specified integer binary operation if the target indicates it is 1277 /// beneficial. e.g. On x86, it's usually better to promote i16 operations to 1278 /// i32 since i16 instructions are longer. 1279 SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { 1280 if (!LegalOperations) 1281 return SDValue(); 1282 1283 EVT VT = Op.getValueType(); 1284 if (VT.isVector() || !VT.isInteger()) 1285 return SDValue(); 1286 1287 // If operation type is 'undesirable', e.g. i16 on x86, consider 1288 // promoting it. 1289 unsigned Opc = Op.getOpcode(); 1290 if (TLI.isTypeDesirableForOp(Opc, VT)) 1291 return SDValue(); 1292 1293 EVT PVT = VT; 1294 // Consult target whether it is a good idea to promote this operation and 1295 // what's the right type to promote it to. 1296 if (TLI.IsDesirableToPromoteOp(Op, PVT)) { 1297 assert(PVT != VT && "Don't know what type to promote to!"); 1298 1299 LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); 1300 1301 bool Replace0 = false; 1302 SDValue N0 = Op.getOperand(0); 1303 SDValue NN0 = PromoteOperand(N0, PVT, Replace0); 1304 1305 bool Replace1 = false; 1306 SDValue N1 = Op.getOperand(1); 1307 SDValue NN1 = PromoteOperand(N1, PVT, Replace1); 1308 SDLoc DL(Op); 1309 1310 SDValue RV = 1311 DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); 1312 1313 // We are always replacing N0/N1's use in N and only need additional 1314 // replacements if there are additional uses. 1315 // Note: We are checking uses of the *nodes* (SDNode) rather than values 1316 // (SDValue) here because the node may reference multiple values 1317 // (for example, the chain value of a load node). 1318 Replace0 &= !N0->hasOneUse(); 1319 Replace1 &= (N0 != N1) && !N1->hasOneUse(); 1320 1321 // Combine Op here so it is preserved past replacements. 1322 CombineTo(Op.getNode(), RV); 1323 1324 // If operands have a use ordering, make sure we deal with 1325 // predecessor first. 
1326 if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) { 1327 std::swap(N0, N1); 1328 std::swap(NN0, NN1); 1329 } 1330 1331 if (Replace0) { 1332 AddToWorklist(NN0.getNode()); 1333 ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); 1334 } 1335 if (Replace1) { 1336 AddToWorklist(NN1.getNode()); 1337 ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); 1338 } 1339 return Op; 1340 } 1341 return SDValue(); 1342 } 1343 1344 /// Promote the specified integer shift operation if the target indicates it is 1345 /// beneficial. e.g. On x86, it's usually better to promote i16 operations to 1346 /// i32 since i16 instructions are longer. 1347 SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { 1348 if (!LegalOperations) 1349 return SDValue(); 1350 1351 EVT VT = Op.getValueType(); 1352 if (VT.isVector() || !VT.isInteger()) 1353 return SDValue(); 1354 1355 // If operation type is 'undesirable', e.g. i16 on x86, consider 1356 // promoting it. 1357 unsigned Opc = Op.getOpcode(); 1358 if (TLI.isTypeDesirableForOp(Opc, VT)) 1359 return SDValue(); 1360 1361 EVT PVT = VT; 1362 // Consult target whether it is a good idea to promote this operation and 1363 // what's the right type to promote it to. 1364 if (TLI.IsDesirableToPromoteOp(Op, PVT)) { 1365 assert(PVT != VT && "Don't know what type to promote to!"); 1366 1367 LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); 1368 1369 bool Replace = false; 1370 SDValue N0 = Op.getOperand(0); 1371 SDValue N1 = Op.getOperand(1); 1372 if (Opc == ISD::SRA) 1373 N0 = SExtPromoteOperand(N0, PVT); 1374 else if (Opc == ISD::SRL) 1375 N0 = ZExtPromoteOperand(N0, PVT); 1376 else 1377 N0 = PromoteOperand(N0, PVT, Replace); 1378 1379 if (!N0.getNode()) 1380 return SDValue(); 1381 1382 SDLoc DL(Op); 1383 SDValue RV = 1384 DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); 1385 1386 if (Replace) 1387 ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); 1388 1389 // Deal with Op being deleted. 1390 if (Op && Op.getOpcode() != ISD::DELETED_NODE) 1391 return RV; 1392 } 1393 return SDValue(); 1394 } 1395 1396 SDValue DAGCombiner::PromoteExtend(SDValue Op) { 1397 if (!LegalOperations) 1398 return SDValue(); 1399 1400 EVT VT = Op.getValueType(); 1401 if (VT.isVector() || !VT.isInteger()) 1402 return SDValue(); 1403 1404 // If operation type is 'undesirable', e.g. i16 on x86, consider 1405 // promoting it. 1406 unsigned Opc = Op.getOpcode(); 1407 if (TLI.isTypeDesirableForOp(Opc, VT)) 1408 return SDValue(); 1409 1410 EVT PVT = VT; 1411 // Consult target whether it is a good idea to promote this operation and 1412 // what's the right type to promote it to. 1413 if (TLI.IsDesirableToPromoteOp(Op, PVT)) { 1414 assert(PVT != VT && "Don't know what type to promote to!"); 1415 // fold (aext (aext x)) -> (aext x) 1416 // fold (aext (zext x)) -> (zext x) 1417 // fold (aext (sext x)) -> (sext x) 1418 LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); 1419 return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0)); 1420 } 1421 return SDValue(); 1422 } 1423 1424 bool DAGCombiner::PromoteLoad(SDValue Op) { 1425 if (!LegalOperations) 1426 return false; 1427 1428 if (!ISD::isUNINDEXEDLoad(Op.getNode())) 1429 return false; 1430 1431 EVT VT = Op.getValueType(); 1432 if (VT.isVector() || !VT.isInteger()) 1433 return false; 1434 1435 // If operation type is 'undesirable', e.g. i16 on x86, consider 1436 // promoting it. 
1437 unsigned Opc = Op.getOpcode(); 1438 if (TLI.isTypeDesirableForOp(Opc, VT)) 1439 return false; 1440 1441 EVT PVT = VT; 1442 // Consult target whether it is a good idea to promote this operation and 1443 // what's the right type to promote it to. 1444 if (TLI.IsDesirableToPromoteOp(Op, PVT)) { 1445 assert(PVT != VT && "Don't know what type to promote to!"); 1446 1447 SDLoc DL(Op); 1448 SDNode *N = Op.getNode(); 1449 LoadSDNode *LD = cast<LoadSDNode>(N); 1450 EVT MemVT = LD->getMemoryVT(); 1451 ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD 1452 : LD->getExtensionType(); 1453 SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT, 1454 LD->getChain(), LD->getBasePtr(), 1455 MemVT, LD->getMemOperand()); 1456 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); 1457 1458 LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: "; 1459 Result.getNode()->dump(&DAG); dbgs() << '\n'); 1460 WorklistRemover DeadNodes(*this); 1461 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); 1462 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); 1463 deleteAndRecombine(N); 1464 AddToWorklist(Result.getNode()); 1465 return true; 1466 } 1467 return false; 1468 } 1469 1470 /// Recursively delete a node which has no uses and any operands for 1471 /// which it is the only use. 1472 /// 1473 /// Note that this both deletes the nodes and removes them from the worklist. 1474 /// It also adds any nodes that have had a user deleted to the worklist, as they 1475 /// may now have only one use and be subject to other combines. 1476 bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { 1477 if (!N->use_empty()) 1478 return false; 1479 1480 SmallSetVector<SDNode *, 16> Nodes; 1481 Nodes.insert(N); 1482 do { 1483 N = Nodes.pop_back_val(); 1484 if (!N) 1485 continue; 1486 1487 if (N->use_empty()) { 1488 for (const SDValue &ChildN : N->op_values()) 1489 Nodes.insert(ChildN.getNode()); 1490 1491 removeFromWorklist(N); 1492 DAG.DeleteNode(N); 1493 } else { 1494 AddToWorklist(N); 1495 } 1496 } while (!Nodes.empty()); 1497 return true; 1498 } 1499 1500 //===----------------------------------------------------------------------===// 1501 // Main DAG Combiner implementation 1502 //===----------------------------------------------------------------------===// 1503 1504 void DAGCombiner::Run(CombineLevel AtLevel) { 1505 // Set the instance variables, so that the various visit routines may use them. 1506 Level = AtLevel; 1507 LegalDAG = Level >= AfterLegalizeDAG; 1508 LegalOperations = Level >= AfterLegalizeVectorOps; 1509 LegalTypes = Level >= AfterLegalizeTypes; 1510 1511 WorklistInserter AddNodes(*this); 1512 1513 // Add all the dag nodes to the worklist. 1514 for (SDNode &Node : DAG.allnodes()) 1515 AddToWorklist(&Node); 1516 1517 // Create a dummy node (which is not added to allnodes), that adds a reference 1518 // to the root node, preventing it from being deleted, and tracking any 1519 // changes of the root. 1520 HandleSDNode Dummy(DAG.getRoot()); 1521 1522 // While we have a valid worklist entry node, try to combine it. 1523 while (SDNode *N = getNextWorklistEntry()) { 1524 // If N has no uses, it is dead. Make sure to revisit all N's operands once 1525 // N is deleted from the DAG, since they too may now be dead or may have a 1526 // reduced number of uses, allowing other xforms.
1527     if (recursivelyDeleteUnusedNodes(N))
1528       continue;
1529 
1530     WorklistRemover DeadNodes(*this);
1531 
1532     // If this combine is running after legalizing the DAG, re-legalize any
1533     // nodes pulled off the worklist.
1534     if (LegalDAG) {
1535       SmallSetVector<SDNode *, 16> UpdatedNodes;
1536       bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
1537 
1538       for (SDNode *LN : UpdatedNodes)
1539         AddToWorklistWithUsers(LN);
1540 
1541       if (!NIsValid)
1542         continue;
1543     }
1544 
1545     LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
1546 
1547     // Add any operands of the new node which have not yet been combined to the
1548     // worklist as well. Because the worklist uniques things already, this
1549     // won't repeatedly process the same operand.
1550     CombinedNodes.insert(N);
1551     for (const SDValue &ChildN : N->op_values())
1552       if (!CombinedNodes.count(ChildN.getNode()))
1553         AddToWorklist(ChildN.getNode());
1554 
1555     SDValue RV = combine(N);
1556 
1557     if (!RV.getNode())
1558       continue;
1559 
1560     ++NodesCombined;
1561 
1562     // If we get back the same node we passed in, rather than a new node or
1563     // zero, we know that the node must have defined multiple values and
1564     // CombineTo was used. Since CombineTo takes care of the worklist
1565     // mechanics for us, we have no work to do in this case.
1566     if (RV.getNode() == N)
1567       continue;
1568 
1569     assert(N->getOpcode() != ISD::DELETED_NODE &&
1570            RV.getOpcode() != ISD::DELETED_NODE &&
1571            "Node was deleted but visit returned new node!");
1572 
1573     LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG));
1574 
1575     if (N->getNumValues() == RV.getNode()->getNumValues())
1576       DAG.ReplaceAllUsesWith(N, RV.getNode());
1577     else {
1578       assert(N->getValueType(0) == RV.getValueType() &&
1579              N->getNumValues() == 1 && "Type mismatch");
1580       DAG.ReplaceAllUsesWith(N, &RV);
1581     }
1582 
1583     // Push the new node and any users onto the worklist. Omit this if the
1584     // new node is the EntryToken (e.g. if a store managed to get optimized
1585     // out), because re-visiting the EntryToken and its users will not uncover
1586     // any additional opportunities, but there may be a large number of such
1587     // users, potentially causing compile time explosion.
1588     if (RV.getOpcode() != ISD::EntryToken) {
1589       AddToWorklist(RV.getNode());
1590       AddUsersToWorklist(RV.getNode());
1591     }
1592 
1593     // Finally, if the node is now dead, remove it from the graph. The node
1594     // may not be dead if the replacement process recursively simplified to
1595     // something else needing this node. This will also take care of adding any
1596     // operands which have lost a user to the worklist.
1597     recursivelyDeleteUnusedNodes(N);
1598   }
1599 
1600   // If the root changed (e.g. it was a dead load), update the root.
1601 DAG.setRoot(Dummy.getValue()); 1602 DAG.RemoveDeadNodes(); 1603 } 1604 1605 SDValue DAGCombiner::visit(SDNode *N) { 1606 switch (N->getOpcode()) { 1607 default: break; 1608 case ISD::TokenFactor: return visitTokenFactor(N); 1609 case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); 1610 case ISD::ADD: return visitADD(N); 1611 case ISD::SUB: return visitSUB(N); 1612 case ISD::SADDSAT: 1613 case ISD::UADDSAT: return visitADDSAT(N); 1614 case ISD::SSUBSAT: 1615 case ISD::USUBSAT: return visitSUBSAT(N); 1616 case ISD::ADDC: return visitADDC(N); 1617 case ISD::SADDO: 1618 case ISD::UADDO: return visitADDO(N); 1619 case ISD::SUBC: return visitSUBC(N); 1620 case ISD::SSUBO: 1621 case ISD::USUBO: return visitSUBO(N); 1622 case ISD::ADDE: return visitADDE(N); 1623 case ISD::ADDCARRY: return visitADDCARRY(N); 1624 case ISD::SADDO_CARRY: return visitSADDO_CARRY(N); 1625 case ISD::SUBE: return visitSUBE(N); 1626 case ISD::SUBCARRY: return visitSUBCARRY(N); 1627 case ISD::SSUBO_CARRY: return visitSSUBO_CARRY(N); 1628 case ISD::SMULFIX: 1629 case ISD::SMULFIXSAT: 1630 case ISD::UMULFIX: 1631 case ISD::UMULFIXSAT: return visitMULFIX(N); 1632 case ISD::MUL: return visitMUL(N); 1633 case ISD::SDIV: return visitSDIV(N); 1634 case ISD::UDIV: return visitUDIV(N); 1635 case ISD::SREM: 1636 case ISD::UREM: return visitREM(N); 1637 case ISD::MULHU: return visitMULHU(N); 1638 case ISD::MULHS: return visitMULHS(N); 1639 case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); 1640 case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); 1641 case ISD::SMULO: 1642 case ISD::UMULO: return visitMULO(N); 1643 case ISD::SMIN: 1644 case ISD::SMAX: 1645 case ISD::UMIN: 1646 case ISD::UMAX: return visitIMINMAX(N); 1647 case ISD::AND: return visitAND(N); 1648 case ISD::OR: return visitOR(N); 1649 case ISD::XOR: return visitXOR(N); 1650 case ISD::SHL: return visitSHL(N); 1651 case ISD::SRA: return visitSRA(N); 1652 case ISD::SRL: return visitSRL(N); 1653 case ISD::ROTR: 1654 case ISD::ROTL: return visitRotate(N); 1655 case ISD::FSHL: 1656 case ISD::FSHR: return visitFunnelShift(N); 1657 case ISD::ABS: return visitABS(N); 1658 case ISD::BSWAP: return visitBSWAP(N); 1659 case ISD::BITREVERSE: return visitBITREVERSE(N); 1660 case ISD::CTLZ: return visitCTLZ(N); 1661 case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N); 1662 case ISD::CTTZ: return visitCTTZ(N); 1663 case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N); 1664 case ISD::CTPOP: return visitCTPOP(N); 1665 case ISD::SELECT: return visitSELECT(N); 1666 case ISD::VSELECT: return visitVSELECT(N); 1667 case ISD::SELECT_CC: return visitSELECT_CC(N); 1668 case ISD::SETCC: return visitSETCC(N); 1669 case ISD::SETCCCARRY: return visitSETCCCARRY(N); 1670 case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); 1671 case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); 1672 case ISD::ANY_EXTEND: return visitANY_EXTEND(N); 1673 case ISD::AssertSext: 1674 case ISD::AssertZext: return visitAssertExt(N); 1675 case ISD::AssertAlign: return visitAssertAlign(N); 1676 case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); 1677 case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); 1678 case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); 1679 case ISD::TRUNCATE: return visitTRUNCATE(N); 1680 case ISD::BITCAST: return visitBITCAST(N); 1681 case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); 1682 case ISD::FADD: return visitFADD(N); 1683 case ISD::STRICT_FADD: return visitSTRICT_FADD(N); 1684 case ISD::FSUB: return visitFSUB(N); 1685 case ISD::FMUL: 
return visitFMUL(N); 1686 case ISD::FMA: return visitFMA(N); 1687 case ISD::FDIV: return visitFDIV(N); 1688 case ISD::FREM: return visitFREM(N); 1689 case ISD::FSQRT: return visitFSQRT(N); 1690 case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); 1691 case ISD::FPOW: return visitFPOW(N); 1692 case ISD::SINT_TO_FP: return visitSINT_TO_FP(N); 1693 case ISD::UINT_TO_FP: return visitUINT_TO_FP(N); 1694 case ISD::FP_TO_SINT: return visitFP_TO_SINT(N); 1695 case ISD::FP_TO_UINT: return visitFP_TO_UINT(N); 1696 case ISD::FP_ROUND: return visitFP_ROUND(N); 1697 case ISD::FP_EXTEND: return visitFP_EXTEND(N); 1698 case ISD::FNEG: return visitFNEG(N); 1699 case ISD::FABS: return visitFABS(N); 1700 case ISD::FFLOOR: return visitFFLOOR(N); 1701 case ISD::FMINNUM: return visitFMINNUM(N); 1702 case ISD::FMAXNUM: return visitFMAXNUM(N); 1703 case ISD::FMINIMUM: return visitFMINIMUM(N); 1704 case ISD::FMAXIMUM: return visitFMAXIMUM(N); 1705 case ISD::FCEIL: return visitFCEIL(N); 1706 case ISD::FTRUNC: return visitFTRUNC(N); 1707 case ISD::BRCOND: return visitBRCOND(N); 1708 case ISD::BR_CC: return visitBR_CC(N); 1709 case ISD::LOAD: return visitLOAD(N); 1710 case ISD::STORE: return visitSTORE(N); 1711 case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N); 1712 case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N); 1713 case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N); 1714 case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N); 1715 case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N); 1716 case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); 1717 case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N); 1718 case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N); 1719 case ISD::MGATHER: return visitMGATHER(N); 1720 case ISD::MLOAD: return visitMLOAD(N); 1721 case ISD::MSCATTER: return visitMSCATTER(N); 1722 case ISD::MSTORE: return visitMSTORE(N); 1723 case ISD::LIFETIME_END: return visitLIFETIME_END(N); 1724 case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); 1725 case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); 1726 case ISD::FREEZE: return visitFREEZE(N); 1727 case ISD::VECREDUCE_FADD: 1728 case ISD::VECREDUCE_FMUL: 1729 case ISD::VECREDUCE_ADD: 1730 case ISD::VECREDUCE_MUL: 1731 case ISD::VECREDUCE_AND: 1732 case ISD::VECREDUCE_OR: 1733 case ISD::VECREDUCE_XOR: 1734 case ISD::VECREDUCE_SMAX: 1735 case ISD::VECREDUCE_SMIN: 1736 case ISD::VECREDUCE_UMAX: 1737 case ISD::VECREDUCE_UMIN: 1738 case ISD::VECREDUCE_FMAX: 1739 case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N); 1740 } 1741 return SDValue(); 1742 } 1743 1744 SDValue DAGCombiner::combine(SDNode *N) { 1745 SDValue RV; 1746 if (!DisableGenericCombines) 1747 RV = visit(N); 1748 1749 // If nothing happened, try a target-specific DAG combine. 1750 if (!RV.getNode()) { 1751 assert(N->getOpcode() != ISD::DELETED_NODE && 1752 "Node was deleted but visit returned NULL!"); 1753 1754 if (N->getOpcode() >= ISD::BUILTIN_OP_END || 1755 TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) { 1756 1757 // Expose the DAG combiner to the target combiner impls. 1758 TargetLowering::DAGCombinerInfo 1759 DagCombineInfo(DAG, Level, false, this); 1760 1761 RV = TLI.PerformDAGCombine(N, DagCombineInfo); 1762 } 1763 } 1764 1765 // If nothing happened still, try promoting the operation. 
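  // For example, on a target that prefers i32, an i16 AND may be rewritten by
  // PromoteIntBinOp as an i32 AND of promoted operands followed by a truncate
  // back to i16 (illustrative sketch; the target hooks decide whether and to
  // what type each opcode is promoted).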
1766 if (!RV.getNode()) { 1767 switch (N->getOpcode()) { 1768 default: break; 1769 case ISD::ADD: 1770 case ISD::SUB: 1771 case ISD::MUL: 1772 case ISD::AND: 1773 case ISD::OR: 1774 case ISD::XOR: 1775 RV = PromoteIntBinOp(SDValue(N, 0)); 1776 break; 1777 case ISD::SHL: 1778 case ISD::SRA: 1779 case ISD::SRL: 1780 RV = PromoteIntShiftOp(SDValue(N, 0)); 1781 break; 1782 case ISD::SIGN_EXTEND: 1783 case ISD::ZERO_EXTEND: 1784 case ISD::ANY_EXTEND: 1785 RV = PromoteExtend(SDValue(N, 0)); 1786 break; 1787 case ISD::LOAD: 1788 if (PromoteLoad(SDValue(N, 0))) 1789 RV = SDValue(N, 0); 1790 break; 1791 } 1792 } 1793 1794 // If N is a commutative binary node, try to eliminate it if the commuted 1795 // version is already present in the DAG. 1796 if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) && 1797 N->getNumValues() == 1) { 1798 SDValue N0 = N->getOperand(0); 1799 SDValue N1 = N->getOperand(1); 1800 1801 // Constant operands are canonicalized to RHS. 1802 if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) { 1803 SDValue Ops[] = {N1, N0}; 1804 SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, 1805 N->getFlags()); 1806 if (CSENode) 1807 return SDValue(CSENode, 0); 1808 } 1809 } 1810 1811 return RV; 1812 } 1813 1814 /// Given a node, return its input chain if it has one, otherwise return a null 1815 /// sd operand. 1816 static SDValue getInputChainForNode(SDNode *N) { 1817 if (unsigned NumOps = N->getNumOperands()) { 1818 if (N->getOperand(0).getValueType() == MVT::Other) 1819 return N->getOperand(0); 1820 if (N->getOperand(NumOps-1).getValueType() == MVT::Other) 1821 return N->getOperand(NumOps-1); 1822 for (unsigned i = 1; i < NumOps-1; ++i) 1823 if (N->getOperand(i).getValueType() == MVT::Other) 1824 return N->getOperand(i); 1825 } 1826 return SDValue(); 1827 } 1828 1829 SDValue DAGCombiner::visitTokenFactor(SDNode *N) { 1830 // If N has two operands, where one has an input chain equal to the other, 1831 // the 'other' chain is redundant. 1832 if (N->getNumOperands() == 2) { 1833 if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1)) 1834 return N->getOperand(0); 1835 if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0)) 1836 return N->getOperand(1); 1837 } 1838 1839 // Don't simplify token factors if optnone. 1840 if (OptLevel == CodeGenOpt::None) 1841 return SDValue(); 1842 1843 // Don't simplify the token factor if the node itself has too many operands. 1844 if (N->getNumOperands() > TokenFactorInlineLimit) 1845 return SDValue(); 1846 1847 // If the sole user is a token factor, we should make sure we have a 1848 // chance to merge them together. This prevents TF chains from inhibiting 1849 // optimizations. 1850 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor) 1851 AddToWorklist(*(N->use_begin())); 1852 1853 SmallVector<SDNode *, 8> TFs; // List of token factors to visit. 1854 SmallVector<SDValue, 8> Ops; // Ops for replacing token factor. 1855 SmallPtrSet<SDNode*, 16> SeenOps; 1856 bool Changed = false; // If we should replace this token factor. 1857 1858 // Start out with this token factor. 1859 TFs.push_back(N); 1860 1861 // Iterate through token factors. The TFs grows when new token factors are 1862 // encountered. 1863 for (unsigned i = 0; i < TFs.size(); ++i) { 1864 // Limit number of nodes to inline, to avoid quadratic compile times. 1865 // We have to add the outstanding Token Factors to Ops, otherwise we might 1866 // drop Ops from the resulting Token Factors. 
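    // (Illustrative sketch of the bail-out below: once the limit is hit, the
    // remaining unvisited Token Factors are appended to Ops as whole operands
    // instead of being inlined, so no chain dependency is dropped.)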
1867     if (Ops.size() > TokenFactorInlineLimit) {
1868       for (unsigned j = i; j < TFs.size(); j++)
1869         Ops.emplace_back(TFs[j], 0);
1870       // Drop unprocessed Token Factors from TFs, so we do not add them to the
1871       // combiner worklist later.
1872       TFs.resize(i);
1873       break;
1874     }
1875 
1876     SDNode *TF = TFs[i];
1877     // Check each of the operands.
1878     for (const SDValue &Op : TF->op_values()) {
1879       switch (Op.getOpcode()) {
1880       case ISD::EntryToken:
1881         // Entry tokens don't need to be added to the list. They are
1882         // redundant.
1883         Changed = true;
1884         break;
1885 
1886       case ISD::TokenFactor:
1887         if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
1888           // Queue up for processing.
1889           TFs.push_back(Op.getNode());
1890           Changed = true;
1891           break;
1892         }
1893         LLVM_FALLTHROUGH;
1894 
1895       default:
1896         // Only add if it isn't already in the list.
1897         if (SeenOps.insert(Op.getNode()).second)
1898           Ops.push_back(Op);
1899         else
1900           Changed = true;
1901         break;
1902       }
1903     }
1904   }
1905 
1906   // Re-visit inlined Token Factors, to clean them up in case they have been
1907   // removed. Skip the first Token Factor, as this is the current node.
1908   for (unsigned i = 1, e = TFs.size(); i < e; i++)
1909     AddToWorklist(TFs[i]);
1910 
1911   // Remove Nodes that are chained to another node in the list. Do so
1912   // by walking up chains breadth-first, stopping when we've seen
1913   // another operand. In general we must climb to the EntryNode, but we can exit
1914   // early if we find all remaining work is associated with just one operand as
1915   // no further pruning is possible.
1916 
1917   // List of nodes to search through and original Ops from which they originate.
1918   SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
1919   SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
1920   SmallPtrSet<SDNode *, 16> SeenChains;
1921   bool DidPruneOps = false;
1922 
1923   unsigned NumLeftToConsider = 0;
1924   for (const SDValue &Op : Ops) {
1925     Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
1926     OpWorkCount.push_back(1);
1927   }
1928 
1929   auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
1930     // If this is an Op, we can remove the op from the list. Re-mark any
1931     // search associated with it as coming from the current OpNumber.
1932     if (SeenOps.contains(Op)) {
1933       Changed = true;
1934       DidPruneOps = true;
1935       unsigned OrigOpNumber = 0;
1936       while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
1937         OrigOpNumber++;
1938       assert((OrigOpNumber != Ops.size()) &&
1939              "expected to find TokenFactor Operand");
1940       // Re-mark worklist from OrigOpNumber to OpNumber
1941       for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
1942         if (Worklist[i].second == OrigOpNumber) {
1943           Worklist[i].second = OpNumber;
1944         }
1945       }
1946       OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
1947       OpWorkCount[OrigOpNumber] = 0;
1948       NumLeftToConsider--;
1949     }
1950     // Add if it's a new chain
1951     if (SeenChains.insert(Op).second) {
1952       OpWorkCount[OpNumber]++;
1953       Worklist.push_back(std::make_pair(Op, OpNumber));
1954     }
1955   };
1956 
1957   for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
1958     // We need to consider at least 2 Ops to prune.
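    // For instance, with Ops = { X, Y } where walking up Y's chain eventually
    // reaches X, X is redundant and can be dropped because Y already orders
    // itself after X; with only one Op left there is nothing to prune.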
1959 if (NumLeftToConsider <= 1) 1960 break; 1961 auto CurNode = Worklist[i].first; 1962 auto CurOpNumber = Worklist[i].second; 1963 assert((OpWorkCount[CurOpNumber] > 0) && 1964 "Node should not appear in worklist"); 1965 switch (CurNode->getOpcode()) { 1966 case ISD::EntryToken: 1967 // Hitting EntryToken is the only way for the search to terminate without 1968 // hitting 1969 // another operand's search. Prevent us from marking this operand 1970 // considered. 1971 NumLeftToConsider++; 1972 break; 1973 case ISD::TokenFactor: 1974 for (const SDValue &Op : CurNode->op_values()) 1975 AddToWorklist(i, Op.getNode(), CurOpNumber); 1976 break; 1977 case ISD::LIFETIME_START: 1978 case ISD::LIFETIME_END: 1979 case ISD::CopyFromReg: 1980 case ISD::CopyToReg: 1981 AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber); 1982 break; 1983 default: 1984 if (auto *MemNode = dyn_cast<MemSDNode>(CurNode)) 1985 AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber); 1986 break; 1987 } 1988 OpWorkCount[CurOpNumber]--; 1989 if (OpWorkCount[CurOpNumber] == 0) 1990 NumLeftToConsider--; 1991 } 1992 1993 // If we've changed things around then replace token factor. 1994 if (Changed) { 1995 SDValue Result; 1996 if (Ops.empty()) { 1997 // The entry token is the only possible outcome. 1998 Result = DAG.getEntryNode(); 1999 } else { 2000 if (DidPruneOps) { 2001 SmallVector<SDValue, 8> PrunedOps; 2002 // 2003 for (const SDValue &Op : Ops) { 2004 if (SeenChains.count(Op.getNode()) == 0) 2005 PrunedOps.push_back(Op); 2006 } 2007 Result = DAG.getTokenFactor(SDLoc(N), PrunedOps); 2008 } else { 2009 Result = DAG.getTokenFactor(SDLoc(N), Ops); 2010 } 2011 } 2012 return Result; 2013 } 2014 return SDValue(); 2015 } 2016 2017 /// MERGE_VALUES can always be eliminated. 2018 SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { 2019 WorklistRemover DeadNodes(*this); 2020 // Replacing results may cause a different MERGE_VALUES to suddenly 2021 // be CSE'd with N, and carry its uses with it. Iterate until no 2022 // uses remain, to ensure that the node can be safely deleted. 2023 // First add the users of this node to the work list so that they 2024 // can be tried again once they have new operands. 2025 AddUsersToWorklist(N); 2026 do { 2027 // Do as a single replacement to avoid rewalking use lists. 2028 SmallVector<SDValue, 8> Ops; 2029 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 2030 Ops.push_back(N->getOperand(i)); 2031 DAG.ReplaceAllUsesWith(N, Ops.data()); 2032 } while (!N->use_empty()); 2033 deleteAndRecombine(N); 2034 return SDValue(N, 0); // Return N so it doesn't get rechecked! 2035 } 2036 2037 /// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a 2038 /// ConstantSDNode pointer else nullptr. 2039 static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { 2040 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N); 2041 return Const != nullptr && !Const->isOpaque() ? Const : nullptr; 2042 } 2043 2044 /// Return true if 'Use' is a load or a store that uses N as its base pointer 2045 /// and that N may be folded in the load / store addressing mode. 
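/// For example, if N is (add BasePtr, 16) and 'Use' is a load whose base
/// pointer is N, this asks the target whether a [reg + 16] addressing mode is
/// legal for the load's memory type and address space (illustrative sketch of
/// the query below).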
2046 static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, 2047 const TargetLowering &TLI) { 2048 EVT VT; 2049 unsigned AS; 2050 2051 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) { 2052 if (LD->isIndexed() || LD->getBasePtr().getNode() != N) 2053 return false; 2054 VT = LD->getMemoryVT(); 2055 AS = LD->getAddressSpace(); 2056 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) { 2057 if (ST->isIndexed() || ST->getBasePtr().getNode() != N) 2058 return false; 2059 VT = ST->getMemoryVT(); 2060 AS = ST->getAddressSpace(); 2061 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) { 2062 if (LD->isIndexed() || LD->getBasePtr().getNode() != N) 2063 return false; 2064 VT = LD->getMemoryVT(); 2065 AS = LD->getAddressSpace(); 2066 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) { 2067 if (ST->isIndexed() || ST->getBasePtr().getNode() != N) 2068 return false; 2069 VT = ST->getMemoryVT(); 2070 AS = ST->getAddressSpace(); 2071 } else 2072 return false; 2073 2074 TargetLowering::AddrMode AM; 2075 if (N->getOpcode() == ISD::ADD) { 2076 AM.HasBaseReg = true; 2077 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2078 if (Offset) 2079 // [reg +/- imm] 2080 AM.BaseOffs = Offset->getSExtValue(); 2081 else 2082 // [reg +/- reg] 2083 AM.Scale = 1; 2084 } else if (N->getOpcode() == ISD::SUB) { 2085 AM.HasBaseReg = true; 2086 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2087 if (Offset) 2088 // [reg +/- imm] 2089 AM.BaseOffs = -Offset->getSExtValue(); 2090 else 2091 // [reg +/- reg] 2092 AM.Scale = 1; 2093 } else 2094 return false; 2095 2096 return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, 2097 VT.getTypeForEVT(*DAG.getContext()), AS); 2098 } 2099 2100 SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { 2101 assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 && 2102 "Unexpected binary operator"); 2103 2104 // Don't do this unless the old select is going away. We want to eliminate the 2105 // binary operator, not replace a binop with a select. 2106 // TODO: Handle ISD::SELECT_CC. 2107 unsigned SelOpNo = 0; 2108 SDValue Sel = BO->getOperand(0); 2109 if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) { 2110 SelOpNo = 1; 2111 Sel = BO->getOperand(1); 2112 } 2113 2114 if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) 2115 return SDValue(); 2116 2117 SDValue CT = Sel.getOperand(1); 2118 if (!isConstantOrConstantVector(CT, true) && 2119 !DAG.isConstantFPBuildVectorOrConstantFP(CT)) 2120 return SDValue(); 2121 2122 SDValue CF = Sel.getOperand(2); 2123 if (!isConstantOrConstantVector(CF, true) && 2124 !DAG.isConstantFPBuildVectorOrConstantFP(CF)) 2125 return SDValue(); 2126 2127 // Bail out if any constants are opaque because we can't constant fold those. 2128 // The exception is "and" and "or" with either 0 or -1 in which case we can 2129 // propagate non constant operands into select. 
I.e.: 2130 // and (select Cond, 0, -1), X --> select Cond, 0, X 2131 // or X, (select Cond, -1, 0) --> select Cond, -1, X 2132 auto BinOpcode = BO->getOpcode(); 2133 bool CanFoldNonConst = 2134 (BinOpcode == ISD::AND || BinOpcode == ISD::OR) && 2135 (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) && 2136 (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF)); 2137 2138 SDValue CBO = BO->getOperand(SelOpNo ^ 1); 2139 if (!CanFoldNonConst && 2140 !isConstantOrConstantVector(CBO, true) && 2141 !DAG.isConstantFPBuildVectorOrConstantFP(CBO)) 2142 return SDValue(); 2143 2144 EVT VT = BO->getValueType(0); 2145 2146 // We have a select-of-constants followed by a binary operator with a 2147 // constant. Eliminate the binop by pulling the constant math into the select. 2148 // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO 2149 SDLoc DL(Sel); 2150 SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT) 2151 : DAG.getNode(BinOpcode, DL, VT, CT, CBO); 2152 if (!CanFoldNonConst && !NewCT.isUndef() && 2153 !isConstantOrConstantVector(NewCT, true) && 2154 !DAG.isConstantFPBuildVectorOrConstantFP(NewCT)) 2155 return SDValue(); 2156 2157 SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF) 2158 : DAG.getNode(BinOpcode, DL, VT, CF, CBO); 2159 if (!CanFoldNonConst && !NewCF.isUndef() && 2160 !isConstantOrConstantVector(NewCF, true) && 2161 !DAG.isConstantFPBuildVectorOrConstantFP(NewCF)) 2162 return SDValue(); 2163 2164 SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); 2165 SelectOp->setFlags(BO->getFlags()); 2166 return SelectOp; 2167 } 2168 2169 static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) { 2170 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && 2171 "Expecting add or sub"); 2172 2173 // Match a constant operand and a zext operand for the math instruction: 2174 // add Z, C 2175 // sub C, Z 2176 bool IsAdd = N->getOpcode() == ISD::ADD; 2177 SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0); 2178 SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1); 2179 auto *CN = dyn_cast<ConstantSDNode>(C); 2180 if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND) 2181 return SDValue(); 2182 2183 // Match the zext operand as a setcc of a boolean. 2184 if (Z.getOperand(0).getOpcode() != ISD::SETCC || 2185 Z.getOperand(0).getValueType() != MVT::i1) 2186 return SDValue(); 2187 2188 // Match the compare as: setcc (X & 1), 0, eq. 2189 SDValue SetCC = Z.getOperand(0); 2190 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); 2191 if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) || 2192 SetCC.getOperand(0).getOpcode() != ISD::AND || 2193 !isOneConstant(SetCC.getOperand(0).getOperand(1))) 2194 return SDValue(); 2195 2196 // We are adding/subtracting a constant and an inverted low bit. Turn that 2197 // into a subtract/add of the low bit with incremented/decremented constant: 2198 // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1)) 2199 // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1)) 2200 EVT VT = C.getValueType(); 2201 SDLoc DL(N); 2202 SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT); 2203 SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) : 2204 DAG.getConstant(CN->getAPIntValue() - 1, DL, VT); 2205 return DAG.getNode(IsAdd ? 
ISD::SUB : ISD::ADD, DL, VT, C1, LowBit); 2206 } 2207 2208 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into 2209 /// a shift and add with a different constant. 2210 static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) { 2211 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && 2212 "Expecting add or sub"); 2213 2214 // We need a constant operand for the add/sub, and the other operand is a 2215 // logical shift right: add (srl), C or sub C, (srl). 2216 bool IsAdd = N->getOpcode() == ISD::ADD; 2217 SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0); 2218 SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1); 2219 if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) || 2220 ShiftOp.getOpcode() != ISD::SRL) 2221 return SDValue(); 2222 2223 // The shift must be of a 'not' value. 2224 SDValue Not = ShiftOp.getOperand(0); 2225 if (!Not.hasOneUse() || !isBitwiseNot(Not)) 2226 return SDValue(); 2227 2228 // The shift must be moving the sign bit to the least-significant-bit. 2229 EVT VT = ShiftOp.getValueType(); 2230 SDValue ShAmt = ShiftOp.getOperand(1); 2231 ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt); 2232 if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1)) 2233 return SDValue(); 2234 2235 // Eliminate the 'not' by adjusting the shift and add/sub constant: 2236 // add (srl (not X), 31), C --> add (sra X, 31), (C + 1) 2237 // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1) 2238 SDLoc DL(N); 2239 auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL; 2240 SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt); 2241 if (SDValue NewC = 2242 DAG.FoldConstantArithmetic(IsAdd ? ISD::ADD : ISD::SUB, DL, VT, 2243 {ConstantOp, DAG.getConstant(1, DL, VT)})) 2244 return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC); 2245 return SDValue(); 2246 } 2247 2248 /// Try to fold a node that behaves like an ADD (note that N isn't necessarily 2249 /// an ISD::ADD here, it could for example be an ISD::OR if we know that there 2250 /// are no common bits set in the operands). 
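/// For example, (or (shl X, 4), 3) behaves exactly like (add (shl X, 4), 3)
/// because the two operands can never have a set bit in common.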
2251 SDValue DAGCombiner::visitADDLike(SDNode *N) { 2252 SDValue N0 = N->getOperand(0); 2253 SDValue N1 = N->getOperand(1); 2254 EVT VT = N0.getValueType(); 2255 SDLoc DL(N); 2256 2257 // fold vector ops 2258 if (VT.isVector()) { 2259 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 2260 return FoldedVOp; 2261 2262 // fold (add x, 0) -> x, vector edition 2263 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) 2264 return N0; 2265 if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) 2266 return N1; 2267 } 2268 2269 // fold (add x, undef) -> undef 2270 if (N0.isUndef()) 2271 return N0; 2272 2273 if (N1.isUndef()) 2274 return N1; 2275 2276 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { 2277 // canonicalize constant to RHS 2278 if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) 2279 return DAG.getNode(ISD::ADD, DL, VT, N1, N0); 2280 // fold (add c1, c2) -> c1+c2 2281 return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1}); 2282 } 2283 2284 // fold (add x, 0) -> x 2285 if (isNullConstant(N1)) 2286 return N0; 2287 2288 if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) { 2289 // fold ((A-c1)+c2) -> (A+(c2-c1)) 2290 if (N0.getOpcode() == ISD::SUB && 2291 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) { 2292 SDValue Sub = 2293 DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)}); 2294 assert(Sub && "Constant folding failed"); 2295 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub); 2296 } 2297 2298 // fold ((c1-A)+c2) -> (c1+c2)-A 2299 if (N0.getOpcode() == ISD::SUB && 2300 isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { 2301 SDValue Add = 2302 DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)}); 2303 assert(Add && "Constant folding failed"); 2304 return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1)); 2305 } 2306 2307 // add (sext i1 X), 1 -> zext (not i1 X) 2308 // We don't transform this pattern: 2309 // add (zext i1 X), -1 -> sext (not i1 X) 2310 // because most (?) targets generate better code for the zext form. 2311 if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && 2312 isOneOrOneSplat(N1)) { 2313 SDValue X = N0.getOperand(0); 2314 if ((!LegalOperations || 2315 (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && 2316 TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && 2317 X.getScalarValueSizeInBits() == 1) { 2318 SDValue Not = DAG.getNOT(DL, X, X.getValueType()); 2319 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); 2320 } 2321 } 2322 2323 // Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) if (or x, c0) is 2324 // equivalent to (add x, c0). 
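    // For example, if x is known to have its low four bits clear:
    //   (add (or x, 3), 16) --> (add x, 19)
    // since (or x, 3) is equivalent to (add x, 3) here.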
2325 if (N0.getOpcode() == ISD::OR && 2326 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) && 2327 DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { 2328 if (SDValue Add0 = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, 2329 {N1, N0.getOperand(1)})) 2330 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0); 2331 } 2332 } 2333 2334 if (SDValue NewSel = foldBinOpIntoSelect(N)) 2335 return NewSel; 2336 2337 // reassociate add 2338 if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) { 2339 if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags())) 2340 return RADD; 2341 } 2342 // fold ((0-A) + B) -> B-A 2343 if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0))) 2344 return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); 2345 2346 // fold (A + (0-B)) -> A-B 2347 if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0))) 2348 return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1)); 2349 2350 // fold (A+(B-A)) -> B 2351 if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1)) 2352 return N1.getOperand(0); 2353 2354 // fold ((B-A)+A) -> B 2355 if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1)) 2356 return N0.getOperand(0); 2357 2358 // fold ((A-B)+(C-A)) -> (C-B) 2359 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB && 2360 N0.getOperand(0) == N1.getOperand(1)) 2361 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), 2362 N0.getOperand(1)); 2363 2364 // fold ((A-B)+(B-C)) -> (A-C) 2365 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB && 2366 N0.getOperand(1) == N1.getOperand(0)) 2367 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), 2368 N1.getOperand(1)); 2369 2370 // fold (A+(B-(A+C))) to (B-C) 2371 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && 2372 N0 == N1.getOperand(1).getOperand(0)) 2373 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), 2374 N1.getOperand(1).getOperand(1)); 2375 2376 // fold (A+(B-(C+A))) to (B-C) 2377 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && 2378 N0 == N1.getOperand(1).getOperand(1)) 2379 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), 2380 N1.getOperand(1).getOperand(0)); 2381 2382 // fold (A+((B-A)+or-C)) to (B+or-C) 2383 if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) && 2384 N1.getOperand(0).getOpcode() == ISD::SUB && 2385 N0 == N1.getOperand(0).getOperand(1)) 2386 return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0), 2387 N1.getOperand(1)); 2388 2389 // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant 2390 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) { 2391 SDValue N00 = N0.getOperand(0); 2392 SDValue N01 = N0.getOperand(1); 2393 SDValue N10 = N1.getOperand(0); 2394 SDValue N11 = N1.getOperand(1); 2395 2396 if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10)) 2397 return DAG.getNode(ISD::SUB, DL, VT, 2398 DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10), 2399 DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11)); 2400 } 2401 2402 // fold (add (umax X, C), -C) --> (usubsat X, C) 2403 if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) { 2404 auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) { 2405 return (!Max && !Op) || 2406 (Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue())); 2407 }; 2408 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT, 2409 /*AllowUndefs*/ true)) 2410 return DAG.getNode(ISD::USUBSAT, DL, VT, 
N0.getOperand(0), 2411 N0.getOperand(1)); 2412 } 2413 2414 if (SimplifyDemandedBits(SDValue(N, 0))) 2415 return SDValue(N, 0); 2416 2417 if (isOneOrOneSplat(N1)) { 2418 // fold (add (xor a, -1), 1) -> (sub 0, a) 2419 if (isBitwiseNot(N0)) 2420 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 2421 N0.getOperand(0)); 2422 2423 // fold (add (add (xor a, -1), b), 1) -> (sub b, a) 2424 if (N0.getOpcode() == ISD::ADD || 2425 N0.getOpcode() == ISD::UADDO || 2426 N0.getOpcode() == ISD::SADDO) { 2427 SDValue A, Xor; 2428 2429 if (isBitwiseNot(N0.getOperand(0))) { 2430 A = N0.getOperand(1); 2431 Xor = N0.getOperand(0); 2432 } else if (isBitwiseNot(N0.getOperand(1))) { 2433 A = N0.getOperand(0); 2434 Xor = N0.getOperand(1); 2435 } 2436 2437 if (Xor) 2438 return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0)); 2439 } 2440 2441 // Look for: 2442 // add (add x, y), 1 2443 // And if the target does not like this form then turn into: 2444 // sub y, (xor x, -1) 2445 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() && 2446 N0.getOpcode() == ISD::ADD) { 2447 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), 2448 DAG.getAllOnesConstant(DL, VT)); 2449 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not); 2450 } 2451 } 2452 2453 // (x - y) + -1 -> add (xor y, -1), x 2454 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 2455 isAllOnesOrAllOnesSplat(N1)) { 2456 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1); 2457 return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0)); 2458 } 2459 2460 if (SDValue Combined = visitADDLikeCommutative(N0, N1, N)) 2461 return Combined; 2462 2463 if (SDValue Combined = visitADDLikeCommutative(N1, N0, N)) 2464 return Combined; 2465 2466 return SDValue(); 2467 } 2468 2469 SDValue DAGCombiner::visitADD(SDNode *N) { 2470 SDValue N0 = N->getOperand(0); 2471 SDValue N1 = N->getOperand(1); 2472 EVT VT = N0.getValueType(); 2473 SDLoc DL(N); 2474 2475 if (SDValue Combined = visitADDLike(N)) 2476 return Combined; 2477 2478 if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG)) 2479 return V; 2480 2481 if (SDValue V = foldAddSubOfSignBit(N, DAG)) 2482 return V; 2483 2484 // fold (a+b) -> (a|b) iff a and b share no bits. 2485 if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && 2486 DAG.haveNoCommonBitsSet(N0, N1)) 2487 return DAG.getNode(ISD::OR, DL, VT, N0, N1); 2488 2489 // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)). 
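  // For example, (add (vscale * 2), (vscale * 3)) --> (vscale * 5).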
2490 if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) { 2491 const APInt &C0 = N0->getConstantOperandAPInt(0); 2492 const APInt &C1 = N1->getConstantOperandAPInt(0); 2493 return DAG.getVScale(DL, VT, C0 + C1); 2494 } 2495 2496 // fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2) 2497 if ((N0.getOpcode() == ISD::ADD) && 2498 (N0.getOperand(1).getOpcode() == ISD::VSCALE) && 2499 (N1.getOpcode() == ISD::VSCALE)) { 2500 const APInt &VS0 = N0.getOperand(1)->getConstantOperandAPInt(0); 2501 const APInt &VS1 = N1->getConstantOperandAPInt(0); 2502 SDValue VS = DAG.getVScale(DL, VT, VS0 + VS1); 2503 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS); 2504 } 2505 2506 // Fold (add step_vector(c1), step_vector(c2) to step_vector(c1+c2)) 2507 if (N0.getOpcode() == ISD::STEP_VECTOR && 2508 N1.getOpcode() == ISD::STEP_VECTOR) { 2509 const APInt &C0 = N0->getConstantOperandAPInt(0); 2510 const APInt &C1 = N1->getConstantOperandAPInt(0); 2511 EVT SVT = N0.getOperand(0).getValueType(); 2512 SDValue NewStep = DAG.getConstant(C0 + C1, DL, SVT); 2513 return DAG.getStepVector(DL, VT, NewStep); 2514 } 2515 2516 // Fold a + step_vector(c1) + step_vector(c2) to a + step_vector(c1+c2) 2517 if ((N0.getOpcode() == ISD::ADD) && 2518 (N0.getOperand(1).getOpcode() == ISD::STEP_VECTOR) && 2519 (N1.getOpcode() == ISD::STEP_VECTOR)) { 2520 const APInt &SV0 = N0.getOperand(1)->getConstantOperandAPInt(0); 2521 const APInt &SV1 = N1->getConstantOperandAPInt(0); 2522 EVT SVT = N1.getOperand(0).getValueType(); 2523 assert(N1.getOperand(0).getValueType() == 2524 N0.getOperand(1)->getOperand(0).getValueType() && 2525 "Different operand types of STEP_VECTOR."); 2526 SDValue NewStep = DAG.getConstant(SV0 + SV1, DL, SVT); 2527 SDValue SV = DAG.getStepVector(DL, VT, NewStep); 2528 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), SV); 2529 } 2530 2531 return SDValue(); 2532 } 2533 2534 SDValue DAGCombiner::visitADDSAT(SDNode *N) { 2535 unsigned Opcode = N->getOpcode(); 2536 SDValue N0 = N->getOperand(0); 2537 SDValue N1 = N->getOperand(1); 2538 EVT VT = N0.getValueType(); 2539 SDLoc DL(N); 2540 2541 // fold vector ops 2542 if (VT.isVector()) { 2543 // TODO SimplifyVBinOp 2544 2545 // fold (add_sat x, 0) -> x, vector edition 2546 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) 2547 return N0; 2548 if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) 2549 return N1; 2550 } 2551 2552 // fold (add_sat x, undef) -> -1 2553 if (N0.isUndef() || N1.isUndef()) 2554 return DAG.getAllOnesConstant(DL, VT); 2555 2556 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { 2557 // canonicalize constant to RHS 2558 if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) 2559 return DAG.getNode(Opcode, DL, VT, N1, N0); 2560 // fold (add_sat c1, c2) -> c3 2561 return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}); 2562 } 2563 2564 // fold (add_sat x, 0) -> x 2565 if (isNullConstant(N1)) 2566 return N0; 2567 2568 // If it cannot overflow, transform into an add. 2569 if (Opcode == ISD::UADDSAT) 2570 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) 2571 return DAG.getNode(ISD::ADD, DL, VT, N0, N1); 2572 2573 return SDValue(); 2574 } 2575 2576 static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { 2577 bool Masked = false; 2578 2579 // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization. 
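  // For example, a carry that legalization has wrapped as
  //   (and (truncate Carry), 1)
  // is peeled back below to the underlying carry result, with the AND
  // remembered via the Masked flag.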
2580 while (true) { 2581 if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) { 2582 V = V.getOperand(0); 2583 continue; 2584 } 2585 2586 if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) { 2587 Masked = true; 2588 V = V.getOperand(0); 2589 continue; 2590 } 2591 2592 break; 2593 } 2594 2595 // If this is not a carry, return. 2596 if (V.getResNo() != 1) 2597 return SDValue(); 2598 2599 if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY && 2600 V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) 2601 return SDValue(); 2602 2603 EVT VT = V.getNode()->getValueType(0); 2604 if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT)) 2605 return SDValue(); 2606 2607 // If the result is masked, then no matter what kind of bool it is we can 2608 // return. If it isn't, then we need to make sure the bool type is either 0 or 2609 // 1 and not other values. 2610 if (Masked || 2611 TLI.getBooleanContents(V.getValueType()) == 2612 TargetLoweringBase::ZeroOrOneBooleanContent) 2613 return V; 2614 2615 return SDValue(); 2616 } 2617 2618 /// Given the operands of an add/sub operation, see if the 2nd operand is a 2619 /// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert 2620 /// the opcode and bypass the mask operation. 2621 static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1, 2622 SelectionDAG &DAG, const SDLoc &DL) { 2623 if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1))) 2624 return SDValue(); 2625 2626 EVT VT = N0.getValueType(); 2627 if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits()) 2628 return SDValue(); 2629 2630 // add N0, (and (AssertSext X, i1), 1) --> sub N0, X 2631 // sub N0, (and (AssertSext X, i1), 1) --> add N0, X 2632 return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0)); 2633 } 2634 2635 /// Helper for doing combines based on N0 and N1 being added to each other. 2636 SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1, 2637 SDNode *LocReference) { 2638 EVT VT = N0.getValueType(); 2639 SDLoc DL(LocReference); 2640 2641 // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) 2642 if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && 2643 isNullOrNullSplat(N1.getOperand(0).getOperand(0))) 2644 return DAG.getNode(ISD::SUB, DL, VT, N0, 2645 DAG.getNode(ISD::SHL, DL, VT, 2646 N1.getOperand(0).getOperand(1), 2647 N1.getOperand(1))); 2648 2649 if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL)) 2650 return V; 2651 2652 // Look for: 2653 // add (add x, 1), y 2654 // And if the target does not like this form then turn into: 2655 // sub y, (xor x, -1) 2656 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() && 2657 N0.getOpcode() == ISD::ADD && isOneOrOneSplat(N0.getOperand(1))) { 2658 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), 2659 DAG.getAllOnesConstant(DL, VT)); 2660 return DAG.getNode(ISD::SUB, DL, VT, N1, Not); 2661 } 2662 2663 // Hoist one-use subtraction by non-opaque constant: 2664 // (x - C) + y -> (x + y) - C 2665 // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. 
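  // For example: ((x - 42) + y) --> ((x + y) - 42).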
2666 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 2667 isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { 2668 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1); 2669 return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1)); 2670 } 2671 // Hoist one-use subtraction from non-opaque constant: 2672 // (C - x) + y -> (y - x) + C 2673 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 2674 isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) { 2675 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); 2676 return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0)); 2677 } 2678 2679 // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1' 2680 // rather than 'add 0/-1' (the zext should get folded). 2681 // add (sext i1 Y), X --> sub X, (zext i1 Y) 2682 if (N0.getOpcode() == ISD::SIGN_EXTEND && 2683 N0.getOperand(0).getScalarValueSizeInBits() == 1 && 2684 TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) { 2685 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); 2686 return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt); 2687 } 2688 2689 // add X, (sextinreg Y i1) -> sub X, (and Y 1) 2690 if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { 2691 VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1)); 2692 if (TN->getVT() == MVT::i1) { 2693 SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), 2694 DAG.getConstant(1, DL, VT)); 2695 return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt); 2696 } 2697 } 2698 2699 // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) 2700 if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) && 2701 N1.getResNo() == 0) 2702 return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), 2703 N0, N1.getOperand(0), N1.getOperand(2)); 2704 2705 // (add X, Carry) -> (addcarry X, 0, Carry) 2706 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) 2707 if (SDValue Carry = getAsCarry(TLI, N1)) 2708 return DAG.getNode(ISD::ADDCARRY, DL, 2709 DAG.getVTList(VT, Carry.getValueType()), N0, 2710 DAG.getConstant(0, DL, VT), Carry); 2711 2712 return SDValue(); 2713 } 2714 2715 SDValue DAGCombiner::visitADDC(SDNode *N) { 2716 SDValue N0 = N->getOperand(0); 2717 SDValue N1 = N->getOperand(1); 2718 EVT VT = N0.getValueType(); 2719 SDLoc DL(N); 2720 2721 // If the flag result is dead, turn this into an ADD. 2722 if (!N->hasAnyUseOfValue(1)) 2723 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), 2724 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); 2725 2726 // canonicalize constant to RHS. 2727 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); 2728 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 2729 if (N0C && !N1C) 2730 return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0); 2731 2732 // fold (addc x, 0) -> x + no carry out 2733 if (isNullConstant(N1)) 2734 return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, 2735 DL, MVT::Glue)); 2736 2737 // If it cannot overflow, transform into an add. 2738 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) 2739 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), 2740 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); 2741 2742 return SDValue(); 2743 } 2744 2745 /** 2746 * Flips a boolean if it is cheaper to compute. If the Force parameters is set, 2747 * then the flip also occurs if computing the inverse is the same cost. 2748 * This function returns an empty SDValue in case it cannot flip the boolean 2749 * without increasing the cost of the computation. 
If you want to flip a boolean 2750 * no matter what, use DAG.getLogicalNOT. 2751 */ 2752 static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG, 2753 const TargetLowering &TLI, 2754 bool Force) { 2755 if (Force && isa<ConstantSDNode>(V)) 2756 return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType()); 2757 2758 if (V.getOpcode() != ISD::XOR) 2759 return SDValue(); 2760 2761 ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false); 2762 if (!Const) 2763 return SDValue(); 2764 2765 EVT VT = V.getValueType(); 2766 2767 bool IsFlip = false; 2768 switch(TLI.getBooleanContents(VT)) { 2769 case TargetLowering::ZeroOrOneBooleanContent: 2770 IsFlip = Const->isOne(); 2771 break; 2772 case TargetLowering::ZeroOrNegativeOneBooleanContent: 2773 IsFlip = Const->isAllOnesValue(); 2774 break; 2775 case TargetLowering::UndefinedBooleanContent: 2776 IsFlip = (Const->getAPIntValue() & 0x01) == 1; 2777 break; 2778 } 2779 2780 if (IsFlip) 2781 return V.getOperand(0); 2782 if (Force) 2783 return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType()); 2784 return SDValue(); 2785 } 2786 2787 SDValue DAGCombiner::visitADDO(SDNode *N) { 2788 SDValue N0 = N->getOperand(0); 2789 SDValue N1 = N->getOperand(1); 2790 EVT VT = N0.getValueType(); 2791 bool IsSigned = (ISD::SADDO == N->getOpcode()); 2792 2793 EVT CarryVT = N->getValueType(1); 2794 SDLoc DL(N); 2795 2796 // If the flag result is dead, turn this into an ADD. 2797 if (!N->hasAnyUseOfValue(1)) 2798 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), 2799 DAG.getUNDEF(CarryVT)); 2800 2801 // canonicalize constant to RHS. 2802 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 2803 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 2804 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0); 2805 2806 // fold (addo x, 0) -> x + no carry out 2807 if (isNullOrNullSplat(N1)) 2808 return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); 2809 2810 if (!IsSigned) { 2811 // If it cannot overflow, transform into an add. 2812 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) 2813 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), 2814 DAG.getConstant(0, DL, CarryVT)); 2815 2816 // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry. 2817 if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) { 2818 SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(), 2819 DAG.getConstant(0, DL, VT), N0.getOperand(0)); 2820 return CombineTo( 2821 N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1))); 2822 } 2823 2824 if (SDValue Combined = visitUADDOLike(N0, N1, N)) 2825 return Combined; 2826 2827 if (SDValue Combined = visitUADDOLike(N1, N0, N)) 2828 return Combined; 2829 } 2830 2831 return SDValue(); 2832 } 2833 2834 SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { 2835 EVT VT = N0.getValueType(); 2836 if (VT.isVector()) 2837 return SDValue(); 2838 2839 // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) 2840 // If Y + 1 cannot overflow. 
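  // (If Y were the all-ones value and Carry were set, the inner sum would wrap
  // and its lost carry-out would change the carry result of the combined node,
  // hence the overflow check below.)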
2841 if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { 2842 SDValue Y = N1.getOperand(0); 2843 SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType()); 2844 if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never) 2845 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y, 2846 N1.getOperand(2)); 2847 } 2848 2849 // (uaddo X, Carry) -> (addcarry X, 0, Carry) 2850 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) 2851 if (SDValue Carry = getAsCarry(TLI, N1)) 2852 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, 2853 DAG.getConstant(0, SDLoc(N), VT), Carry); 2854 2855 return SDValue(); 2856 } 2857 2858 SDValue DAGCombiner::visitADDE(SDNode *N) { 2859 SDValue N0 = N->getOperand(0); 2860 SDValue N1 = N->getOperand(1); 2861 SDValue CarryIn = N->getOperand(2); 2862 2863 // canonicalize constant to RHS 2864 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); 2865 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 2866 if (N0C && !N1C) 2867 return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(), 2868 N1, N0, CarryIn); 2869 2870 // fold (adde x, y, false) -> (addc x, y) 2871 if (CarryIn.getOpcode() == ISD::CARRY_FALSE) 2872 return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1); 2873 2874 return SDValue(); 2875 } 2876 2877 SDValue DAGCombiner::visitADDCARRY(SDNode *N) { 2878 SDValue N0 = N->getOperand(0); 2879 SDValue N1 = N->getOperand(1); 2880 SDValue CarryIn = N->getOperand(2); 2881 SDLoc DL(N); 2882 2883 // canonicalize constant to RHS 2884 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); 2885 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 2886 if (N0C && !N1C) 2887 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); 2888 2889 // fold (addcarry x, y, false) -> (uaddo x, y) 2890 if (isNullConstant(CarryIn)) { 2891 if (!LegalOperations || 2892 TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0))) 2893 return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); 2894 } 2895 2896 // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry. 
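  // With both addends zero, the sum is just the incoming carry (0 or 1), so it
  // becomes the boolean extended/truncated to the result type and masked to a
  // single bit, with a constant-zero carry-out.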
2897   if (isNullConstant(N0) && isNullConstant(N1)) {
2898     EVT VT = N0.getValueType();
2899     EVT CarryVT = CarryIn.getValueType();
2900     SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
2901     AddToWorklist(CarryExt.getNode());
2902     return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
2903                                     DAG.getConstant(1, DL, VT)),
2904                      DAG.getConstant(0, DL, CarryVT));
2905   }
2906 
2907   if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
2908     return Combined;
2909 
2910   if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
2911     return Combined;
2912 
2913   return SDValue();
2914 }
2915 
2916 SDValue DAGCombiner::visitSADDO_CARRY(SDNode *N) {
2917   SDValue N0 = N->getOperand(0);
2918   SDValue N1 = N->getOperand(1);
2919   SDValue CarryIn = N->getOperand(2);
2920   SDLoc DL(N);
2921 
2922   // canonicalize constant to RHS
2923   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
2924   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
2925   if (N0C && !N1C)
2926     return DAG.getNode(ISD::SADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);
2927 
2928   // fold (saddo_carry x, y, false) -> (saddo x, y)
2929   if (isNullConstant(CarryIn)) {
2930     if (!LegalOperations ||
2931         TLI.isOperationLegalOrCustom(ISD::SADDO, N->getValueType(0)))
2932       return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, N1);
2933   }
2934 
2935   return SDValue();
2936 }
2937 
2938 /**
2939  * If we are facing some sort of diamond carry propagation pattern try to
2940  * break it up to generate something like:
2941  *   (addcarry X, 0, (addcarry A, B, Z):Carry)
2942  *
2943  * The end result is usually an increase in operations required, but because the
2944  * carry is now linearized, other transforms can kick in and optimize the DAG.
2945  *
2946  * Patterns typically look something like
2947  *            (uaddo A, B)
2948  *             /       \
2949  *          Carry      Sum
2950  *            |          \
2951  *            | (addcarry *, 0, Z)
2952  *            |       /
2953  *             \   Carry
2954  *              |   /
2955  *           (addcarry X, *, *)
2956  *
2957  * But numerous variations exist. Our goal is to identify A, B, X and Z and
2958  * produce a combine with a single path for carry propagation.
2959  */
2960 static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
2961                                       SDValue X, SDValue Carry0, SDValue Carry1,
2962                                       SDNode *N) {
2963   if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
2964     return SDValue();
2965   if (Carry1.getOpcode() != ISD::UADDO)
2966     return SDValue();
2967 
2968   SDValue Z;
2969 
2970   /**
2971    * First look for a suitable Z. It will present itself in the form of
2972    * (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
2973    */
2974   if (Carry0.getOpcode() == ISD::ADDCARRY &&
2975       isNullConstant(Carry0.getOperand(1))) {
2976     Z = Carry0.getOperand(2);
2977   } else if (Carry0.getOpcode() == ISD::UADDO &&
2978              isOneConstant(Carry0.getOperand(1))) {
2979     EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
2980     Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
2981   } else {
2982     // We couldn't find a suitable Z.
2983 return SDValue(); 2984 } 2985 2986 2987 auto cancelDiamond = [&](SDValue A,SDValue B) { 2988 SDLoc DL(N); 2989 SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z); 2990 Combiner.AddToWorklist(NewY.getNode()); 2991 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X, 2992 DAG.getConstant(0, DL, X.getValueType()), 2993 NewY.getValue(1)); 2994 }; 2995 2996 /** 2997 * (uaddo A, B) 2998 * | 2999 * Sum 3000 * | 3001 * (addcarry *, 0, Z) 3002 */ 3003 if (Carry0.getOperand(0) == Carry1.getValue(0)) { 3004 return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1)); 3005 } 3006 3007 /** 3008 * (addcarry A, 0, Z) 3009 * | 3010 * Sum 3011 * | 3012 * (uaddo *, B) 3013 */ 3014 if (Carry1.getOperand(0) == Carry0.getValue(0)) { 3015 return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1)); 3016 } 3017 3018 if (Carry1.getOperand(1) == Carry0.getValue(0)) { 3019 return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0)); 3020 } 3021 3022 return SDValue(); 3023 } 3024 3025 // If we are facing some sort of diamond carry/borrow in/out pattern try to 3026 // match patterns like: 3027 // 3028 // (uaddo A, B) CarryIn 3029 // | \ | 3030 // | \ | 3031 // PartialSum PartialCarryOutX / 3032 // | | / 3033 // | ____|____________/ 3034 // | / | 3035 // (uaddo *, *) \________ 3036 // | \ \ 3037 // | \ | 3038 // | PartialCarryOutY | 3039 // | \ | 3040 // | \ / 3041 // AddCarrySum | ______/ 3042 // | / 3043 // CarryOut = (or *, *) 3044 // 3045 // And generate ADDCARRY (or SUBCARRY) with two result values: 3046 // 3047 // {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn) 3048 // 3049 // Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with 3050 // a single path for carry/borrow out propagation: 3051 static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, 3052 const TargetLowering &TLI, SDValue Carry0, 3053 SDValue Carry1, SDNode *N) { 3054 if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1) 3055 return SDValue(); 3056 unsigned Opcode = Carry0.getOpcode(); 3057 if (Opcode != Carry1.getOpcode()) 3058 return SDValue(); 3059 if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) 3060 return SDValue(); 3061 3062 // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the 3063 // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in 3064 // the above ASCII art.) 3065 if (Carry1.getOperand(0) != Carry0.getValue(0) && 3066 Carry1.getOperand(1) != Carry0.getValue(0)) 3067 std::swap(Carry0, Carry1); 3068 if (Carry1.getOperand(0) != Carry0.getValue(0) && 3069 Carry1.getOperand(1) != Carry0.getValue(0)) 3070 return SDValue(); 3071 3072 // The carry in value must be on the righthand side for subtraction. 3073 unsigned CarryInOperandNum = 3074 Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0; 3075 if (Opcode == ISD::USUBO && CarryInOperandNum != 1) 3076 return SDValue(); 3077 SDValue CarryIn = Carry1.getOperand(CarryInOperandNum); 3078 3079 unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY; 3080 if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType())) 3081 return SDValue(); 3082 3083 // Verify that the carry/borrow in is plausibly a carry/borrow bit. 3084 // TODO: make getAsCarry() aware of how partial carries are merged. 
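  // For now only a carry/borrow produced as (zero_extend i1 C) is accepted;
  // C itself then becomes the carry-in operand of the merged node below.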
3085 if (CarryIn.getOpcode() != ISD::ZERO_EXTEND) 3086 return SDValue(); 3087 CarryIn = CarryIn.getOperand(0); 3088 if (CarryIn.getValueType() != MVT::i1) 3089 return SDValue(); 3090 3091 SDLoc DL(N); 3092 SDValue Merged = 3093 DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0), 3094 Carry0.getOperand(1), CarryIn); 3095 3096 // Please note that because we have proven that the result of the UADDO/USUBO 3097 // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can 3098 // therefore prove that if the first UADDO/USUBO overflows, the second 3099 // UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the 3100 // maximum value. 3101 // 3102 // 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry 3103 // 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow) 3104 // 3105 // This is important because it means that OR and XOR can be used to merge 3106 // carry flags; and that AND can return a constant zero. 3107 // 3108 // TODO: match other operations that can merge flags (ADD, etc) 3109 DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0)); 3110 if (N->getOpcode() == ISD::AND) 3111 return DAG.getConstant(0, DL, MVT::i1); 3112 return Merged.getValue(1); 3113 } 3114 3115 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, 3116 SDNode *N) { 3117 // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry. 3118 if (isBitwiseNot(N0)) 3119 if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) { 3120 SDLoc DL(N); 3121 SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1, 3122 N0.getOperand(0), NotC); 3123 return CombineTo( 3124 N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1))); 3125 } 3126 3127 // Iff the flag result is dead: 3128 // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry) 3129 // Don't do this if the Carry comes from the uaddo. It won't remove the uaddo 3130 // or the dependency between the instructions. 3131 if ((N0.getOpcode() == ISD::ADD || 3132 (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 && 3133 N0.getValue(1) != CarryIn)) && 3134 isNullConstant(N1) && !N->hasAnyUseOfValue(1)) 3135 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), 3136 N0.getOperand(0), N0.getOperand(1), CarryIn); 3137 3138 /** 3139 * When one of the addcarry argument is itself a carry, we may be facing 3140 * a diamond carry propagation. In which case we try to transform the DAG 3141 * to ensure linear carry propagation if that is possible. 3142 */ 3143 if (auto Y = getAsCarry(TLI, N1)) { 3144 // Because both are carries, Y and Z can be swapped. 3145 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N)) 3146 return R; 3147 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N)) 3148 return R; 3149 } 3150 3151 return SDValue(); 3152 } 3153 3154 // Attempt to create a USUBSAT(LHS, RHS) node with DstVT, performing a 3155 // clamp/truncation if necessary. 3156 static SDValue getTruncatedUSUBSAT(EVT DstVT, EVT SrcVT, SDValue LHS, 3157 SDValue RHS, SelectionDAG &DAG, 3158 const SDLoc &DL) { 3159 assert(DstVT.getScalarSizeInBits() <= SrcVT.getScalarSizeInBits() && 3160 "Illegal truncation"); 3161 3162 if (DstVT == SrcVT) 3163 return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS); 3164 3165 // If the LHS is zero-extended then we can perform the USUBSAT as DstVT by 3166 // clamping RHS. 
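// Illustrative example (assumption, not part of the original comment): with
// SrcVT = i16, DstVT = i8 and LHS known to fit in 8 bits,
//   trunc i8 (usubsat i16 LHS, RHS)
//     == usubsat i8 (trunc LHS), (trunc (umin RHS, 255))
// since any RHS above 255 already saturates the wide subtraction to 0.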
3167 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(), 3168 DstVT.getScalarSizeInBits()); 3169 if (!DAG.MaskedValueIsZero(LHS, UpperBits)) 3170 return SDValue(); 3171 3172 SDValue SatLimit = 3173 DAG.getConstant(APInt::getLowBitsSet(SrcVT.getScalarSizeInBits(), 3174 DstVT.getScalarSizeInBits()), 3175 DL, SrcVT); 3176 RHS = DAG.getNode(ISD::UMIN, DL, SrcVT, RHS, SatLimit); 3177 RHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, RHS); 3178 LHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, LHS); 3179 return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS); 3180 } 3181 3182 // Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted to 3183 // usubsat(a,b), optionally as a truncated type. 3184 SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) { 3185 if (N->getOpcode() != ISD::SUB || 3186 !(!LegalOperations || hasOperation(ISD::USUBSAT, DstVT))) 3187 return SDValue(); 3188 3189 EVT SubVT = N->getValueType(0); 3190 SDValue Op0 = N->getOperand(0); 3191 SDValue Op1 = N->getOperand(1); 3192 3193 // Try to find umax(a,b) - b or a - umin(a,b) patterns 3194 // they may be converted to usubsat(a,b). 3195 if (Op0.getOpcode() == ISD::UMAX) { 3196 SDValue MaxLHS = Op0.getOperand(0); 3197 SDValue MaxRHS = Op0.getOperand(1); 3198 if (MaxLHS == Op1) 3199 return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, SDLoc(N)); 3200 if (MaxRHS == Op1) 3201 return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, SDLoc(N)); 3202 } 3203 3204 if (Op1.getOpcode() == ISD::UMIN) { 3205 SDValue MinLHS = Op1.getOperand(0); 3206 SDValue MinRHS = Op1.getOperand(1); 3207 if (MinLHS == Op0) 3208 return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, SDLoc(N)); 3209 if (MinRHS == Op0) 3210 return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, SDLoc(N)); 3211 } 3212 3213 // sub(a,trunc(umin(zext(a),b))) -> usubsat(a,trunc(umin(b,SatLimit))) 3214 if (Op1.getOpcode() == ISD::TRUNCATE && 3215 Op1.getOperand(0).getOpcode() == ISD::UMIN) { 3216 SDValue MinLHS = Op1.getOperand(0).getOperand(0); 3217 SDValue MinRHS = Op1.getOperand(0).getOperand(1); 3218 if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && MinLHS.getOperand(0) == Op0) 3219 return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinLHS, MinRHS, 3220 DAG, SDLoc(N)); 3221 if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && MinRHS.getOperand(0) == Op0) 3222 return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinRHS, MinLHS, 3223 DAG, SDLoc(N)); 3224 } 3225 3226 return SDValue(); 3227 } 3228 3229 // Since it may not be valid to emit a fold to zero for vector initializers 3230 // check if we can before folding. 3231 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT, 3232 SelectionDAG &DAG, bool LegalOperations) { 3233 if (!VT.isVector()) 3234 return DAG.getConstant(0, DL, VT); 3235 if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) 3236 return DAG.getConstant(0, DL, VT); 3237 return SDValue(); 3238 } 3239 3240 SDValue DAGCombiner::visitSUB(SDNode *N) { 3241 SDValue N0 = N->getOperand(0); 3242 SDValue N1 = N->getOperand(1); 3243 EVT VT = N0.getValueType(); 3244 SDLoc DL(N); 3245 3246 // fold vector ops 3247 if (VT.isVector()) { 3248 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 3249 return FoldedVOp; 3250 3251 // fold (sub x, 0) -> x, vector edition 3252 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) 3253 return N0; 3254 } 3255 3256 // fold (sub x, x) -> 0 3257 // FIXME: Refactor this and xor and other similar operations together. 
3258 if (N0 == N1) 3259 return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); 3260 3261 // fold (sub c1, c2) -> c3 3262 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1})) 3263 return C; 3264 3265 if (SDValue NewSel = foldBinOpIntoSelect(N)) 3266 return NewSel; 3267 3268 ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); 3269 3270 // fold (sub x, c) -> (add x, -c) 3271 if (N1C) { 3272 return DAG.getNode(ISD::ADD, DL, VT, N0, 3273 DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); 3274 } 3275 3276 if (isNullOrNullSplat(N0)) { 3277 unsigned BitWidth = VT.getScalarSizeInBits(); 3278 // Right-shifting everything out but the sign bit followed by negation is 3279 // the same as flipping arithmetic/logical shift type without the negation: 3280 // -(X >>u 31) -> (X >>s 31) 3281 // -(X >>s 31) -> (X >>u 31) 3282 if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) { 3283 ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1)); 3284 if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) { 3285 auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA; 3286 if (!LegalOperations || TLI.isOperationLegal(NewSh, VT)) 3287 return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1)); 3288 } 3289 } 3290 3291 // 0 - X --> 0 if the sub is NUW. 3292 if (N->getFlags().hasNoUnsignedWrap()) 3293 return N0; 3294 3295 if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) { 3296 // N1 is either 0 or the minimum signed value. If the sub is NSW, then 3297 // N1 must be 0 because negating the minimum signed value is undefined. 3298 if (N->getFlags().hasNoSignedWrap()) 3299 return N0; 3300 3301 // 0 - X --> X if X is 0 or the minimum signed value. 3302 return N1; 3303 } 3304 3305 // Convert 0 - abs(x). 3306 SDValue Result; 3307 if (N1->getOpcode() == ISD::ABS && 3308 !TLI.isOperationLegalOrCustom(ISD::ABS, VT) && 3309 TLI.expandABS(N1.getNode(), Result, DAG, true)) 3310 return Result; 3311 } 3312 3313 // Canonicalize (sub -1, x) -> ~x, i.e. 
(xor x, -1) 3314 if (isAllOnesOrAllOnesSplat(N0)) 3315 return DAG.getNode(ISD::XOR, DL, VT, N1, N0); 3316 3317 // fold (A - (0-B)) -> A+B 3318 if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0))) 3319 return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1)); 3320 3321 // fold A-(A-B) -> B 3322 if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0)) 3323 return N1.getOperand(1); 3324 3325 // fold (A+B)-A -> B 3326 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1) 3327 return N0.getOperand(1); 3328 3329 // fold (A+B)-B -> A 3330 if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1) 3331 return N0.getOperand(0); 3332 3333 // fold (A+C1)-C2 -> A+(C1-C2) 3334 if (N0.getOpcode() == ISD::ADD && 3335 isConstantOrConstantVector(N1, /* NoOpaques */ true) && 3336 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { 3337 SDValue NewC = 3338 DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1}); 3339 assert(NewC && "Constant folding failed"); 3340 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC); 3341 } 3342 3343 // fold C2-(A+C1) -> (C2-C1)-A 3344 if (N1.getOpcode() == ISD::ADD) { 3345 SDValue N11 = N1.getOperand(1); 3346 if (isConstantOrConstantVector(N0, /* NoOpaques */ true) && 3347 isConstantOrConstantVector(N11, /* NoOpaques */ true)) { 3348 SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11}); 3349 assert(NewC && "Constant folding failed"); 3350 return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0)); 3351 } 3352 } 3353 3354 // fold (A-C1)-C2 -> A-(C1+C2) 3355 if (N0.getOpcode() == ISD::SUB && 3356 isConstantOrConstantVector(N1, /* NoOpaques */ true) && 3357 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { 3358 SDValue NewC = 3359 DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1}); 3360 assert(NewC && "Constant folding failed"); 3361 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC); 3362 } 3363 3364 // fold (c1-A)-c2 -> (c1-c2)-A 3365 if (N0.getOpcode() == ISD::SUB && 3366 isConstantOrConstantVector(N1, /* NoOpaques */ true) && 3367 isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) { 3368 SDValue NewC = 3369 DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1}); 3370 assert(NewC && "Constant folding failed"); 3371 return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1)); 3372 } 3373 3374 // fold ((A+(B+or-C))-B) -> A+or-C 3375 if (N0.getOpcode() == ISD::ADD && 3376 (N0.getOperand(1).getOpcode() == ISD::SUB || 3377 N0.getOperand(1).getOpcode() == ISD::ADD) && 3378 N0.getOperand(1).getOperand(0) == N1) 3379 return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0), 3380 N0.getOperand(1).getOperand(1)); 3381 3382 // fold ((A+(C+B))-B) -> A+C 3383 if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD && 3384 N0.getOperand(1).getOperand(1) == N1) 3385 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), 3386 N0.getOperand(1).getOperand(0)); 3387 3388 // fold ((A-(B-C))-C) -> A-B 3389 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB && 3390 N0.getOperand(1).getOperand(1) == N1) 3391 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), 3392 N0.getOperand(1).getOperand(0)); 3393 3394 // fold (A-(B-C)) -> A+(C-B) 3395 if (N1.getOpcode() == ISD::SUB && N1.hasOneUse()) 3396 return DAG.getNode(ISD::ADD, DL, VT, N0, 3397 DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1), 3398 N1.getOperand(0))); 3399 3400 // A - (A & B) -> A & (~B) 3401 if (N1.getOpcode() == 
ISD::AND) { 3402 SDValue A = N1.getOperand(0); 3403 SDValue B = N1.getOperand(1); 3404 if (A != N0) 3405 std::swap(A, B); 3406 if (A == N0 && 3407 (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) { 3408 SDValue InvB = 3409 DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT)); 3410 return DAG.getNode(ISD::AND, DL, VT, A, InvB); 3411 } 3412 } 3413 3414 // fold (X - (-Y * Z)) -> (X + (Y * Z)) 3415 if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) { 3416 if (N1.getOperand(0).getOpcode() == ISD::SUB && 3417 isNullOrNullSplat(N1.getOperand(0).getOperand(0))) { 3418 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, 3419 N1.getOperand(0).getOperand(1), 3420 N1.getOperand(1)); 3421 return DAG.getNode(ISD::ADD, DL, VT, N0, Mul); 3422 } 3423 if (N1.getOperand(1).getOpcode() == ISD::SUB && 3424 isNullOrNullSplat(N1.getOperand(1).getOperand(0))) { 3425 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, 3426 N1.getOperand(0), 3427 N1.getOperand(1).getOperand(1)); 3428 return DAG.getNode(ISD::ADD, DL, VT, N0, Mul); 3429 } 3430 } 3431 3432 // If either operand of a sub is undef, the result is undef 3433 if (N0.isUndef()) 3434 return N0; 3435 if (N1.isUndef()) 3436 return N1; 3437 3438 if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG)) 3439 return V; 3440 3441 if (SDValue V = foldAddSubOfSignBit(N, DAG)) 3442 return V; 3443 3444 if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N))) 3445 return V; 3446 3447 if (SDValue V = foldSubToUSubSat(VT, N)) 3448 return V; 3449 3450 // (x - y) - 1 -> add (xor y, -1), x 3451 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) { 3452 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), 3453 DAG.getAllOnesConstant(DL, VT)); 3454 return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0)); 3455 } 3456 3457 // Look for: 3458 // sub y, (xor x, -1) 3459 // And if the target does not like this form then turn into: 3460 // add (add x, y), 1 3461 if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) { 3462 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0)); 3463 return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT)); 3464 } 3465 3466 // Hoist one-use addition by non-opaque constant: 3467 // (x + C) - y -> (x - y) + C 3468 if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD && 3469 isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { 3470 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); 3471 return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1)); 3472 } 3473 // y - (x + C) -> (y - x) - C 3474 if (N1.hasOneUse() && N1.getOpcode() == ISD::ADD && 3475 isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) { 3476 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0)); 3477 return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1)); 3478 } 3479 // (x - C) - y -> (x - y) - C 3480 // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. 
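// (Presumably because the scalar fold above, (sub x, c) -> (add x, -c), only
// matches a scalar ConstantSDNode and therefore never fires for splat vector
// constants.)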
3481 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 3482 isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { 3483 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); 3484 return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1)); 3485 } 3486 // (C - x) - y -> C - (x + y) 3487 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 3488 isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) { 3489 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1); 3490 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add); 3491 } 3492 3493 // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1' 3494 // rather than 'sub 0/1' (the sext should get folded). 3495 // sub X, (zext i1 Y) --> add X, (sext i1 Y) 3496 if (N1.getOpcode() == ISD::ZERO_EXTEND && 3497 N1.getOperand(0).getScalarValueSizeInBits() == 1 && 3498 TLI.getBooleanContents(VT) == 3499 TargetLowering::ZeroOrNegativeOneBooleanContent) { 3500 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0)); 3501 return DAG.getNode(ISD::ADD, DL, VT, N0, SExt); 3502 } 3503 3504 // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X) 3505 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { 3506 if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) { 3507 SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1); 3508 SDValue S0 = N1.getOperand(0); 3509 if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) 3510 if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) 3511 if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1)) 3512 return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0); 3513 } 3514 } 3515 3516 // If the relocation model supports it, consider symbol offsets. 3517 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0)) 3518 if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) { 3519 // fold (sub Sym, c) -> Sym-c 3520 if (N1C && GA->getOpcode() == ISD::GlobalAddress) 3521 return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT, 3522 GA->getOffset() - 3523 (uint64_t)N1C->getSExtValue()); 3524 // fold (sub Sym+c1, Sym+c2) -> c1-c2 3525 if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1)) 3526 if (GA->getGlobal() == GB->getGlobal()) 3527 return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(), 3528 DL, VT); 3529 } 3530 3531 // sub X, (sextinreg Y i1) -> add X, (and Y 1) 3532 if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { 3533 VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1)); 3534 if (TN->getVT() == MVT::i1) { 3535 SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), 3536 DAG.getConstant(1, DL, VT)); 3537 return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt); 3538 } 3539 } 3540 3541 // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C)) 3542 if (N1.getOpcode() == ISD::VSCALE) { 3543 const APInt &IntVal = N1.getConstantOperandAPInt(0); 3544 return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal)); 3545 } 3546 3547 // Prefer an add for more folding potential and possibly better codegen: 3548 // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1) 3549 if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) { 3550 SDValue ShAmt = N1.getOperand(1); 3551 ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt); 3552 if (ShAmtC && 3553 ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) { 3554 SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt); 3555 return DAG.getNode(ISD::ADD, DL, VT, N0, SRA); 3556 } 3557 } 3558 3559 if 
(TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) { 3560 // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry) 3561 if (SDValue Carry = getAsCarry(TLI, N0)) { 3562 SDValue X = N1; 3563 SDValue Zero = DAG.getConstant(0, DL, VT); 3564 SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X); 3565 return DAG.getNode(ISD::ADDCARRY, DL, 3566 DAG.getVTList(VT, Carry.getValueType()), NegX, Zero, 3567 Carry); 3568 } 3569 } 3570 3571 return SDValue(); 3572 } 3573 3574 SDValue DAGCombiner::visitSUBSAT(SDNode *N) { 3575 SDValue N0 = N->getOperand(0); 3576 SDValue N1 = N->getOperand(1); 3577 EVT VT = N0.getValueType(); 3578 SDLoc DL(N); 3579 3580 // fold vector ops 3581 if (VT.isVector()) { 3582 // TODO SimplifyVBinOp 3583 3584 // fold (sub_sat x, 0) -> x, vector edition 3585 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) 3586 return N0; 3587 } 3588 3589 // fold (sub_sat x, undef) -> 0 3590 if (N0.isUndef() || N1.isUndef()) 3591 return DAG.getConstant(0, DL, VT); 3592 3593 // fold (sub_sat x, x) -> 0 3594 if (N0 == N1) 3595 return DAG.getConstant(0, DL, VT); 3596 3597 // fold (sub_sat c1, c2) -> c3 3598 if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1})) 3599 return C; 3600 3601 // fold (sub_sat x, 0) -> x 3602 if (isNullConstant(N1)) 3603 return N0; 3604 3605 return SDValue(); 3606 } 3607 3608 SDValue DAGCombiner::visitSUBC(SDNode *N) { 3609 SDValue N0 = N->getOperand(0); 3610 SDValue N1 = N->getOperand(1); 3611 EVT VT = N0.getValueType(); 3612 SDLoc DL(N); 3613 3614 // If the flag result is dead, turn this into an SUB. 3615 if (!N->hasAnyUseOfValue(1)) 3616 return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), 3617 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); 3618 3619 // fold (subc x, x) -> 0 + no borrow 3620 if (N0 == N1) 3621 return CombineTo(N, DAG.getConstant(0, DL, VT), 3622 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); 3623 3624 // fold (subc x, 0) -> x + no borrow 3625 if (isNullConstant(N1)) 3626 return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); 3627 3628 // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow 3629 if (isAllOnesConstant(N0)) 3630 return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), 3631 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); 3632 3633 return SDValue(); 3634 } 3635 3636 SDValue DAGCombiner::visitSUBO(SDNode *N) { 3637 SDValue N0 = N->getOperand(0); 3638 SDValue N1 = N->getOperand(1); 3639 EVT VT = N0.getValueType(); 3640 bool IsSigned = (ISD::SSUBO == N->getOpcode()); 3641 3642 EVT CarryVT = N->getValueType(1); 3643 SDLoc DL(N); 3644 3645 // If the flag result is dead, turn this into an SUB. 3646 if (!N->hasAnyUseOfValue(1)) 3647 return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), 3648 DAG.getUNDEF(CarryVT)); 3649 3650 // fold (subo x, x) -> 0 + no borrow 3651 if (N0 == N1) 3652 return CombineTo(N, DAG.getConstant(0, DL, VT), 3653 DAG.getConstant(0, DL, CarryVT)); 3654 3655 ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); 3656 3657 // fold (subox, c) -> (addo x, -c) 3658 if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) { 3659 return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, 3660 DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); 3661 } 3662 3663 // fold (subo x, 0) -> x + no borrow 3664 if (isNullOrNullSplat(N1)) 3665 return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); 3666 3667 // Canonicalize (usubo -1, x) -> ~x, i.e. 
(xor x, -1) + no borrow 3668 if (!IsSigned && isAllOnesOrAllOnesSplat(N0)) 3669 return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), 3670 DAG.getConstant(0, DL, CarryVT)); 3671 3672 return SDValue(); 3673 } 3674 3675 SDValue DAGCombiner::visitSUBE(SDNode *N) { 3676 SDValue N0 = N->getOperand(0); 3677 SDValue N1 = N->getOperand(1); 3678 SDValue CarryIn = N->getOperand(2); 3679 3680 // fold (sube x, y, false) -> (subc x, y) 3681 if (CarryIn.getOpcode() == ISD::CARRY_FALSE) 3682 return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1); 3683 3684 return SDValue(); 3685 } 3686 3687 SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { 3688 SDValue N0 = N->getOperand(0); 3689 SDValue N1 = N->getOperand(1); 3690 SDValue CarryIn = N->getOperand(2); 3691 3692 // fold (subcarry x, y, false) -> (usubo x, y) 3693 if (isNullConstant(CarryIn)) { 3694 if (!LegalOperations || 3695 TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0))) 3696 return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); 3697 } 3698 3699 return SDValue(); 3700 } 3701 3702 SDValue DAGCombiner::visitSSUBO_CARRY(SDNode *N) { 3703 SDValue N0 = N->getOperand(0); 3704 SDValue N1 = N->getOperand(1); 3705 SDValue CarryIn = N->getOperand(2); 3706 3707 // fold (ssubo_carry x, y, false) -> (ssubo x, y) 3708 if (isNullConstant(CarryIn)) { 3709 if (!LegalOperations || 3710 TLI.isOperationLegalOrCustom(ISD::SSUBO, N->getValueType(0))) 3711 return DAG.getNode(ISD::SSUBO, SDLoc(N), N->getVTList(), N0, N1); 3712 } 3713 3714 return SDValue(); 3715 } 3716 3717 // Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and 3718 // UMULFIXSAT here. 3719 SDValue DAGCombiner::visitMULFIX(SDNode *N) { 3720 SDValue N0 = N->getOperand(0); 3721 SDValue N1 = N->getOperand(1); 3722 SDValue Scale = N->getOperand(2); 3723 EVT VT = N0.getValueType(); 3724 3725 // fold (mulfix x, undef, scale) -> 0 3726 if (N0.isUndef() || N1.isUndef()) 3727 return DAG.getConstant(0, SDLoc(N), VT); 3728 3729 // Canonicalize constant to RHS (vector doesn't have to splat) 3730 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 3731 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 3732 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale); 3733 3734 // fold (mulfix x, 0, scale) -> 0 3735 if (isNullConstant(N1)) 3736 return DAG.getConstant(0, SDLoc(N), VT); 3737 3738 return SDValue(); 3739 } 3740 3741 SDValue DAGCombiner::visitMUL(SDNode *N) { 3742 SDValue N0 = N->getOperand(0); 3743 SDValue N1 = N->getOperand(1); 3744 EVT VT = N0.getValueType(); 3745 3746 // fold (mul x, undef) -> 0 3747 if (N0.isUndef() || N1.isUndef()) 3748 return DAG.getConstant(0, SDLoc(N), VT); 3749 3750 bool N1IsConst = false; 3751 bool N1IsOpaqueConst = false; 3752 APInt ConstValue1; 3753 3754 // fold vector ops 3755 if (VT.isVector()) { 3756 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 3757 return FoldedVOp; 3758 3759 N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1); 3760 assert((!N1IsConst || 3761 ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) && 3762 "Splat APInt should be element width"); 3763 } else { 3764 N1IsConst = isa<ConstantSDNode>(N1); 3765 if (N1IsConst) { 3766 ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue(); 3767 N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque(); 3768 } 3769 } 3770 3771 // fold (mul c1, c2) -> c1*c2 3772 if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1})) 3773 return C; 3774 3775 // canonicalize constant to RHS (vector doesn't have to splat) 3776 if 
(DAG.isConstantIntBuildVectorOrConstantInt(N0) && 3777 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 3778 return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); 3779 3780 // fold (mul x, 0) -> 0 3781 if (N1IsConst && ConstValue1.isNullValue()) 3782 return N1; 3783 3784 // fold (mul x, 1) -> x 3785 if (N1IsConst && ConstValue1.isOneValue()) 3786 return N0; 3787 3788 if (SDValue NewSel = foldBinOpIntoSelect(N)) 3789 return NewSel; 3790 3791 // fold (mul x, -1) -> 0-x 3792 if (N1IsConst && ConstValue1.isAllOnesValue()) { 3793 SDLoc DL(N); 3794 return DAG.getNode(ISD::SUB, DL, VT, 3795 DAG.getConstant(0, DL, VT), N0); 3796 } 3797 3798 // fold (mul x, (1 << c)) -> x << c 3799 if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && 3800 DAG.isKnownToBeAPowerOfTwo(N1) && 3801 (!VT.isVector() || Level <= AfterLegalizeVectorOps)) { 3802 SDLoc DL(N); 3803 SDValue LogBase2 = BuildLogBase2(N1, DL); 3804 EVT ShiftVT = getShiftAmountTy(N0.getValueType()); 3805 SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); 3806 return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc); 3807 } 3808 3809 // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c 3810 if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) { 3811 unsigned Log2Val = (-ConstValue1).logBase2(); 3812 SDLoc DL(N); 3813 // FIXME: If the input is something that is easily negated (e.g. a 3814 // single-use add), we should put the negate there. 3815 return DAG.getNode(ISD::SUB, DL, VT, 3816 DAG.getConstant(0, DL, VT), 3817 DAG.getNode(ISD::SHL, DL, VT, N0, 3818 DAG.getConstant(Log2Val, DL, 3819 getShiftAmountTy(N0.getValueType())))); 3820 } 3821 3822 // Try to transform: 3823 // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub. 3824 // mul x, (2^N + 1) --> add (shl x, N), x 3825 // mul x, (2^N - 1) --> sub (shl x, N), x 3826 // Examples: x * 33 --> (x << 5) + x 3827 // x * 15 --> (x << 4) - x 3828 // x * -33 --> -((x << 5) + x) 3829 // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4) 3830 // (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub. 3831 // mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M)) 3832 // mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M)) 3833 // Examples: x * 0x8800 --> (x << 15) + (x << 11) 3834 // x * 0xf800 --> (x << 16) - (x << 11) 3835 // x * -0x8800 --> -((x << 15) + (x << 11)) 3836 // x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16) 3837 if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) { 3838 // TODO: We could handle more general decomposition of any constant by 3839 // having the target set a limit on number of ops and making a 3840 // callback to determine that sequence (similar to sqrt expansion). 3841 unsigned MathOp = ISD::DELETED_NODE; 3842 APInt MulC = ConstValue1.abs(); 3843 // The constant `2` should be treated as (2^0 + 1). 3844 unsigned TZeros = MulC == 2 ? 0 : MulC.countTrailingZeros(); 3845 MulC.lshrInPlace(TZeros); 3846 if ((MulC - 1).isPowerOf2()) 3847 MathOp = ISD::ADD; 3848 else if ((MulC + 1).isPowerOf2()) 3849 MathOp = ISD::SUB; 3850 3851 if (MathOp != ISD::DELETED_NODE) { 3852 unsigned ShAmt = 3853 MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2(); 3854 ShAmt += TZeros; 3855 assert(ShAmt < VT.getScalarSizeInBits() && 3856 "multiply-by-constant generated out of bounds shift"); 3857 SDLoc DL(N); 3858 SDValue Shl = 3859 DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT)); 3860 SDValue R = 3861 TZeros ? 
DAG.getNode(MathOp, DL, VT, Shl,
3862 DAG.getNode(ISD::SHL, DL, VT, N0,
3863 DAG.getConstant(TZeros, DL, VT)))
3864 : DAG.getNode(MathOp, DL, VT, Shl, N0);
3865 if (ConstValue1.isNegative())
3866 R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
3867 return R;
3868 }
3869 }
3870
3871 // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
3872 if (N0.getOpcode() == ISD::SHL &&
3873 isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
3874 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
3875 SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
3876 if (isConstantOrConstantVector(C3))
3877 return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
3878 }
3879
3880 // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
3881 // use.
3882 {
3883 SDValue Sh(nullptr, 0), Y(nullptr, 0);
3884
3885 // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
3886 if (N0.getOpcode() == ISD::SHL &&
3887 isConstantOrConstantVector(N0.getOperand(1)) &&
3888 N0.getNode()->hasOneUse()) {
3889 Sh = N0; Y = N1;
3890 } else if (N1.getOpcode() == ISD::SHL &&
3891 isConstantOrConstantVector(N1.getOperand(1)) &&
3892 N1.getNode()->hasOneUse()) {
3893 Sh = N1; Y = N0;
3894 }
3895
3896 if (Sh.getNode()) {
3897 SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
3898 return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
3899 }
3900 }
3901
3902 // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
3903 if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
3904 N0.getOpcode() == ISD::ADD &&
3905 DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
3906 isMulAddWithConstProfitable(N, N0, N1))
3907 return DAG.getNode(ISD::ADD, SDLoc(N), VT,
3908 DAG.getNode(ISD::MUL, SDLoc(N0), VT,
3909 N0.getOperand(0), N1),
3910 DAG.getNode(ISD::MUL, SDLoc(N1), VT,
3911 N0.getOperand(1), N1));
3912
3913 // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
3914 if (N0.getOpcode() == ISD::VSCALE)
3915 if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
3916 const APInt &C0 = N0.getConstantOperandAPInt(0);
3917 const APInt &C1 = NC1->getAPIntValue();
3918 return DAG.getVScale(SDLoc(N), VT, C0 * C1);
3919 }
3920
3921 // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)).
3922 APInt MulVal;
3923 if (N0.getOpcode() == ISD::STEP_VECTOR)
3924 if (ISD::isConstantSplatVector(N1.getNode(), MulVal)) {
3925 const APInt &C0 = N0.getConstantOperandAPInt(0);
3926 EVT SVT = N0.getOperand(0).getValueType();
3927 SDValue NewStep = DAG.getConstant(
3928 C0 * MulVal.sextOrTrunc(SVT.getSizeInBits()), SDLoc(N), SVT);
3929 return DAG.getStepVector(SDLoc(N), VT, NewStep);
3930 }
3931
3932 // Fold, lane by lane, ((mul x, 0/undef) -> 0,
3933 // (mul x, 1) -> x)
3934 // -> and(x, mask)
3935 // We can replace vectors with '0' and '1' factors with a clearing mask.
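// Illustrative example (not from the original source):
//   (mul <4 x i32> X, <i32 1, i32 0, i32 1, i32 0>)
//     --> (and X, <i32 -1, i32 0, i32 -1, i32 0>)
// Undef factors appear to be treated like 0, i.e. those lanes are cleared.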
3936 if (VT.isFixedLengthVector()) { 3937 unsigned NumElts = VT.getVectorNumElements(); 3938 SmallBitVector ClearMask; 3939 ClearMask.reserve(NumElts); 3940 auto IsClearMask = [&ClearMask](ConstantSDNode *V) { 3941 if (!V || V->isNullValue()) { 3942 ClearMask.push_back(true); 3943 return true; 3944 } 3945 ClearMask.push_back(false); 3946 return V->isOne(); 3947 }; 3948 if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::AND, VT)) && 3949 ISD::matchUnaryPredicate(N1, IsClearMask, /*AllowUndefs*/ true)) { 3950 assert(N1.getOpcode() == ISD::BUILD_VECTOR && "Unknown constant vector"); 3951 SDLoc DL(N); 3952 EVT LegalSVT = N1.getOperand(0).getValueType(); 3953 SDValue Zero = DAG.getConstant(0, DL, LegalSVT); 3954 SDValue AllOnes = DAG.getAllOnesConstant(DL, LegalSVT); 3955 SmallVector<SDValue, 16> Mask(NumElts, AllOnes); 3956 for (unsigned I = 0; I != NumElts; ++I) 3957 if (ClearMask[I]) 3958 Mask[I] = Zero; 3959 return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getBuildVector(VT, DL, Mask)); 3960 } 3961 } 3962 3963 // reassociate mul 3964 if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags())) 3965 return RMUL; 3966 3967 return SDValue(); 3968 } 3969 3970 /// Return true if divmod libcall is available. 3971 static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, 3972 const TargetLowering &TLI) { 3973 RTLIB::Libcall LC; 3974 EVT NodeType = Node->getValueType(0); 3975 if (!NodeType.isSimple()) 3976 return false; 3977 switch (NodeType.getSimpleVT().SimpleTy) { 3978 default: return false; // No libcall for vector types. 3979 case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 3980 case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 3981 case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 3982 case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 3983 case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; 3984 } 3985 3986 return TLI.getLibcallName(LC) != nullptr; 3987 } 3988 3989 /// Issue divrem if both quotient and remainder are needed. 3990 SDValue DAGCombiner::useDivRem(SDNode *Node) { 3991 if (Node->use_empty()) 3992 return SDValue(); // This is a dead node, leave it alone. 3993 3994 unsigned Opcode = Node->getOpcode(); 3995 bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM); 3996 unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; 3997 3998 // DivMod lib calls can still work on non-legal types if using lib-calls. 3999 EVT VT = Node->getValueType(0); 4000 if (VT.isVector() || !VT.isInteger()) 4001 return SDValue(); 4002 4003 if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT)) 4004 return SDValue(); 4005 4006 // If DIVREM is going to get expanded into a libcall, 4007 // but there is no libcall available, then don't combine. 4008 if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) && 4009 !isDivRemLibcallAvailable(Node, isSigned, TLI)) 4010 return SDValue(); 4011 4012 // If div is legal, it's better to do the normal expansion 4013 unsigned OtherOpcode = 0; 4014 if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) { 4015 OtherOpcode = isSigned ? ISD::SREM : ISD::UREM; 4016 if (TLI.isOperationLegalOrCustom(Opcode, VT)) 4017 return SDValue(); 4018 } else { 4019 OtherOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 4020 if (TLI.isOperationLegalOrCustom(OtherOpcode, VT)) 4021 return SDValue(); 4022 } 4023 4024 SDValue Op0 = Node->getOperand(0); 4025 SDValue Op1 = Node->getOperand(1); 4026 SDValue combined; 4027 for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), 4028 UE = Op0.getNode()->use_end(); UI != UE; ++UI) { 4029 SDNode *User = *UI; 4030 if (User == Node || User->getOpcode() == ISD::DELETED_NODE || 4031 User->use_empty()) 4032 continue; 4033 // Convert the other matching node(s), too; 4034 // otherwise, the DIVREM may get target-legalized into something 4035 // target-specific that we won't be able to recognize. 4036 unsigned UserOpc = User->getOpcode(); 4037 if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) && 4038 User->getOperand(0) == Op0 && 4039 User->getOperand(1) == Op1) { 4040 if (!combined) { 4041 if (UserOpc == OtherOpcode) { 4042 SDVTList VTs = DAG.getVTList(VT, VT); 4043 combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1); 4044 } else if (UserOpc == DivRemOpc) { 4045 combined = SDValue(User, 0); 4046 } else { 4047 assert(UserOpc == Opcode); 4048 continue; 4049 } 4050 } 4051 if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV) 4052 CombineTo(User, combined); 4053 else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM) 4054 CombineTo(User, combined.getValue(1)); 4055 } 4056 } 4057 return combined; 4058 } 4059 4060 static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { 4061 SDValue N0 = N->getOperand(0); 4062 SDValue N1 = N->getOperand(1); 4063 EVT VT = N->getValueType(0); 4064 SDLoc DL(N); 4065 4066 unsigned Opc = N->getOpcode(); 4067 bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc); 4068 ConstantSDNode *N1C = isConstOrConstSplat(N1); 4069 4070 // X / undef -> undef 4071 // X % undef -> undef 4072 // X / 0 -> undef 4073 // X % 0 -> undef 4074 // NOTE: This includes vectors where any divisor element is zero/undef. 4075 if (DAG.isUndef(Opc, {N0, N1})) 4076 return DAG.getUNDEF(VT); 4077 4078 // undef / X -> 0 4079 // undef % X -> 0 4080 if (N0.isUndef()) 4081 return DAG.getConstant(0, DL, VT); 4082 4083 // 0 / X -> 0 4084 // 0 % X -> 0 4085 ConstantSDNode *N0C = isConstOrConstSplat(N0); 4086 if (N0C && N0C->isNullValue()) 4087 return N0; 4088 4089 // X / X -> 1 4090 // X % X -> 0 4091 if (N0 == N1) 4092 return DAG.getConstant(IsDiv ? 1 : 0, DL, VT); 4093 4094 // X / 1 -> X 4095 // X % 1 -> 0 4096 // If this is a boolean op (single-bit element type), we can't have 4097 // division-by-zero or remainder-by-zero, so assume the divisor is 1. 4098 // TODO: Similarly, if we're zero-extending a boolean divisor, then assume 4099 // it's a 1. 4100 if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1)) 4101 return IsDiv ? 
N0 : DAG.getConstant(0, DL, VT); 4102 4103 return SDValue(); 4104 } 4105 4106 SDValue DAGCombiner::visitSDIV(SDNode *N) { 4107 SDValue N0 = N->getOperand(0); 4108 SDValue N1 = N->getOperand(1); 4109 EVT VT = N->getValueType(0); 4110 EVT CCVT = getSetCCResultType(VT); 4111 4112 // fold vector ops 4113 if (VT.isVector()) 4114 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 4115 return FoldedVOp; 4116 4117 SDLoc DL(N); 4118 4119 // fold (sdiv c1, c2) -> c1/c2 4120 ConstantSDNode *N1C = isConstOrConstSplat(N1); 4121 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1})) 4122 return C; 4123 4124 // fold (sdiv X, -1) -> 0-X 4125 if (N1C && N1C->isAllOnesValue()) 4126 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); 4127 4128 // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0) 4129 if (N1C && N1C->getAPIntValue().isMinSignedValue()) 4130 return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), 4131 DAG.getConstant(1, DL, VT), 4132 DAG.getConstant(0, DL, VT)); 4133 4134 if (SDValue V = simplifyDivRem(N, DAG)) 4135 return V; 4136 4137 if (SDValue NewSel = foldBinOpIntoSelect(N)) 4138 return NewSel; 4139 4140 // If we know the sign bits of both operands are zero, strength reduce to a 4141 // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 4142 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) 4143 return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1); 4144 4145 if (SDValue V = visitSDIVLike(N0, N1, N)) { 4146 // If the corresponding remainder node exists, update its users with 4147 // (Dividend - (Quotient * Divisor). 4148 if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(), 4149 { N0, N1 })) { 4150 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1); 4151 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul); 4152 AddToWorklist(Mul.getNode()); 4153 AddToWorklist(Sub.getNode()); 4154 CombineTo(RemNode, Sub); 4155 } 4156 return V; 4157 } 4158 4159 // sdiv, srem -> sdivrem 4160 // If the divisor is constant, then return DIVREM only if isIntDivCheap() is 4161 // true. Otherwise, we break the simplification logic in visitREM(). 4162 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 4163 if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) 4164 if (SDValue DivRem = useDivRem(N)) 4165 return DivRem; 4166 4167 return SDValue(); 4168 } 4169 4170 SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { 4171 SDLoc DL(N); 4172 EVT VT = N->getValueType(0); 4173 EVT CCVT = getSetCCResultType(VT); 4174 unsigned BitWidth = VT.getScalarSizeInBits(); 4175 4176 // Helper for determining whether a value is a power-2 constant scalar or a 4177 // vector of such elements. 4178 auto IsPowerOfTwo = [](ConstantSDNode *C) { 4179 if (C->isNullValue() || C->isOpaque()) 4180 return false; 4181 if (C->getAPIntValue().isPowerOf2()) 4182 return true; 4183 if ((-C->getAPIntValue()).isPowerOf2()) 4184 return true; 4185 return false; 4186 }; 4187 4188 // fold (sdiv X, pow2) -> simple ops after legalize 4189 // FIXME: We check for the exact bit here because the generic lowering gives 4190 // better results in that case. The target-specific lowering should learn how 4191 // to handle exact sdivs efficiently. 4192 if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) { 4193 // Target-specific implementation of sdiv x, pow2. 4194 if (SDValue Res = BuildSDIVPow2(N)) 4195 return Res; 4196 4197 // Create constants that are functions of the shift amount value. 
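// Worked example (illustrative, not from the original source): for an i32
// (sdiv X, 8) the code below computes C1 = cttz(8) = 3 and Inexact = 32 - 3 = 29,
//   Sign = X >>s 31          ; 0 for X >= 0, -1 for X < 0
//   Srl  = Sign >>u 29       ; 0 or 7, i.e. (|divisor| - 1) only when X < 0
//   Sra  = (X + Srl) >>s 3   ; biased shift that rounds toward zero
// and the selects afterwards handle divisors of 1/-1 and negate the result
// for negative divisors.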
4198 EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType()); 4199 SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy); 4200 SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1); 4201 C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy); 4202 SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1); 4203 if (!isConstantOrConstantVector(Inexact)) 4204 return SDValue(); 4205 4206 // Splat the sign bit into the register 4207 SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0, 4208 DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy)); 4209 AddToWorklist(Sign.getNode()); 4210 4211 // Add (N0 < 0) ? abs2 - 1 : 0; 4212 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact); 4213 AddToWorklist(Srl.getNode()); 4214 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl); 4215 AddToWorklist(Add.getNode()); 4216 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1); 4217 AddToWorklist(Sra.getNode()); 4218 4219 // Special case: (sdiv X, 1) -> X 4220 // Special Case: (sdiv X, -1) -> 0-X 4221 SDValue One = DAG.getConstant(1, DL, VT); 4222 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); 4223 SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ); 4224 SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ); 4225 SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes); 4226 Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra); 4227 4228 // If dividing by a positive value, we're done. Otherwise, the result must 4229 // be negated. 4230 SDValue Zero = DAG.getConstant(0, DL, VT); 4231 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra); 4232 4233 // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding. 4234 SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT); 4235 SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra); 4236 return Res; 4237 } 4238 4239 // If integer divide is expensive and we satisfy the requirements, emit an 4240 // alternate sequence. Targets may check function attributes for size/speed 4241 // trade-offs. 4242 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 4243 if (isConstantOrConstantVector(N1) && 4244 !TLI.isIntDivCheap(N->getValueType(0), Attr)) 4245 if (SDValue Op = BuildSDIV(N)) 4246 return Op; 4247 4248 return SDValue(); 4249 } 4250 4251 SDValue DAGCombiner::visitUDIV(SDNode *N) { 4252 SDValue N0 = N->getOperand(0); 4253 SDValue N1 = N->getOperand(1); 4254 EVT VT = N->getValueType(0); 4255 EVT CCVT = getSetCCResultType(VT); 4256 4257 // fold vector ops 4258 if (VT.isVector()) 4259 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 4260 return FoldedVOp; 4261 4262 SDLoc DL(N); 4263 4264 // fold (udiv c1, c2) -> c1/c2 4265 ConstantSDNode *N1C = isConstOrConstSplat(N1); 4266 if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1})) 4267 return C; 4268 4269 // fold (udiv X, -1) -> select(X == -1, 1, 0) 4270 if (N1C && N1C->getAPIntValue().isAllOnesValue()) 4271 return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), 4272 DAG.getConstant(1, DL, VT), 4273 DAG.getConstant(0, DL, VT)); 4274 4275 if (SDValue V = simplifyDivRem(N, DAG)) 4276 return V; 4277 4278 if (SDValue NewSel = foldBinOpIntoSelect(N)) 4279 return NewSel; 4280 4281 if (SDValue V = visitUDIVLike(N0, N1, N)) { 4282 // If the corresponding remainder node exists, update its users with 4283 // (Dividend - (Quotient * Divisor). 
4284 if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(), 4285 { N0, N1 })) { 4286 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1); 4287 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul); 4288 AddToWorklist(Mul.getNode()); 4289 AddToWorklist(Sub.getNode()); 4290 CombineTo(RemNode, Sub); 4291 } 4292 return V; 4293 } 4294 4295 // sdiv, srem -> sdivrem 4296 // If the divisor is constant, then return DIVREM only if isIntDivCheap() is 4297 // true. Otherwise, we break the simplification logic in visitREM(). 4298 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 4299 if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) 4300 if (SDValue DivRem = useDivRem(N)) 4301 return DivRem; 4302 4303 return SDValue(); 4304 } 4305 4306 SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) { 4307 SDLoc DL(N); 4308 EVT VT = N->getValueType(0); 4309 4310 // fold (udiv x, (1 << c)) -> x >>u c 4311 if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && 4312 DAG.isKnownToBeAPowerOfTwo(N1)) { 4313 SDValue LogBase2 = BuildLogBase2(N1, DL); 4314 AddToWorklist(LogBase2.getNode()); 4315 4316 EVT ShiftVT = getShiftAmountTy(N0.getValueType()); 4317 SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); 4318 AddToWorklist(Trunc.getNode()); 4319 return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); 4320 } 4321 4322 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 4323 if (N1.getOpcode() == ISD::SHL) { 4324 SDValue N10 = N1.getOperand(0); 4325 if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) && 4326 DAG.isKnownToBeAPowerOfTwo(N10)) { 4327 SDValue LogBase2 = BuildLogBase2(N10, DL); 4328 AddToWorklist(LogBase2.getNode()); 4329 4330 EVT ADDVT = N1.getOperand(1).getValueType(); 4331 SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT); 4332 AddToWorklist(Trunc.getNode()); 4333 SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc); 4334 AddToWorklist(Add.getNode()); 4335 return DAG.getNode(ISD::SRL, DL, VT, N0, Add); 4336 } 4337 } 4338 4339 // fold (udiv x, c) -> alternate 4340 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 4341 if (isConstantOrConstantVector(N1) && 4342 !TLI.isIntDivCheap(N->getValueType(0), Attr)) 4343 if (SDValue Op = BuildUDIV(N)) 4344 return Op; 4345 4346 return SDValue(); 4347 } 4348 4349 // handles ISD::SREM and ISD::UREM 4350 SDValue DAGCombiner::visitREM(SDNode *N) { 4351 unsigned Opcode = N->getOpcode(); 4352 SDValue N0 = N->getOperand(0); 4353 SDValue N1 = N->getOperand(1); 4354 EVT VT = N->getValueType(0); 4355 EVT CCVT = getSetCCResultType(VT); 4356 4357 bool isSigned = (Opcode == ISD::SREM); 4358 SDLoc DL(N); 4359 4360 // fold (rem c1, c2) -> c1%c2 4361 ConstantSDNode *N1C = isConstOrConstSplat(N1); 4362 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) 4363 return C; 4364 4365 // fold (urem X, -1) -> select(X == -1, 0, x) 4366 if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue()) 4367 return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), 4368 DAG.getConstant(0, DL, VT), N0); 4369 4370 if (SDValue V = simplifyDivRem(N, DAG)) 4371 return V; 4372 4373 if (SDValue NewSel = foldBinOpIntoSelect(N)) 4374 return NewSel; 4375 4376 if (isSigned) { 4377 // If we know the sign bits of both operands are zero, strength reduce to a 4378 // urem instead. 
Handles (X & 0x0FFFFFFF) %s 16 -> X&15 4379 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) 4380 return DAG.getNode(ISD::UREM, DL, VT, N0, N1); 4381 } else { 4382 if (DAG.isKnownToBeAPowerOfTwo(N1)) { 4383 // fold (urem x, pow2) -> (and x, pow2-1) 4384 SDValue NegOne = DAG.getAllOnesConstant(DL, VT); 4385 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); 4386 AddToWorklist(Add.getNode()); 4387 return DAG.getNode(ISD::AND, DL, VT, N0, Add); 4388 } 4389 if (N1.getOpcode() == ISD::SHL && 4390 DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { 4391 // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) 4392 SDValue NegOne = DAG.getAllOnesConstant(DL, VT); 4393 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); 4394 AddToWorklist(Add.getNode()); 4395 return DAG.getNode(ISD::AND, DL, VT, N0, Add); 4396 } 4397 } 4398 4399 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 4400 4401 // If X/C can be simplified by the division-by-constant logic, lower 4402 // X%C to the equivalent of X-X/C*C. 4403 // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the 4404 // speculative DIV must not cause a DIVREM conversion. We guard against this 4405 // by skipping the simplification if isIntDivCheap(). When div is not cheap, 4406 // combine will not return a DIVREM. Regardless, checking cheapness here 4407 // makes sense since the simplification results in fatter code. 4408 if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) { 4409 SDValue OptimizedDiv = 4410 isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N); 4411 if (OptimizedDiv.getNode()) { 4412 // If the equivalent Div node also exists, update its users. 4413 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; 4414 if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(), 4415 { N0, N1 })) 4416 CombineTo(DivNode, OptimizedDiv); 4417 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1); 4418 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul); 4419 AddToWorklist(OptimizedDiv.getNode()); 4420 AddToWorklist(Mul.getNode()); 4421 return Sub; 4422 } 4423 } 4424 4425 // sdiv, srem -> sdivrem 4426 if (SDValue DivRem = useDivRem(N)) 4427 return DivRem.getValue(1); 4428 4429 return SDValue(); 4430 } 4431 4432 SDValue DAGCombiner::visitMULHS(SDNode *N) { 4433 SDValue N0 = N->getOperand(0); 4434 SDValue N1 = N->getOperand(1); 4435 EVT VT = N->getValueType(0); 4436 SDLoc DL(N); 4437 4438 if (VT.isVector()) { 4439 // fold (mulhs x, 0) -> 0 4440 // do not return N0/N1, because undef node may exist. 4441 if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || 4442 ISD::isConstantSplatVectorAllZeros(N1.getNode())) 4443 return DAG.getConstant(0, DL, VT); 4444 } 4445 4446 // fold (mulhs x, 0) -> 0 4447 if (isNullConstant(N1)) 4448 return N1; 4449 // fold (mulhs x, 1) -> (sra x, size(x)-1) 4450 if (isOneConstant(N1)) 4451 return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0, 4452 DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL, 4453 getShiftAmountTy(N0.getValueType()))); 4454 4455 // fold (mulhs x, undef) -> 0 4456 if (N0.isUndef() || N1.isUndef()) 4457 return DAG.getConstant(0, DL, VT); 4458 4459 // If the type twice as wide is legal, transform the mulhs to a wider multiply 4460 // plus a shift. 
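// Illustrative example (not from the original source): for i16 this becomes
//   (mulhs a, b) --> trunc i16 (srl (mul (sext i32 a), (sext i32 b)), 16)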
4461 if (!TLI.isOperationLegalOrCustom(ISD::MULHS, VT) && VT.isSimple() && 4462 !VT.isVector()) { 4463 MVT Simple = VT.getSimpleVT(); 4464 unsigned SimpleSize = Simple.getSizeInBits(); 4465 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); 4466 if (TLI.isOperationLegal(ISD::MUL, NewVT)) { 4467 N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0); 4468 N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1); 4469 N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); 4470 N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, 4471 DAG.getConstant(SimpleSize, DL, 4472 getShiftAmountTy(N1.getValueType()))); 4473 return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); 4474 } 4475 } 4476 4477 return SDValue(); 4478 } 4479 4480 SDValue DAGCombiner::visitMULHU(SDNode *N) { 4481 SDValue N0 = N->getOperand(0); 4482 SDValue N1 = N->getOperand(1); 4483 EVT VT = N->getValueType(0); 4484 SDLoc DL(N); 4485 4486 if (VT.isVector()) { 4487 // fold (mulhu x, 0) -> 0 4488 // do not return N0/N1, because undef node may exist. 4489 if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || 4490 ISD::isConstantSplatVectorAllZeros(N1.getNode())) 4491 return DAG.getConstant(0, DL, VT); 4492 } 4493 4494 // fold (mulhu x, 0) -> 0 4495 if (isNullConstant(N1)) 4496 return N1; 4497 // fold (mulhu x, 1) -> 0 4498 if (isOneConstant(N1)) 4499 return DAG.getConstant(0, DL, N0.getValueType()); 4500 // fold (mulhu x, undef) -> 0 4501 if (N0.isUndef() || N1.isUndef()) 4502 return DAG.getConstant(0, DL, VT); 4503 4504 // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c) 4505 if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && 4506 DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) { 4507 unsigned NumEltBits = VT.getScalarSizeInBits(); 4508 SDValue LogBase2 = BuildLogBase2(N1, DL); 4509 SDValue SRLAmt = DAG.getNode( 4510 ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2); 4511 EVT ShiftVT = getShiftAmountTy(N0.getValueType()); 4512 SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT); 4513 return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); 4514 } 4515 4516 // If the type twice as wide is legal, transform the mulhu to a wider multiply 4517 // plus a shift. 4518 if (!TLI.isOperationLegalOrCustom(ISD::MULHU, VT) && VT.isSimple() && 4519 !VT.isVector()) { 4520 MVT Simple = VT.getSimpleVT(); 4521 unsigned SimpleSize = Simple.getSizeInBits(); 4522 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); 4523 if (TLI.isOperationLegal(ISD::MUL, NewVT)) { 4524 N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0); 4525 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1); 4526 N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); 4527 N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, 4528 DAG.getConstant(SimpleSize, DL, 4529 getShiftAmountTy(N1.getValueType()))); 4530 return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); 4531 } 4532 } 4533 4534 return SDValue(); 4535 } 4536 4537 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp 4538 /// give the opcodes for the two computations that are being performed. Return 4539 /// true if a simplification was made. 4540 SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, 4541 unsigned HiOp) { 4542 // If the high half is not needed, just compute the low half. 
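// For example (illustrative): an (smul_lohi x, y) whose high result is unused
// is rebuilt here as a plain (mul x, y), since visitSMUL_LOHI passes
// LoOp = ISD::MUL and HiOp = ISD::MULHS.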
4543 bool HiExists = N->hasAnyUseOfValue(1);
4544 if (!HiExists && (!LegalOperations ||
4545 TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
4546 SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
4547 return CombineTo(N, Res, Res);
4548 }
4549
4550 // If the low half is not needed, just compute the high half.
4551 bool LoExists = N->hasAnyUseOfValue(0);
4552 if (!LoExists && (!LegalOperations ||
4553 TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
4554 SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
4555 return CombineTo(N, Res, Res);
4556 }
4557
4558 // If both halves are used, return as it is.
4559 if (LoExists && HiExists)
4560 return SDValue();
4561
4562 // If the two computed results can be simplified separately, separate them.
4563 if (LoExists) {
4564 SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
4565 AddToWorklist(Lo.getNode());
4566 SDValue LoOpt = combine(Lo.getNode());
4567 if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
4568 (!LegalOperations ||
4569 TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
4570 return CombineTo(N, LoOpt, LoOpt);
4571 }
4572
4573 if (HiExists) {
4574 SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
4575 AddToWorklist(Hi.getNode());
4576 SDValue HiOpt = combine(Hi.getNode());
4577 if (HiOpt.getNode() && HiOpt != Hi &&
4578 (!LegalOperations ||
4579 TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
4580 return CombineTo(N, HiOpt, HiOpt);
4581 }
4582
4583 return SDValue();
4584 }
4585
4586 SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
4587 if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
4588 return Res;
4589
4590 EVT VT = N->getValueType(0);
4591 SDLoc DL(N);
4592
4593 // If the type twice as wide is legal, transform the smul_lohi to a wider
4594 // multiply plus a shift.
4595 if (VT.isSimple() && !VT.isVector()) {
4596 MVT Simple = VT.getSimpleVT();
4597 unsigned SimpleSize = Simple.getSizeInBits();
4598 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
4599 if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
4600 SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
4601 SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
4602 Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
4603 // Compute the high part as N1.
4604 Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
4605 DAG.getConstant(SimpleSize, DL,
4606 getShiftAmountTy(Lo.getValueType())));
4607 Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
4608 // Compute the low part as N0.
4609 Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
4610 return CombineTo(N, Lo, Hi);
4611 }
4612 }
4613
4614 return SDValue();
4615 }
4616
4617 SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
4618 if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
4619 return Res;
4620
4621 EVT VT = N->getValueType(0);
4622 SDLoc DL(N);
4623
4624 // (umul_lohi N0, 0) -> (0, 0)
4625 if (isNullConstant(N->getOperand(1))) {
4626 SDValue Zero = DAG.getConstant(0, DL, VT);
4627 return CombineTo(N, Zero, Zero);
4628 }
4629
4630 // (umul_lohi N0, 1) -> (N0, 0)
4631 if (isOneConstant(N->getOperand(1))) {
4632 SDValue Zero = DAG.getConstant(0, DL, VT);
4633 return CombineTo(N, N->getOperand(0), Zero);
4634 }
4635
4636 // If the type twice as wide is legal, transform the umul_lohi to a wider
4637 // multiply plus a shift.
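// Illustrative example (not from the original source): for i16 this becomes
//   lo = trunc i16 (mul (zext i32 a), (zext i32 b))
//   hi = trunc i16 (srl (mul (zext i32 a), (zext i32 b)), 16)
// with the wide multiply computed only once.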
4638 if (VT.isSimple() && !VT.isVector()) { 4639 MVT Simple = VT.getSimpleVT(); 4640 unsigned SimpleSize = Simple.getSizeInBits(); 4641 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); 4642 if (TLI.isOperationLegal(ISD::MUL, NewVT)) { 4643 SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0)); 4644 SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1)); 4645 Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); 4646 // Compute the high part as N1. 4647 Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, 4648 DAG.getConstant(SimpleSize, DL, 4649 getShiftAmountTy(Lo.getValueType()))); 4650 Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); 4651 // Compute the low part as N0. 4652 Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); 4653 return CombineTo(N, Lo, Hi); 4654 } 4655 } 4656 4657 return SDValue(); 4658 } 4659 4660 SDValue DAGCombiner::visitMULO(SDNode *N) { 4661 SDValue N0 = N->getOperand(0); 4662 SDValue N1 = N->getOperand(1); 4663 EVT VT = N0.getValueType(); 4664 bool IsSigned = (ISD::SMULO == N->getOpcode()); 4665 4666 EVT CarryVT = N->getValueType(1); 4667 SDLoc DL(N); 4668 4669 ConstantSDNode *N0C = isConstOrConstSplat(N0); 4670 ConstantSDNode *N1C = isConstOrConstSplat(N1); 4671 4672 // fold operation with constant operands. 4673 // TODO: Move this to FoldConstantArithmetic when it supports nodes with 4674 // multiple results. 4675 if (N0C && N1C) { 4676 bool Overflow; 4677 APInt Result = 4678 IsSigned ? N0C->getAPIntValue().smul_ov(N1C->getAPIntValue(), Overflow) 4679 : N0C->getAPIntValue().umul_ov(N1C->getAPIntValue(), Overflow); 4680 return CombineTo(N, DAG.getConstant(Result, DL, VT), 4681 DAG.getBoolConstant(Overflow, DL, CarryVT, CarryVT)); 4682 } 4683 4684 // canonicalize constant to RHS. 4685 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 4686 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 4687 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0); 4688 4689 // fold (mulo x, 0) -> 0 + no carry out 4690 if (isNullOrNullSplat(N1)) 4691 return CombineTo(N, DAG.getConstant(0, DL, VT), 4692 DAG.getConstant(0, DL, CarryVT)); 4693 4694 // (mulo x, 2) -> (addo x, x) 4695 if (N1C && N1C->getAPIntValue() == 2) 4696 return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL, 4697 N->getVTList(), N0, N0); 4698 4699 if (IsSigned) { 4700 // A 1 bit SMULO overflows if both inputs are 1. 4701 if (VT.getScalarSizeInBits() == 1) { 4702 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, N1); 4703 return CombineTo(N, And, 4704 DAG.getSetCC(DL, CarryVT, And, 4705 DAG.getConstant(0, DL, VT), ISD::SETNE)); 4706 } 4707 4708 // Multiplying n * m significant bits yields a result of n + m significant 4709 // bits. If the total number of significant bits does not exceed the 4710 // result bit width (minus 1), there is no overflow. 
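// For example, with i32 operands that have 18 and 17 known sign bits
// (35 > 32 + 1), the product fits in a signed i32 and SMULO cannot overflow.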
4711 unsigned SignBits = DAG.ComputeNumSignBits(N0);
4712 if (SignBits > 1)
4713 SignBits += DAG.ComputeNumSignBits(N1);
4714 if (SignBits > VT.getScalarSizeInBits() + 1)
4715 return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
4716 DAG.getConstant(0, DL, CarryVT));
4717 } else {
4718 KnownBits N1Known = DAG.computeKnownBits(N1);
4719 KnownBits N0Known = DAG.computeKnownBits(N0);
4720 bool Overflow;
4721 (void)N0Known.getMaxValue().umul_ov(N1Known.getMaxValue(), Overflow);
4722 if (!Overflow)
4723 return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
4724 DAG.getConstant(0, DL, CarryVT));
4725 }
4726
4727 return SDValue();
4728 }
4729
4730 SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
4731 SDValue N0 = N->getOperand(0);
4732 SDValue N1 = N->getOperand(1);
4733 EVT VT = N0.getValueType();
4734 unsigned Opcode = N->getOpcode();
4735
4736 // fold vector ops
4737 if (VT.isVector())
4738 if (SDValue FoldedVOp = SimplifyVBinOp(N))
4739 return FoldedVOp;
4740
4741 // fold operation with constant operands.
4742 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1}))
4743 return C;
4744
4745 // canonicalize constant to RHS
4746 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
4747 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
4748 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
4749
4750 // If sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
4751 // Only do this if the current op isn't legal and the flipped is.
4752 if (!TLI.isOperationLegal(Opcode, VT) &&
4753 (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
4754 (N1.isUndef() || DAG.SignBitIsZero(N1))) {
4755 unsigned AltOpcode;
4756 switch (Opcode) {
4757 case ISD::SMIN: AltOpcode = ISD::UMIN; break;
4758 case ISD::SMAX: AltOpcode = ISD::UMAX; break;
4759 case ISD::UMIN: AltOpcode = ISD::SMIN; break;
4760 case ISD::UMAX: AltOpcode = ISD::SMAX; break;
4761 default: llvm_unreachable("Unknown MINMAX opcode");
4762 }
4763 if (TLI.isOperationLegal(AltOpcode, VT))
4764 return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1);
4765 }
4766
4767 // Simplify the operands using demanded-bits information.
4768 if (SimplifyDemandedBits(SDValue(N, 0)))
4769 return SDValue(N, 0);
4770
4771 return SDValue();
4772 }
4773
4774 /// If this is a bitwise logic instruction and both operands have the same
4775 /// opcode, try to sink the other opcode after the logic instruction.
4776 SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
4777 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
4778 EVT VT = N0.getValueType();
4779 unsigned LogicOpcode = N->getOpcode();
4780 unsigned HandOpcode = N0.getOpcode();
4781 assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
4782 LogicOpcode == ISD::XOR) && "Expected logic opcode");
4783 assert(HandOpcode == N1.getOpcode() && "Bad input!");
4784
4785 // Bail early if none of these transforms apply.
4786 if (N0.getNumOperands() == 0)
4787 return SDValue();
4788
4789 // FIXME: We should check number of uses of the operands to not increase
4790 // the instruction count for all transforms.
4791
4792 // Handle size-changing casts.
4793 SDValue X = N0.getOperand(0);
4794 SDValue Y = N1.getOperand(0);
4795 EVT XVT = X.getValueType();
4796 SDLoc DL(N);
4797 if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
4798 HandOpcode == ISD::SIGN_EXTEND) {
4799 // If both operands have other uses, this transform would create extra
4800 // instructions without eliminating anything.
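// Otherwise this rewrites, e.g., (and (zext i8 X), (zext i8 Y)) into
// (zext (and i8 X, Y)), doing the logic op in the narrower source type.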
4801 if (!N0.hasOneUse() && !N1.hasOneUse())
4802 return SDValue();
4803 // We need matching integer source types.
4804 if (XVT != Y.getValueType())
4805 return SDValue();
4806 // Don't create an illegal op during or after legalization. Don't ever
4807 // create an unsupported vector op.
4808 if ((VT.isVector() || LegalOperations) &&
4809 !TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
4810 return SDValue();
4811 // Avoid infinite looping with PromoteIntBinOp.
4812 // TODO: Should we apply desirable/legal constraints to all opcodes?
4813 if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
4814 !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
4815 return SDValue();
4816 // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
4817 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
4818 return DAG.getNode(HandOpcode, DL, VT, Logic);
4819 }
4820
4821 // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
4822 if (HandOpcode == ISD::TRUNCATE) {
4823 // If both operands have other uses, this transform would create extra
4824 // instructions without eliminating anything.
4825 if (!N0.hasOneUse() && !N1.hasOneUse())
4826 return SDValue();
4827 // We need matching source types.
4828 if (XVT != Y.getValueType())
4829 return SDValue();
4830 // Don't create an illegal op during or after legalization.
4831 if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
4832 return SDValue();
4833 // Be extra careful sinking truncate. If it's free, there's no benefit in
4834 // widening a binop. Also, don't create a logic op on an illegal type.
4835 if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
4836 return SDValue();
4837 if (!TLI.isTypeLegal(XVT))
4838 return SDValue();
4839 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
4840 return DAG.getNode(HandOpcode, DL, VT, Logic);
4841 }
4842
4843 // For binops SHL/SRL/SRA/AND:
4844 // logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
4845 if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
4846 HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
4847 N0.getOperand(1) == N1.getOperand(1)) {
4848 // If either operand has other uses, this transform is not an improvement.
4849 if (!N0.hasOneUse() || !N1.hasOneUse())
4850 return SDValue();
4851 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
4852 return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
4853 }
4854
4855 // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
4856 if (HandOpcode == ISD::BSWAP) {
4857 // If either operand has other uses, this transform is not an improvement.
4858 if (!N0.hasOneUse() || !N1.hasOneUse())
4859 return SDValue();
4860 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
4861 return DAG.getNode(HandOpcode, DL, VT, Logic);
4862 }
4863
4864 // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
4865 // Only perform this optimization up until type legalization, before
4866 // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
4867 // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
4868 // we don't want to undo this promotion.
4869 // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
4870 // on scalars.
4871 if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
4872 Level <= AfterLegalizeTypes) {
4873 // Input types must be integer and the same.
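// e.g. (and (scalar_to_vector i32 X), (scalar_to_vector i32 Y)) -->
// (scalar_to_vector (and i32 X, Y)), performing the logic op on the scalars.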
4874 if (XVT.isInteger() && XVT == Y.getValueType() && 4875 !(VT.isVector() && TLI.isTypeLegal(VT) && 4876 !XVT.isVector() && !TLI.isTypeLegal(XVT))) { 4877 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); 4878 return DAG.getNode(HandOpcode, DL, VT, Logic); 4879 } 4880 } 4881 4882 // Xor/and/or are indifferent to the swizzle operation (shuffle of one value). 4883 // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B)) 4884 // If both shuffles use the same mask, and both shuffle within a single 4885 // vector, then it is worthwhile to move the swizzle after the operation. 4886 // The type-legalizer generates this pattern when loading illegal 4887 // vector types from memory. In many cases this allows additional shuffle 4888 // optimizations. 4889 // There are other cases where moving the shuffle after the xor/and/or 4890 // is profitable even if shuffles don't perform a swizzle. 4891 // If both shuffles use the same mask, and both shuffles have the same first 4892 // or second operand, then it might still be profitable to move the shuffle 4893 // after the xor/and/or operation. 4894 if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) { 4895 auto *SVN0 = cast<ShuffleVectorSDNode>(N0); 4896 auto *SVN1 = cast<ShuffleVectorSDNode>(N1); 4897 assert(X.getValueType() == Y.getValueType() && 4898 "Inputs to shuffles are not the same type"); 4899 4900 // Check that both shuffles use the same mask. The masks are known to be of 4901 // the same length because the result vector type is the same. 4902 // Check also that shuffles have only one use to avoid introducing extra 4903 // instructions. 4904 if (!SVN0->hasOneUse() || !SVN1->hasOneUse() || 4905 !SVN0->getMask().equals(SVN1->getMask())) 4906 return SDValue(); 4907 4908 // Don't try to fold this node if it requires introducing a 4909 // build vector of all zeros that might be illegal at this stage. 4910 SDValue ShOp = N0.getOperand(1); 4911 if (LogicOpcode == ISD::XOR && !ShOp.isUndef()) 4912 ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); 4913 4914 // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C) 4915 if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { 4916 SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, 4917 N0.getOperand(0), N1.getOperand(0)); 4918 return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask()); 4919 } 4920 4921 // Don't try to fold this node if it requires introducing a 4922 // build vector of all zeros that might be illegal at this stage. 4923 ShOp = N0.getOperand(0); 4924 if (LogicOpcode == ISD::XOR && !ShOp.isUndef()) 4925 ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); 4926 4927 // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B)) 4928 if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) { 4929 SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1), 4930 N1.getOperand(1)); 4931 return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask()); 4932 } 4933 } 4934 4935 return SDValue(); 4936 } 4937 4938 /// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient. 
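/// For example, (and (seteq X, 0), (seteq Y, 0)) can be rewritten as a single
/// compare: (seteq (or X, Y), 0).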
4939 SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, 4940 const SDLoc &DL) { 4941 SDValue LL, LR, RL, RR, N0CC, N1CC; 4942 if (!isSetCCEquivalent(N0, LL, LR, N0CC) || 4943 !isSetCCEquivalent(N1, RL, RR, N1CC)) 4944 return SDValue(); 4945 4946 assert(N0.getValueType() == N1.getValueType() && 4947 "Unexpected operand types for bitwise logic op"); 4948 assert(LL.getValueType() == LR.getValueType() && 4949 RL.getValueType() == RR.getValueType() && 4950 "Unexpected operand types for setcc"); 4951 4952 // If we're here post-legalization or the logic op type is not i1, the logic 4953 // op type must match a setcc result type. Also, all folds require new 4954 // operations on the left and right operands, so those types must match. 4955 EVT VT = N0.getValueType(); 4956 EVT OpVT = LL.getValueType(); 4957 if (LegalOperations || VT.getScalarType() != MVT::i1) 4958 if (VT != getSetCCResultType(OpVT)) 4959 return SDValue(); 4960 if (OpVT != RL.getValueType()) 4961 return SDValue(); 4962 4963 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get(); 4964 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get(); 4965 bool IsInteger = OpVT.isInteger(); 4966 if (LR == RR && CC0 == CC1 && IsInteger) { 4967 bool IsZero = isNullOrNullSplat(LR); 4968 bool IsNeg1 = isAllOnesOrAllOnesSplat(LR); 4969 4970 // All bits clear? 4971 bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; 4972 // All sign bits clear? 4973 bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1; 4974 // Any bits set? 4975 bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero; 4976 // Any sign bits set? 4977 bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero; 4978 4979 // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0) 4980 // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1) 4981 // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0) 4982 // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0) 4983 if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) { 4984 SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL); 4985 AddToWorklist(Or.getNode()); 4986 return DAG.getSetCC(DL, VT, Or, LR, CC1); 4987 } 4988 4989 // All bits set? 4990 bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1; 4991 // All sign bits set? 4992 bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero; 4993 // Any bits clear? 4994 bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1; 4995 // Any sign bits clear? 4996 bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1; 4997 4998 // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1) 4999 // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0) 5000 // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1) 5001 // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1) 5002 if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) { 5003 SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL); 5004 AddToWorklist(And.getNode()); 5005 return DAG.getSetCC(DL, VT, And, LR, CC1); 5006 } 5007 } 5008 5009 // TODO: What is the 'or' equivalent of this fold? 
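// The next fold checks that X is neither 0 nor -1 with one unsigned compare:
// adding 1 maps 0 to 1 and -1 to 0, so every other value becomes >= 2.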
5010 // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) 5011 if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 && 5012 IsInteger && CC0 == ISD::SETNE && 5013 ((isNullConstant(LR) && isAllOnesConstant(RR)) || 5014 (isAllOnesConstant(LR) && isNullConstant(RR)))) { 5015 SDValue One = DAG.getConstant(1, DL, OpVT); 5016 SDValue Two = DAG.getConstant(2, DL, OpVT); 5017 SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One); 5018 AddToWorklist(Add.getNode()); 5019 return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE); 5020 } 5021 5022 // Try more general transforms if the predicates match and the only user of 5023 // the compares is the 'and' or 'or'. 5024 if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 && 5025 N0.hasOneUse() && N1.hasOneUse()) { 5026 // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 5027 // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0 5028 if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) { 5029 SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR); 5030 SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR); 5031 SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR); 5032 SDValue Zero = DAG.getConstant(0, DL, OpVT); 5033 return DAG.getSetCC(DL, VT, Or, Zero, CC1); 5034 } 5035 5036 // Turn compare of constants whose difference is 1 bit into add+and+setcc. 5037 // TODO - support non-uniform vector amounts. 5038 if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) { 5039 // Match a shared variable operand and 2 non-opaque constant operands. 5040 ConstantSDNode *C0 = isConstOrConstSplat(LR); 5041 ConstantSDNode *C1 = isConstOrConstSplat(RR); 5042 if (LL == RL && C0 && C1 && !C0->isOpaque() && !C1->isOpaque()) { 5043 const APInt &CMax = 5044 APIntOps::umax(C0->getAPIntValue(), C1->getAPIntValue()); 5045 const APInt &CMin = 5046 APIntOps::umin(C0->getAPIntValue(), C1->getAPIntValue()); 5047 // The difference of the constants must be a single bit. 5048 if ((CMax - CMin).isPowerOf2()) { 5049 // and/or (setcc X, CMax, ne), (setcc X, CMin, ne/eq) --> 5050 // setcc ((sub X, CMin), ~(CMax - CMin)), 0, ne/eq 5051 SDValue Max = DAG.getNode(ISD::UMAX, DL, OpVT, LR, RR); 5052 SDValue Min = DAG.getNode(ISD::UMIN, DL, OpVT, LR, RR); 5053 SDValue Offset = DAG.getNode(ISD::SUB, DL, OpVT, LL, Min); 5054 SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, Max, Min); 5055 SDValue Mask = DAG.getNOT(DL, Diff, OpVT); 5056 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Offset, Mask); 5057 SDValue Zero = DAG.getConstant(0, DL, OpVT); 5058 return DAG.getSetCC(DL, VT, And, Zero, CC0); 5059 } 5060 } 5061 } 5062 } 5063 5064 // Canonicalize equivalent operands to LL == RL. 5065 if (LL == RR && LR == RL) { 5066 CC1 = ISD::getSetCCSwappedOperands(CC1); 5067 std::swap(RL, RR); 5068 } 5069 5070 // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) 5071 // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) 5072 if (LL == RL && LR == RR) { 5073 ISD::CondCode NewCC = IsAnd ? 
ISD::getSetCCAndOperation(CC0, CC1, OpVT) 5074 : ISD::getSetCCOrOperation(CC0, CC1, OpVT); 5075 if (NewCC != ISD::SETCC_INVALID && 5076 (!LegalOperations || 5077 (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && 5078 TLI.isOperationLegal(ISD::SETCC, OpVT)))) 5079 return DAG.getSetCC(DL, VT, LL, LR, NewCC); 5080 } 5081 5082 return SDValue(); 5083 } 5084 5085 /// This contains all DAGCombine rules which reduce two values combined by 5086 /// an And operation to a single value. This makes them reusable in the context 5087 /// of visitSELECT(). Rules involving constants are not included as 5088 /// visitSELECT() already handles those cases. 5089 SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) { 5090 EVT VT = N1.getValueType(); 5091 SDLoc DL(N); 5092 5093 // fold (and x, undef) -> 0 5094 if (N0.isUndef() || N1.isUndef()) 5095 return DAG.getConstant(0, DL, VT); 5096 5097 if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL)) 5098 return V; 5099 5100 if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && 5101 VT.getSizeInBits() <= 64) { 5102 if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { 5103 if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) { 5104 // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal 5105 // immediate for an add, but it is legal if its top c2 bits are set, 5106 // transform the ADD so the immediate doesn't need to be materialized 5107 // in a register. 5108 APInt ADDC = ADDI->getAPIntValue(); 5109 APInt SRLC = SRLI->getAPIntValue(); 5110 if (ADDC.getMinSignedBits() <= 64 && 5111 SRLC.ult(VT.getSizeInBits()) && 5112 !TLI.isLegalAddImmediate(ADDC.getSExtValue())) { 5113 APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), 5114 SRLC.getZExtValue()); 5115 if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { 5116 ADDC |= Mask; 5117 if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { 5118 SDLoc DL0(N0); 5119 SDValue NewAdd = 5120 DAG.getNode(ISD::ADD, DL0, VT, 5121 N0.getOperand(0), DAG.getConstant(ADDC, DL, VT)); 5122 CombineTo(N0.getNode(), NewAdd); 5123 // Return N so it doesn't get rechecked! 5124 return SDValue(N, 0); 5125 } 5126 } 5127 } 5128 } 5129 } 5130 } 5131 5132 // Reduce bit extract of low half of an integer to the narrower type. 5133 // (and (srl i64:x, K), KMask) -> 5134 // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask) 5135 if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { 5136 if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) { 5137 if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { 5138 unsigned Size = VT.getSizeInBits(); 5139 const APInt &AndMask = CAnd->getAPIntValue(); 5140 unsigned ShiftBits = CShift->getZExtValue(); 5141 5142 // Bail out, this node will probably disappear anyway. 5143 if (ShiftBits == 0) 5144 return SDValue(); 5145 5146 unsigned MaskBits = AndMask.countTrailingOnes(); 5147 EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); 5148 5149 if (AndMask.isMask() && 5150 // Required bits must not span the two halves of the integer and 5151 // must fit in the half size type. 
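// (e.g. for an i64 source, ShiftBits + MaskBits must not exceed 32)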
5152 (ShiftBits + MaskBits <= Size / 2) &&
5153 TLI.isNarrowingProfitable(VT, HalfVT) &&
5154 TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
5155 TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
5156 TLI.isTruncateFree(VT, HalfVT) &&
5157 TLI.isZExtFree(HalfVT, VT)) {
5158 // The isNarrowingProfitable is to avoid regressions on PPC and
5159 // AArch64 which match a few 64-bit bit insert / bit extract patterns
5160 // on downstream users of this. Those patterns could probably be
5161 // extended to handle extensions mixed in.
5162
5163 SDValue SL(N0);
5164 assert(MaskBits <= Size);
5165
5166 // Extracting the highest bit of the low half.
5167 EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
5168 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
5169 N0.getOperand(0));
5170
5171 SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
5172 SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
5173 SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
5174 SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
5175 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
5176 }
5177 }
5178 }
5179 }
5180
5181 return SDValue();
5182 }
5183
5184 bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
5185 EVT LoadResultTy, EVT &ExtVT) {
5186 if (!AndC->getAPIntValue().isMask())
5187 return false;
5188
5189 unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
5190
5191 ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
5192 EVT LoadedVT = LoadN->getMemoryVT();
5193
5194 if (ExtVT == LoadedVT &&
5195 (!LegalOperations ||
5196 TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
5197 // ZEXTLOAD will match without needing to change the size of the value being
5198 // loaded.
5199 return true;
5200 }
5201
5202 // Do not change the width of volatile or atomic loads.
5203 if (!LoadN->isSimple())
5204 return false;
5205
5206 // Do not generate loads of non-round integer types since these can
5207 // be expensive (and would be wrong if the type is not byte sized).
5208 if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
5209 return false;
5210
5211 if (LegalOperations &&
5212 !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
5213 return false;
5214
5215 if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
5216 return false;
5217
5218 return true;
5219 }
5220
5221 bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
5222 ISD::LoadExtType ExtType, EVT &MemVT,
5223 unsigned ShAmt) {
5224 if (!LDST)
5225 return false;
5226 // Only allow byte offsets.
5227 if (ShAmt % 8)
5228 return false;
5229
5230 // Do not generate loads of non-round integer types since these can
5231 // be expensive (and would be wrong if the type is not byte sized).
5232 if (!MemVT.isRound())
5233 return false;
5234
5235 // Don't change the width of volatile or atomic loads.
5236 if (!LDST->isSimple())
5237 return false;
5238
5239 EVT LdStMemVT = LDST->getMemoryVT();
5240
5241 // Bail out when changing the scalable property, since we can't be sure that
5242 // we're actually narrowing here.
5243 if (LdStMemVT.isScalableVector() != MemVT.isScalableVector())
5244 return false;
5245
5246 // Verify that we are actually reducing a load width here.
5247 if (LdStMemVT.bitsLT(MemVT))
5248 return false;
5249
5250 // Ensure that this isn't going to produce an unsupported memory access.
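// For example, narrowing an i32 load to an i16 access at a byte offset of 2
// is only done if the target allows the i16 access at the reduced alignment.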
5251 if (ShAmt) { 5252 assert(ShAmt % 8 == 0 && "ShAmt is byte offset"); 5253 const unsigned ByteShAmt = ShAmt / 8; 5254 const Align LDSTAlign = LDST->getAlign(); 5255 const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt); 5256 if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, 5257 LDST->getAddressSpace(), NarrowAlign, 5258 LDST->getMemOperand()->getFlags())) 5259 return false; 5260 } 5261 5262 // It's not possible to generate a constant of extended or untyped type. 5263 EVT PtrType = LDST->getBasePtr().getValueType(); 5264 if (PtrType == MVT::Untyped || PtrType.isExtended()) 5265 return false; 5266 5267 if (isa<LoadSDNode>(LDST)) { 5268 LoadSDNode *Load = cast<LoadSDNode>(LDST); 5269 // Don't transform one with multiple uses, this would require adding a new 5270 // load. 5271 if (!SDValue(Load, 0).hasOneUse()) 5272 return false; 5273 5274 if (LegalOperations && 5275 !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT)) 5276 return false; 5277 5278 // For the transform to be legal, the load must produce only two values 5279 // (the value loaded and the chain). Don't transform a pre-increment 5280 // load, for example, which produces an extra value. Otherwise the 5281 // transformation is not equivalent, and the downstream logic to replace 5282 // uses gets things wrong. 5283 if (Load->getNumValues() > 2) 5284 return false; 5285 5286 // If the load that we're shrinking is an extload and we're not just 5287 // discarding the extension we can't simply shrink the load. Bail. 5288 // TODO: It would be possible to merge the extensions in some cases. 5289 if (Load->getExtensionType() != ISD::NON_EXTLOAD && 5290 Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt) 5291 return false; 5292 5293 if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT)) 5294 return false; 5295 } else { 5296 assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode"); 5297 StoreSDNode *Store = cast<StoreSDNode>(LDST); 5298 // Can't write outside the original store 5299 if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt) 5300 return false; 5301 5302 if (LegalOperations && 5303 !TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT)) 5304 return false; 5305 } 5306 return true; 5307 } 5308 5309 bool DAGCombiner::SearchForAndLoads(SDNode *N, 5310 SmallVectorImpl<LoadSDNode*> &Loads, 5311 SmallPtrSetImpl<SDNode*> &NodesWithConsts, 5312 ConstantSDNode *Mask, 5313 SDNode *&NodeToMask) { 5314 // Recursively search for the operands, looking for loads which can be 5315 // narrowed. 5316 for (SDValue Op : N->op_values()) { 5317 if (Op.getValueType().isVector()) 5318 return false; 5319 5320 // Some constants may need fixing up later if they are too large. 5321 if (auto *C = dyn_cast<ConstantSDNode>(Op)) { 5322 if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) && 5323 (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue()) 5324 NodesWithConsts.insert(N); 5325 continue; 5326 } 5327 5328 if (!Op.hasOneUse()) 5329 return false; 5330 5331 switch(Op.getOpcode()) { 5332 case ISD::LOAD: { 5333 auto *Load = cast<LoadSDNode>(Op); 5334 EVT ExtVT; 5335 if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) && 5336 isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) { 5337 5338 // ZEXTLOAD is already small enough. 5339 if (Load->getExtensionType() == ISD::ZEXTLOAD && 5340 ExtVT.bitsGE(Load->getMemoryVT())) 5341 continue; 5342 5343 // Use LE to convert equal sized loads to zext. 
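// e.g. with a 0xFF mask, ExtVT is i8, so an existing i8 (ext)load qualifies
// here and can later be rebuilt as an i8 zextload.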
5344 if (ExtVT.bitsLE(Load->getMemoryVT())) 5345 Loads.push_back(Load); 5346 5347 continue; 5348 } 5349 return false; 5350 } 5351 case ISD::ZERO_EXTEND: 5352 case ISD::AssertZext: { 5353 unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes(); 5354 EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); 5355 EVT VT = Op.getOpcode() == ISD::AssertZext ? 5356 cast<VTSDNode>(Op.getOperand(1))->getVT() : 5357 Op.getOperand(0).getValueType(); 5358 5359 // We can accept extending nodes if the mask is wider or an equal 5360 // width to the original type. 5361 if (ExtVT.bitsGE(VT)) 5362 continue; 5363 break; 5364 } 5365 case ISD::OR: 5366 case ISD::XOR: 5367 case ISD::AND: 5368 if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask, 5369 NodeToMask)) 5370 return false; 5371 continue; 5372 } 5373 5374 // Allow one node which will masked along with any loads found. 5375 if (NodeToMask) 5376 return false; 5377 5378 // Also ensure that the node to be masked only produces one data result. 5379 NodeToMask = Op.getNode(); 5380 if (NodeToMask->getNumValues() > 1) { 5381 bool HasValue = false; 5382 for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) { 5383 MVT VT = SDValue(NodeToMask, i).getSimpleValueType(); 5384 if (VT != MVT::Glue && VT != MVT::Other) { 5385 if (HasValue) { 5386 NodeToMask = nullptr; 5387 return false; 5388 } 5389 HasValue = true; 5390 } 5391 } 5392 assert(HasValue && "Node to be masked has no data result?"); 5393 } 5394 } 5395 return true; 5396 } 5397 5398 bool DAGCombiner::BackwardsPropagateMask(SDNode *N) { 5399 auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); 5400 if (!Mask) 5401 return false; 5402 5403 if (!Mask->getAPIntValue().isMask()) 5404 return false; 5405 5406 // No need to do anything if the and directly uses a load. 5407 if (isa<LoadSDNode>(N->getOperand(0))) 5408 return false; 5409 5410 SmallVector<LoadSDNode*, 8> Loads; 5411 SmallPtrSet<SDNode*, 2> NodesWithConsts; 5412 SDNode *FixupNode = nullptr; 5413 if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) { 5414 if (Loads.size() == 0) 5415 return false; 5416 5417 LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump()); 5418 SDValue MaskOp = N->getOperand(1); 5419 5420 // If it exists, fixup the single node we allow in the tree that needs 5421 // masking. 5422 if (FixupNode) { 5423 LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump()); 5424 SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode), 5425 FixupNode->getValueType(0), 5426 SDValue(FixupNode, 0), MaskOp); 5427 DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And); 5428 if (And.getOpcode() == ISD ::AND) 5429 DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp); 5430 } 5431 5432 // Narrow any constants that need it. 5433 for (auto *LogicN : NodesWithConsts) { 5434 SDValue Op0 = LogicN->getOperand(0); 5435 SDValue Op1 = LogicN->getOperand(1); 5436 5437 if (isa<ConstantSDNode>(Op0)) 5438 std::swap(Op0, Op1); 5439 5440 SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(), 5441 Op1, MaskOp); 5442 5443 DAG.UpdateNodeOperands(LogicN, Op0, And); 5444 } 5445 5446 // Create narrow loads. 
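// Each load found above is masked explicitly so that ReduceLoadWidth can then
// shrink it, e.g. an i32 extload feeding a 0xFF mask becomes an i8 zextload.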
5447 for (auto *Load : Loads) { 5448 LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump()); 5449 SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0), 5450 SDValue(Load, 0), MaskOp); 5451 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And); 5452 if (And.getOpcode() == ISD ::AND) 5453 And = SDValue( 5454 DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0); 5455 SDValue NewLoad = ReduceLoadWidth(And.getNode()); 5456 assert(NewLoad && 5457 "Shouldn't be masking the load if it can't be narrowed"); 5458 CombineTo(Load, NewLoad, NewLoad.getValue(1)); 5459 } 5460 DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode()); 5461 return true; 5462 } 5463 return false; 5464 } 5465 5466 // Unfold 5467 // x & (-1 'logical shift' y) 5468 // To 5469 // (x 'opposite logical shift' y) 'logical shift' y 5470 // if it is better for performance. 5471 SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) { 5472 assert(N->getOpcode() == ISD::AND); 5473 5474 SDValue N0 = N->getOperand(0); 5475 SDValue N1 = N->getOperand(1); 5476 5477 // Do we actually prefer shifts over mask? 5478 if (!TLI.shouldFoldMaskToVariableShiftPair(N0)) 5479 return SDValue(); 5480 5481 // Try to match (-1 '[outer] logical shift' y) 5482 unsigned OuterShift; 5483 unsigned InnerShift; // The opposite direction to the OuterShift. 5484 SDValue Y; // Shift amount. 5485 auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool { 5486 if (!M.hasOneUse()) 5487 return false; 5488 OuterShift = M->getOpcode(); 5489 if (OuterShift == ISD::SHL) 5490 InnerShift = ISD::SRL; 5491 else if (OuterShift == ISD::SRL) 5492 InnerShift = ISD::SHL; 5493 else 5494 return false; 5495 if (!isAllOnesConstant(M->getOperand(0))) 5496 return false; 5497 Y = M->getOperand(1); 5498 return true; 5499 }; 5500 5501 SDValue X; 5502 if (matchMask(N1)) 5503 X = N0; 5504 else if (matchMask(N0)) 5505 X = N1; 5506 else 5507 return SDValue(); 5508 5509 SDLoc DL(N); 5510 EVT VT = N->getValueType(0); 5511 5512 // tmp = x 'opposite logical shift' y 5513 SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y); 5514 // ret = tmp 'logical shift' y 5515 SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y); 5516 5517 return T1; 5518 } 5519 5520 /// Try to replace shift/logic that tests if a bit is clear with mask + setcc. 5521 /// For a target with a bit test, this is expected to become test + set and save 5522 /// at least 1 instruction. 5523 static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) { 5524 assert(And->getOpcode() == ISD::AND && "Expected an 'and' op"); 5525 5526 // This is probably not worthwhile without a supported type. 5527 EVT VT = And->getValueType(0); 5528 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5529 if (!TLI.isTypeLegal(VT)) 5530 return SDValue(); 5531 5532 // Look through an optional extension and find a 'not'. 5533 // TODO: Should we favor test+set even without the 'not' op? 5534 SDValue Not = And->getOperand(0), And1 = And->getOperand(1); 5535 if (Not.getOpcode() == ISD::ANY_EXTEND) 5536 Not = Not.getOperand(0); 5537 if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1)) 5538 return SDValue(); 5539 5540 // Look though an optional truncation. The source operand may not be the same 5541 // type as the original 'and', but that is ok because we are masking off 5542 // everything but the low bit. 5543 SDValue Srl = Not.getOperand(0); 5544 if (Srl.getOpcode() == ISD::TRUNCATE) 5545 Srl = Srl.getOperand(0); 5546 5547 // Match a shift-right by constant. 
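// (for illustration, with C == 3 the final pattern below is a test that bit 3
// of X is clear: (and X, 8) == 0)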
5548 if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() || 5549 !isa<ConstantSDNode>(Srl.getOperand(1))) 5550 return SDValue(); 5551 5552 // We might have looked through casts that make this transform invalid. 5553 // TODO: If the source type is wider than the result type, do the mask and 5554 // compare in the source type. 5555 const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1); 5556 unsigned VTBitWidth = VT.getSizeInBits(); 5557 if (ShiftAmt.uge(VTBitWidth)) 5558 return SDValue(); 5559 5560 // Turn this into a bit-test pattern using mask op + setcc: 5561 // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0 5562 SDLoc DL(And); 5563 SDValue X = DAG.getZExtOrTrunc(Srl.getOperand(0), DL, VT); 5564 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 5565 SDValue Mask = DAG.getConstant( 5566 APInt::getOneBitSet(VTBitWidth, ShiftAmt.getZExtValue()), DL, VT); 5567 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask); 5568 SDValue Zero = DAG.getConstant(0, DL, VT); 5569 SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ); 5570 return DAG.getZExtOrTrunc(Setcc, DL, VT); 5571 } 5572 5573 SDValue DAGCombiner::visitAND(SDNode *N) { 5574 SDValue N0 = N->getOperand(0); 5575 SDValue N1 = N->getOperand(1); 5576 EVT VT = N1.getValueType(); 5577 5578 // x & x --> x 5579 if (N0 == N1) 5580 return N0; 5581 5582 // fold vector ops 5583 if (VT.isVector()) { 5584 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 5585 return FoldedVOp; 5586 5587 // fold (and x, 0) -> 0, vector edition 5588 if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) 5589 // do not return N0, because undef node may exist in N0 5590 return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()), 5591 SDLoc(N), N0.getValueType()); 5592 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) 5593 // do not return N1, because undef node may exist in N1 5594 return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()), 5595 SDLoc(N), N1.getValueType()); 5596 5597 // fold (and x, -1) -> x, vector edition 5598 if (ISD::isConstantSplatVectorAllOnes(N0.getNode())) 5599 return N1; 5600 if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) 5601 return N0; 5602 5603 // fold (and (masked_load) (build_vec (x, ...))) to zext_masked_load 5604 auto *MLoad = dyn_cast<MaskedLoadSDNode>(N0); 5605 auto *BVec = dyn_cast<BuildVectorSDNode>(N1); 5606 if (MLoad && BVec && MLoad->getExtensionType() == ISD::EXTLOAD && 5607 N0.hasOneUse() && N1.hasOneUse()) { 5608 EVT LoadVT = MLoad->getMemoryVT(); 5609 EVT ExtVT = VT; 5610 if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) { 5611 // For this AND to be a zero extension of the masked load the elements 5612 // of the BuildVec must mask the bottom bits of the extended element 5613 // type 5614 if (ConstantSDNode *Splat = BVec->getConstantSplatNode()) { 5615 uint64_t ElementSize = 5616 LoadVT.getVectorElementType().getScalarSizeInBits(); 5617 if (Splat->getAPIntValue().isMask(ElementSize)) { 5618 return DAG.getMaskedLoad( 5619 ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(), 5620 MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(), 5621 LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(), 5622 ISD::ZEXTLOAD, MLoad->isExpandingLoad()); 5623 } 5624 } 5625 } 5626 } 5627 } 5628 5629 // fold (and c1, c2) -> c1&c2 5630 ConstantSDNode *N1C = isConstOrConstSplat(N1); 5631 if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1})) 5632 return C; 5633 5634 // canonicalize constant to RHS 5635 if 
(DAG.isConstantIntBuildVectorOrConstantInt(N0) && 5636 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 5637 return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); 5638 5639 // fold (and x, -1) -> x 5640 if (isAllOnesConstant(N1)) 5641 return N0; 5642 5643 // if (and x, c) is known to be zero, return 0 5644 unsigned BitWidth = VT.getScalarSizeInBits(); 5645 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), 5646 APInt::getAllOnesValue(BitWidth))) 5647 return DAG.getConstant(0, SDLoc(N), VT); 5648 5649 if (SDValue NewSel = foldBinOpIntoSelect(N)) 5650 return NewSel; 5651 5652 // reassociate and 5653 if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags())) 5654 return RAND; 5655 5656 // Try to convert a constant mask AND into a shuffle clear mask. 5657 if (VT.isVector()) 5658 if (SDValue Shuffle = XformToShuffleWithZero(N)) 5659 return Shuffle; 5660 5661 if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) 5662 return Combined; 5663 5664 // fold (and (or x, C), D) -> D if (C & D) == D 5665 auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { 5666 return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); 5667 }; 5668 if (N0.getOpcode() == ISD::OR && 5669 ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset)) 5670 return N1; 5671 // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. 5672 if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { 5673 SDValue N0Op0 = N0.getOperand(0); 5674 APInt Mask = ~N1C->getAPIntValue(); 5675 Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits()); 5676 if (DAG.MaskedValueIsZero(N0Op0, Mask)) { 5677 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 5678 N0.getValueType(), N0Op0); 5679 5680 // Replace uses of the AND with uses of the Zero extend node. 5681 CombineTo(N, Zext); 5682 5683 // We actually want to replace all uses of the any_extend with the 5684 // zero_extend, to avoid duplicating things. This will later cause this 5685 // AND to be folded. 5686 CombineTo(N0.getNode(), Zext); 5687 return SDValue(N, 0); // Return N so it doesn't get rechecked! 5688 } 5689 } 5690 5691 // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> 5692 // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must 5693 // already be zero by virtue of the width of the base type of the load. 5694 // 5695 // the 'X' node here can either be nothing or an extract_vector_elt to catch 5696 // more cases. 5697 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 5698 N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() && 5699 N0.getOperand(0).getOpcode() == ISD::LOAD && 5700 N0.getOperand(0).getResNo() == 0) || 5701 (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) { 5702 LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ? 5703 N0 : N0.getOperand(0) ); 5704 5705 // Get the constant (if applicable) the zero'th operand is being ANDed with. 5706 // This can be a pure constant or a vector splat, in which case we treat the 5707 // vector as a scalar and use the splat value. 
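// For example, (and (extload i8 -> i32, x), 0xFF): once the constant is
// truncated to the 8-bit memory type it is all-ones, so the AND is redundant
// provided the extload can be turned into (or already is) a zextload.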
5708 APInt Constant = APInt::getNullValue(1); 5709 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 5710 Constant = C->getAPIntValue(); 5711 } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) { 5712 APInt SplatValue, SplatUndef; 5713 unsigned SplatBitSize; 5714 bool HasAnyUndefs; 5715 bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef, 5716 SplatBitSize, HasAnyUndefs); 5717 if (IsSplat) { 5718 // Undef bits can contribute to a possible optimisation if set, so 5719 // set them. 5720 SplatValue |= SplatUndef; 5721 5722 // The splat value may be something like "0x00FFFFFF", which means 0 for 5723 // the first vector value and FF for the rest, repeating. We need a mask 5724 // that will apply equally to all members of the vector, so AND all the 5725 // lanes of the constant together. 5726 unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits(); 5727 5728 // If the splat value has been compressed to a bitlength lower 5729 // than the size of the vector lane, we need to re-expand it to 5730 // the lane size. 5731 if (EltBitWidth > SplatBitSize) 5732 for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth); 5733 SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2) 5734 SplatValue |= SplatValue.shl(SplatBitSize); 5735 5736 // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a 5737 // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value. 5738 if ((SplatBitSize % EltBitWidth) == 0) { 5739 Constant = APInt::getAllOnesValue(EltBitWidth); 5740 for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i) 5741 Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth); 5742 } 5743 } 5744 } 5745 5746 // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is 5747 // actually legal and isn't going to get expanded, else this is a false 5748 // optimisation. 5749 bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, 5750 Load->getValueType(0), 5751 Load->getMemoryVT()); 5752 5753 // Resize the constant to the same size as the original memory access before 5754 // extension. If it is still the AllOnesValue then this AND is completely 5755 // unneeded. 5756 Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits()); 5757 5758 bool B; 5759 switch (Load->getExtensionType()) { 5760 default: B = false; break; 5761 case ISD::EXTLOAD: B = CanZextLoadProfitably; break; 5762 case ISD::ZEXTLOAD: 5763 case ISD::NON_EXTLOAD: B = true; break; 5764 } 5765 5766 if (B && Constant.isAllOnesValue()) { 5767 // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to 5768 // preserve semantics once we get rid of the AND. 5769 SDValue NewLoad(Load, 0); 5770 5771 // Fold the AND away. NewLoad may get replaced immediately. 5772 CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); 5773 5774 if (Load->getExtensionType() == ISD::EXTLOAD) { 5775 NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, 5776 Load->getValueType(0), SDLoc(Load), 5777 Load->getChain(), Load->getBasePtr(), 5778 Load->getOffset(), Load->getMemoryVT(), 5779 Load->getMemOperand()); 5780 // Replace uses of the EXTLOAD with the new ZEXTLOAD. 5781 if (Load->getNumValues() == 3) { 5782 // PRE/POST_INC loads have 3 values. 
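// (the loaded value, the writeback address, and the chain)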
5783 SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), 5784 NewLoad.getValue(2) }; 5785 CombineTo(Load, To, 3, true); 5786 } else { 5787 CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); 5788 } 5789 } 5790 5791 return SDValue(N, 0); // Return N so it doesn't get rechecked! 5792 } 5793 } 5794 5795 // fold (and (masked_gather x)) -> (zext_masked_gather x) 5796 if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) { 5797 EVT MemVT = GN0->getMemoryVT(); 5798 EVT ScalarVT = MemVT.getScalarType(); 5799 5800 if (SDValue(GN0, 0).hasOneUse() && 5801 isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) && 5802 TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { 5803 SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), 5804 GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; 5805 5806 SDValue ZExtLoad = DAG.getMaskedGather( 5807 DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops, 5808 GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD); 5809 5810 CombineTo(N, ZExtLoad); 5811 AddToWorklist(ZExtLoad.getNode()); 5812 // Avoid recheck of N. 5813 return SDValue(N, 0); 5814 } 5815 } 5816 5817 // fold (and (load x), 255) -> (zextload x, i8) 5818 // fold (and (extload x, i16), 255) -> (zextload x, i8) 5819 // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) 5820 if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD || 5821 (N0.getOpcode() == ISD::ANY_EXTEND && 5822 N0.getOperand(0).getOpcode() == ISD::LOAD))) { 5823 if (SDValue Res = ReduceLoadWidth(N)) { 5824 LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND 5825 ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0); 5826 AddToWorklist(N); 5827 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 0), Res); 5828 return SDValue(N, 0); 5829 } 5830 } 5831 5832 if (LegalTypes) { 5833 // Attempt to propagate the AND back up to the leaves which, if they're 5834 // loads, can be combined to narrow loads and the AND node can be removed. 5835 // Perform after legalization so that extend nodes will already be 5836 // combined into the loads. 5837 if (BackwardsPropagateMask(N)) 5838 return SDValue(N, 0); 5839 } 5840 5841 if (SDValue Combined = visitANDLike(N0, N1, N)) 5842 return Combined; 5843 5844 // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) 5845 if (N0.getOpcode() == N1.getOpcode()) 5846 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) 5847 return V; 5848 5849 // Masking the negated extension of a boolean is just the zero-extended 5850 // boolean: 5851 // and (sub 0, zext(bool X)), 1 --> zext(bool X) 5852 // and (sub 0, sext(bool X)), 1 --> zext(bool X) 5853 // 5854 // Note: the SimplifyDemandedBits fold below can make an information-losing 5855 // transform, and then we have no way to find this better fold. 5856 if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { 5857 if (isNullOrNullSplat(N0.getOperand(0))) { 5858 SDValue SubRHS = N0.getOperand(1); 5859 if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && 5860 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) 5861 return SubRHS; 5862 if (SubRHS.getOpcode() == ISD::SIGN_EXTEND && 5863 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) 5864 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0)); 5865 } 5866 } 5867 5868 // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) 5869 // fold (and (sra)) -> (and (srl)) when possible. 
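// Both folds fall out of demanded-bits analysis: when the AND only demands
// bits that are unaffected by the in-register extension or by an arithmetic
// shift's sign bits, those operations can be simplified or relaxed to SRL.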
5870 if (SimplifyDemandedBits(SDValue(N, 0))) 5871 return SDValue(N, 0); 5872 5873 // fold (zext_inreg (extload x)) -> (zextload x) 5874 // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use 5875 if (ISD::isUNINDEXEDLoad(N0.getNode()) && 5876 (ISD::isEXTLoad(N0.getNode()) || 5877 (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) { 5878 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 5879 EVT MemVT = LN0->getMemoryVT(); 5880 // If we zero all the possible extended bits, then we can turn this into 5881 // a zextload if we are running before legalize or the operation is legal. 5882 unsigned ExtBitSize = N1.getScalarValueSizeInBits(); 5883 unsigned MemBitSize = MemVT.getScalarSizeInBits(); 5884 APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize); 5885 if (DAG.MaskedValueIsZero(N1, ExtBits) && 5886 ((!LegalOperations && LN0->isSimple()) || 5887 TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { 5888 SDValue ExtLoad = 5889 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), 5890 LN0->getBasePtr(), MemVT, LN0->getMemOperand()); 5891 AddToWorklist(N); 5892 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); 5893 return SDValue(N, 0); // Return N so it doesn't get rechecked! 5894 } 5895 } 5896 5897 // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const) 5898 if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) { 5899 if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), 5900 N0.getOperand(1), false)) 5901 return BSwap; 5902 } 5903 5904 if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N)) 5905 return Shifts; 5906 5907 if (TLI.hasBitTest(N0, N1)) 5908 if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) 5909 return V; 5910 5911 // Recognize the following pattern: 5912 // 5913 // AndVT = (and (sign_extend NarrowVT to AndVT) #bitmask) 5914 // 5915 // where bitmask is a mask that clears the upper bits of AndVT. The 5916 // number of bits in bitmask must be a power of two. 5917 auto IsAndZeroExtMask = [](SDValue LHS, SDValue RHS) { 5918 if (LHS->getOpcode() != ISD::SIGN_EXTEND) 5919 return false; 5920 5921 auto *C = dyn_cast<ConstantSDNode>(RHS); 5922 if (!C) 5923 return false; 5924 5925 if (!C->getAPIntValue().isMask( 5926 LHS.getOperand(0).getValueType().getFixedSizeInBits())) 5927 return false; 5928 5929 return true; 5930 }; 5931 5932 // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...). 5933 if (IsAndZeroExtMask(N0, N1)) 5934 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0)); 5935 5936 return SDValue(); 5937 } 5938 5939 /// Match (a >> 8) | (a << 8) as (bswap a) >> 16. 
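/// For i32, for example, this matches ((a << 8) & 0xff00) | ((a >> 8) & 0xff)
/// and rewrites it to (srl (bswap a), 16).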
5940 SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, 5941 bool DemandHighBits) { 5942 if (!LegalOperations) 5943 return SDValue(); 5944 5945 EVT VT = N->getValueType(0); 5946 if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) 5947 return SDValue(); 5948 if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) 5949 return SDValue(); 5950 5951 // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff) 5952 bool LookPassAnd0 = false; 5953 bool LookPassAnd1 = false; 5954 if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) 5955 std::swap(N0, N1); 5956 if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) 5957 std::swap(N0, N1); 5958 if (N0.getOpcode() == ISD::AND) { 5959 if (!N0.getNode()->hasOneUse()) 5960 return SDValue(); 5961 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5962 // Also handle 0xffff since the LHS is guaranteed to have zeros there. 5963 // This is needed for X86. 5964 if (!N01C || (N01C->getZExtValue() != 0xFF00 && 5965 N01C->getZExtValue() != 0xFFFF)) 5966 return SDValue(); 5967 N0 = N0.getOperand(0); 5968 LookPassAnd0 = true; 5969 } 5970 5971 if (N1.getOpcode() == ISD::AND) { 5972 if (!N1.getNode()->hasOneUse()) 5973 return SDValue(); 5974 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 5975 if (!N11C || N11C->getZExtValue() != 0xFF) 5976 return SDValue(); 5977 N1 = N1.getOperand(0); 5978 LookPassAnd1 = true; 5979 } 5980 5981 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 5982 std::swap(N0, N1); 5983 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 5984 return SDValue(); 5985 if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse()) 5986 return SDValue(); 5987 5988 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5989 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 5990 if (!N01C || !N11C) 5991 return SDValue(); 5992 if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8) 5993 return SDValue(); 5994 5995 // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8) 5996 SDValue N00 = N0->getOperand(0); 5997 if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) { 5998 if (!N00.getNode()->hasOneUse()) 5999 return SDValue(); 6000 ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1)); 6001 if (!N001C || N001C->getZExtValue() != 0xFF) 6002 return SDValue(); 6003 N00 = N00.getOperand(0); 6004 LookPassAnd0 = true; 6005 } 6006 6007 SDValue N10 = N1->getOperand(0); 6008 if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) { 6009 if (!N10.getNode()->hasOneUse()) 6010 return SDValue(); 6011 ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1)); 6012 // Also allow 0xFFFF since the bits will be shifted out. This is needed 6013 // for X86. 6014 if (!N101C || (N101C->getZExtValue() != 0xFF00 && 6015 N101C->getZExtValue() != 0xFFFF)) 6016 return SDValue(); 6017 N10 = N10.getOperand(0); 6018 LookPassAnd1 = true; 6019 } 6020 6021 if (N00 != N10) 6022 return SDValue(); 6023 6024 // Make sure everything beyond the low halfword gets set to zero since the SRL 6025 // 16 will clear the top bits. 6026 unsigned OpSizeInBits = VT.getSizeInBits(); 6027 if (DemandHighBits && OpSizeInBits > 16) { 6028 // If the left-shift isn't masked out then the only way this is a bswap is 6029 // if all bits beyond the low 8 are 0. In that case the entire pattern 6030 // reduces to a left shift anyway: leave it for other parts of the combiner. 
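// e.g. for i32, an unmasked (shl a, 8) can only form a bswap here if the top
// 24 bits of 'a' are already zero.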
6031 if (!LookPassAnd0) 6032 return SDValue(); 6033 6034 // However, if the right shift isn't masked out then it might be because 6035 // it's not needed. See if we can spot that too. 6036 if (!LookPassAnd1 && 6037 !DAG.MaskedValueIsZero( 6038 N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16))) 6039 return SDValue(); 6040 } 6041 6042 SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00); 6043 if (OpSizeInBits > 16) { 6044 SDLoc DL(N); 6045 Res = DAG.getNode(ISD::SRL, DL, VT, Res, 6046 DAG.getConstant(OpSizeInBits - 16, DL, 6047 getShiftAmountTy(VT))); 6048 } 6049 return Res; 6050 } 6051 6052 /// Return true if the specified node is an element that makes up a 32-bit 6053 /// packed halfword byteswap. 6054 /// ((x & 0x000000ff) << 8) | 6055 /// ((x & 0x0000ff00) >> 8) | 6056 /// ((x & 0x00ff0000) << 8) | 6057 /// ((x & 0xff000000) >> 8) 6058 static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { 6059 if (!N.getNode()->hasOneUse()) 6060 return false; 6061 6062 unsigned Opc = N.getOpcode(); 6063 if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) 6064 return false; 6065 6066 SDValue N0 = N.getOperand(0); 6067 unsigned Opc0 = N0.getOpcode(); 6068 if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL) 6069 return false; 6070 6071 ConstantSDNode *N1C = nullptr; 6072 // SHL or SRL: look upstream for AND mask operand 6073 if (Opc == ISD::AND) 6074 N1C = dyn_cast<ConstantSDNode>(N.getOperand(1)); 6075 else if (Opc0 == ISD::AND) 6076 N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 6077 if (!N1C) 6078 return false; 6079 6080 unsigned MaskByteOffset; 6081 switch (N1C->getZExtValue()) { 6082 default: 6083 return false; 6084 case 0xFF: MaskByteOffset = 0; break; 6085 case 0xFF00: MaskByteOffset = 1; break; 6086 case 0xFFFF: 6087 // In case demanded bits didn't clear the bits that will be shifted out. 6088 // This is needed for X86. 6089 if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) { 6090 MaskByteOffset = 1; 6091 break; 6092 } 6093 return false; 6094 case 0xFF0000: MaskByteOffset = 2; break; 6095 case 0xFF000000: MaskByteOffset = 3; break; 6096 } 6097 6098 // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). 6099 if (Opc == ISD::AND) { 6100 if (MaskByteOffset == 0 || MaskByteOffset == 2) { 6101 // (x >> 8) & 0xff 6102 // (x >> 8) & 0xff0000 6103 if (Opc0 != ISD::SRL) 6104 return false; 6105 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 6106 if (!C || C->getZExtValue() != 8) 6107 return false; 6108 } else { 6109 // (x << 8) & 0xff00 6110 // (x << 8) & 0xff000000 6111 if (Opc0 != ISD::SHL) 6112 return false; 6113 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 6114 if (!C || C->getZExtValue() != 8) 6115 return false; 6116 } 6117 } else if (Opc == ISD::SHL) { 6118 // (x & 0xff) << 8 6119 // (x & 0xff0000) << 8 6120 if (MaskByteOffset != 0 && MaskByteOffset != 2) 6121 return false; 6122 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); 6123 if (!C || C->getZExtValue() != 8) 6124 return false; 6125 } else { // Opc == ISD::SRL 6126 // (x & 0xff00) >> 8 6127 // (x & 0xff000000) >> 8 6128 if (MaskByteOffset != 1 && MaskByteOffset != 3) 6129 return false; 6130 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); 6131 if (!C || C->getZExtValue() != 8) 6132 return false; 6133 } 6134 6135 if (Parts[MaskByteOffset]) 6136 return false; 6137 6138 Parts[MaskByteOffset] = N0.getOperand(0).getNode(); 6139 return true; 6140 } 6141 6142 // Match 2 elements of a packed halfword bswap. 
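// A pair is either an (or x, y) whose operands are both halfword-bswap
// elements, or an (srl (bswap X), 16) that already supplies both halves.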
6143 static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) { 6144 if (N.getOpcode() == ISD::OR) 6145 return isBSwapHWordElement(N.getOperand(0), Parts) && 6146 isBSwapHWordElement(N.getOperand(1), Parts); 6147 6148 if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) { 6149 ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1)); 6150 if (!C || C->getAPIntValue() != 16) 6151 return false; 6152 Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode(); 6153 return true; 6154 } 6155 6156 return false; 6157 } 6158 6159 // Match this pattern: 6160 // (or (and (shl (A, 8)), 0xff00ff00), (and (srl (A, 8)), 0x00ff00ff)) 6161 // And rewrite this to: 6162 // (rotr (bswap A), 16) 6163 static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI, 6164 SelectionDAG &DAG, SDNode *N, SDValue N0, 6165 SDValue N1, EVT VT, EVT ShiftAmountTy) { 6166 assert(N->getOpcode() == ISD::OR && VT == MVT::i32 && 6167 "MatchBSwapHWordOrAndAnd: expecting i32"); 6168 if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT)) 6169 return SDValue(); 6170 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) 6171 return SDValue(); 6172 // TODO: this is too restrictive; lifting this restriction requires more tests 6173 if (!N0->hasOneUse() || !N1->hasOneUse()) 6174 return SDValue(); 6175 ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1)); 6176 ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1)); 6177 if (!Mask0 || !Mask1) 6178 return SDValue(); 6179 if (Mask0->getAPIntValue() != 0xff00ff00 || 6180 Mask1->getAPIntValue() != 0x00ff00ff) 6181 return SDValue(); 6182 SDValue Shift0 = N0.getOperand(0); 6183 SDValue Shift1 = N1.getOperand(0); 6184 if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL) 6185 return SDValue(); 6186 ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1)); 6187 ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1)); 6188 if (!ShiftAmt0 || !ShiftAmt1) 6189 return SDValue(); 6190 if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8) 6191 return SDValue(); 6192 if (Shift0.getOperand(0) != Shift1.getOperand(0)) 6193 return SDValue(); 6194 6195 SDLoc DL(N); 6196 SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0)); 6197 SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy); 6198 return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt); 6199 } 6200 6201 /// Match a 32-bit packed halfword bswap. That is 6202 /// ((x & 0x000000ff) << 8) | 6203 /// ((x & 0x0000ff00) >> 8) | 6204 /// ((x & 0x00ff0000) << 8) | 6205 /// ((x & 0xff000000) >> 8) 6206 /// => (rotl (bswap x), 16) 6207 SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { 6208 if (!LegalOperations) 6209 return SDValue(); 6210 6211 EVT VT = N->getValueType(0); 6212 if (VT != MVT::i32) 6213 return SDValue(); 6214 if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) 6215 return SDValue(); 6216 6217 if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT, 6218 getShiftAmountTy(VT))) 6219 return BSwap; 6220 6221 // Try again with commuted operands. 
6222 if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT, 6223 getShiftAmountTy(VT))) 6224 return BSwap; 6225 6226 6227 // Look for either 6228 // (or (bswaphpair), (bswaphpair)) 6229 // (or (or (bswaphpair), (and)), (and)) 6230 // (or (or (and), (bswaphpair)), (and)) 6231 SDNode *Parts[4] = {}; 6232 6233 if (isBSwapHWordPair(N0, Parts)) { 6234 // (or (or (and), (and)), (or (and), (and))) 6235 if (!isBSwapHWordPair(N1, Parts)) 6236 return SDValue(); 6237 } else if (N0.getOpcode() == ISD::OR) { 6238 // (or (or (or (and), (and)), (and)), (and)) 6239 if (!isBSwapHWordElement(N1, Parts)) 6240 return SDValue(); 6241 SDValue N00 = N0.getOperand(0); 6242 SDValue N01 = N0.getOperand(1); 6243 if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) && 6244 !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts))) 6245 return SDValue(); 6246 } else 6247 return SDValue(); 6248 6249 // Make sure the parts are all coming from the same node. 6250 if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3]) 6251 return SDValue(); 6252 6253 SDLoc DL(N); 6254 SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, 6255 SDValue(Parts[0], 0)); 6256 6257 // Result of the bswap should be rotated by 16. If it's not legal, then 6258 // do (x << 16) | (x >> 16). 6259 SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT)); 6260 if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT)) 6261 return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt); 6262 if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT)) 6263 return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt); 6264 return DAG.getNode(ISD::OR, DL, VT, 6265 DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt), 6266 DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt)); 6267 } 6268 6269 /// This contains all DAGCombine rules which reduce two values combined by 6270 /// an Or operation to a single value \see visitANDLike(). 6271 SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { 6272 EVT VT = N1.getValueType(); 6273 SDLoc DL(N); 6274 6275 // fold (or x, undef) -> -1 6276 if (!LegalOperations && (N0.isUndef() || N1.isUndef())) 6277 return DAG.getAllOnesConstant(DL, VT); 6278 6279 if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL)) 6280 return V; 6281 6282 // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. 6283 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && 6284 // Don't increase # computations. 6285 (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { 6286 // We can only do this xform if we know that bits from X that are set in C2 6287 // but not in C1 are already zero. Likewise for Y. 6288 if (const ConstantSDNode *N0O1C = 6289 getAsNonOpaqueConstant(N0.getOperand(1))) { 6290 if (const ConstantSDNode *N1O1C = 6291 getAsNonOpaqueConstant(N1.getOperand(1))) { 6292 // We can only do this xform if we know that bits from X that are set in 6293 // C2 but not in C1 are already zero. Likewise for Y. 
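// For example, (or (and X, 0xFF00), (and Y, 0x00FF)) can become
// (and (or X, Y), 0xFFFF) only when X is known to be zero in the 0x00FF
// positions and Y is known to be zero in the 0xFF00 positions; otherwise bits
// of one value would leak into the byte owned by the other.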
6294 const APInt &LHSMask = N0O1C->getAPIntValue(); 6295 const APInt &RHSMask = N1O1C->getAPIntValue(); 6296 6297 if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) && 6298 DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { 6299 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, 6300 N0.getOperand(0), N1.getOperand(0)); 6301 return DAG.getNode(ISD::AND, DL, VT, X, 6302 DAG.getConstant(LHSMask | RHSMask, DL, VT)); 6303 } 6304 } 6305 } 6306 } 6307 6308 // (or (and X, M), (and X, N)) -> (and X, (or M, N)) 6309 if (N0.getOpcode() == ISD::AND && 6310 N1.getOpcode() == ISD::AND && 6311 N0.getOperand(0) == N1.getOperand(0) && 6312 // Don't increase # computations. 6313 (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { 6314 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, 6315 N0.getOperand(1), N1.getOperand(1)); 6316 return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); 6317 } 6318 6319 return SDValue(); 6320 } 6321 6322 /// OR combines for which the commuted variant will be tried as well. 6323 static SDValue visitORCommutative( 6324 SelectionDAG &DAG, SDValue N0, SDValue N1, SDNode *N) { 6325 EVT VT = N0.getValueType(); 6326 if (N0.getOpcode() == ISD::AND) { 6327 // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y) 6328 if (isBitwiseNot(N0.getOperand(1)) && N0.getOperand(1).getOperand(0) == N1) 6329 return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1); 6330 6331 // fold (or (and (xor Y, -1), X), Y) -> (or X, Y) 6332 if (isBitwiseNot(N0.getOperand(0)) && N0.getOperand(0).getOperand(0) == N1) 6333 return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1); 6334 } 6335 6336 return SDValue(); 6337 } 6338 6339 SDValue DAGCombiner::visitOR(SDNode *N) { 6340 SDValue N0 = N->getOperand(0); 6341 SDValue N1 = N->getOperand(1); 6342 EVT VT = N1.getValueType(); 6343 6344 // x | x --> x 6345 if (N0 == N1) 6346 return N0; 6347 6348 // fold vector ops 6349 if (VT.isVector()) { 6350 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 6351 return FoldedVOp; 6352 6353 // fold (or x, 0) -> x, vector edition 6354 if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) 6355 return N1; 6356 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) 6357 return N0; 6358 6359 // fold (or x, -1) -> -1, vector edition 6360 if (ISD::isConstantSplatVectorAllOnes(N0.getNode())) 6361 // do not return N0, because undef node may exist in N0 6362 return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); 6363 if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) 6364 // do not return N1, because undef node may exist in N1 6365 return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); 6366 6367 // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask) 6368 // Do this only if the resulting shuffle is legal. 6369 if (isa<ShuffleVectorSDNode>(N0) && 6370 isa<ShuffleVectorSDNode>(N1) && 6371 // Avoid folding a node with illegal type. 6372 TLI.isTypeLegal(VT)) { 6373 bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode()); 6374 bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()); 6375 bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); 6376 bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode()); 6377 // Ensure both shuffles have a zero input. 
6378 if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) { 6379 assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!"); 6380 assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!"); 6381 const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0); 6382 const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1); 6383 bool CanFold = true; 6384 int NumElts = VT.getVectorNumElements(); 6385 SmallVector<int, 4> Mask(NumElts); 6386 6387 for (int i = 0; i != NumElts; ++i) { 6388 int M0 = SV0->getMaskElt(i); 6389 int M1 = SV1->getMaskElt(i); 6390 6391 // Determine if either index is pointing to a zero vector. 6392 bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts)); 6393 bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts)); 6394 6395 // If one element is zero and the otherside is undef, keep undef. 6396 // This also handles the case that both are undef. 6397 if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) { 6398 Mask[i] = -1; 6399 continue; 6400 } 6401 6402 // Make sure only one of the elements is zero. 6403 if (M0Zero == M1Zero) { 6404 CanFold = false; 6405 break; 6406 } 6407 6408 assert((M0 >= 0 || M1 >= 0) && "Undef index!"); 6409 6410 // We have a zero and non-zero element. If the non-zero came from 6411 // SV0 make the index a LHS index. If it came from SV1, make it 6412 // a RHS index. We need to mod by NumElts because we don't care 6413 // which operand it came from in the original shuffles. 6414 Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts; 6415 } 6416 6417 if (CanFold) { 6418 SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0); 6419 SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0); 6420 6421 SDValue LegalShuffle = 6422 TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, 6423 Mask, DAG); 6424 if (LegalShuffle) 6425 return LegalShuffle; 6426 } 6427 } 6428 } 6429 } 6430 6431 // fold (or c1, c2) -> c1|c2 6432 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 6433 if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1})) 6434 return C; 6435 6436 // canonicalize constant to RHS 6437 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 6438 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 6439 return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); 6440 6441 // fold (or x, 0) -> x 6442 if (isNullConstant(N1)) 6443 return N0; 6444 6445 // fold (or x, -1) -> -1 6446 if (isAllOnesConstant(N1)) 6447 return N1; 6448 6449 if (SDValue NewSel = foldBinOpIntoSelect(N)) 6450 return NewSel; 6451 6452 // fold (or x, c) -> c iff (x & ~c) == 0 6453 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) 6454 return N1; 6455 6456 if (SDValue Combined = visitORLike(N0, N1, N)) 6457 return Combined; 6458 6459 if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) 6460 return Combined; 6461 6462 // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) 6463 if (SDValue BSwap = MatchBSwapHWord(N, N0, N1)) 6464 return BSwap; 6465 if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1)) 6466 return BSwap; 6467 6468 // reassociate or 6469 if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags())) 6470 return ROR; 6471 6472 // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) 6473 // iff (c1 & c2) != 0 or c1/c2 are undef. 
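// For example, (or (and X, 0x0F), 0x03) -> (and (or X, 0x03), 0x0F); the
// OR-with-constant is exposed for further folding and the final AND keeps the
// result within c1|c2.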
6474 auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) { 6475 return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue()); 6476 }; 6477 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && 6478 ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) { 6479 if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, 6480 {N1, N0.getOperand(1)})) { 6481 SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1); 6482 AddToWorklist(IOR.getNode()); 6483 return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR); 6484 } 6485 } 6486 6487 if (SDValue Combined = visitORCommutative(DAG, N0, N1, N)) 6488 return Combined; 6489 if (SDValue Combined = visitORCommutative(DAG, N1, N0, N)) 6490 return Combined; 6491 6492 // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) 6493 if (N0.getOpcode() == N1.getOpcode()) 6494 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) 6495 return V; 6496 6497 // See if this is some rotate idiom. 6498 if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N))) 6499 return Rot; 6500 6501 if (SDValue Load = MatchLoadCombine(N)) 6502 return Load; 6503 6504 // Simplify the operands using demanded-bits information. 6505 if (SimplifyDemandedBits(SDValue(N, 0))) 6506 return SDValue(N, 0); 6507 6508 // If OR can be rewritten into ADD, try combines based on ADD. 6509 if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) && 6510 DAG.haveNoCommonBitsSet(N0, N1)) 6511 if (SDValue Combined = visitADDLike(N)) 6512 return Combined; 6513 6514 return SDValue(); 6515 } 6516 6517 static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) { 6518 if (Op.getOpcode() == ISD::AND && 6519 DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) { 6520 Mask = Op.getOperand(1); 6521 return Op.getOperand(0); 6522 } 6523 return Op; 6524 } 6525 6526 /// Match "(X shl/srl V1) & V2" where V2 may not be present. 6527 static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift, 6528 SDValue &Mask) { 6529 Op = stripConstantMask(DAG, Op, Mask); 6530 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) { 6531 Shift = Op; 6532 return true; 6533 } 6534 return false; 6535 } 6536 6537 /// Helper function for visitOR to extract the needed side of a rotate idiom 6538 /// from a shl/srl/mul/udiv. This is meant to handle cases where 6539 /// InstCombine merged some outside op with one of the shifts from 6540 /// the rotate pattern. 6541 /// \returns An empty \c SDValue if the needed shift couldn't be extracted. 6542 /// Otherwise, returns an expansion of \p ExtractFrom based on the following 6543 /// patterns: 6544 /// 6545 /// (or (add v v) (shrl v bitwidth-1)): 6546 /// expands (add v v) -> (shl v 1) 6547 /// 6548 /// (or (mul v c0) (shrl (mul v c1) c2)): 6549 /// expands (mul v c0) -> (shl (mul v c1) c3) 6550 /// 6551 /// (or (udiv v c0) (shl (udiv v c1) c2)): 6552 /// expands (udiv v c0) -> (shrl (udiv v c1) c3) 6553 /// 6554 /// (or (shl v c0) (shrl (shl v c1) c2)): 6555 /// expands (shl v c0) -> (shl (shl v c1) c3) 6556 /// 6557 /// (or (shrl v c0) (shl (shrl v c1) c2)): 6558 /// expands (shrl v c0) -> (shrl (shrl v c1) c3) 6559 /// 6560 /// Such that in all cases, c3+c2==bitwidth(op v c1). 
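/// For example, on i32, given (or (mul v, 16), (srl (mul v, 2), 29)), the
/// existing srl-by-29 half needs a matching shl by 3; since 16 == 2 << 3, the
/// mul-by-16 is rewritten as (shl (mul v, 2), 3), and 3 + 29 == 32.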
6561 static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift, 6562 SDValue ExtractFrom, SDValue &Mask, 6563 const SDLoc &DL) { 6564 assert(OppShift && ExtractFrom && "Empty SDValue"); 6565 assert( 6566 (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) && 6567 "Existing shift must be valid as a rotate half"); 6568 6569 ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask); 6570 6571 // Value and Type of the shift. 6572 SDValue OppShiftLHS = OppShift.getOperand(0); 6573 EVT ShiftedVT = OppShiftLHS.getValueType(); 6574 6575 // Amount of the existing shift. 6576 ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1)); 6577 6578 // (add v v) -> (shl v 1) 6579 // TODO: Should this be a general DAG canonicalization? 6580 if (OppShift.getOpcode() == ISD::SRL && OppShiftCst && 6581 ExtractFrom.getOpcode() == ISD::ADD && 6582 ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) && 6583 ExtractFrom.getOperand(0) == OppShiftLHS && 6584 OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1) 6585 return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS, 6586 DAG.getShiftAmountConstant(1, ShiftedVT, DL)); 6587 6588 // Preconditions: 6589 // (or (op0 v c0) (shiftl/r (op0 v c1) c2)) 6590 // 6591 // Find opcode of the needed shift to be extracted from (op0 v c0). 6592 unsigned Opcode = ISD::DELETED_NODE; 6593 bool IsMulOrDiv = false; 6594 // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift 6595 // opcode or its arithmetic (mul or udiv) variant. 6596 auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) { 6597 IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant; 6598 if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift) 6599 return false; 6600 Opcode = NeededShift; 6601 return true; 6602 }; 6603 // op0 must be either the needed shift opcode or the mul/udiv equivalent 6604 // that the needed shift can be extracted from. 6605 if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) && 6606 (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV))) 6607 return SDValue(); 6608 6609 // op0 must be the same opcode on both sides, have the same LHS argument, 6610 // and produce the same value type. 6611 if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() || 6612 OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) || 6613 ShiftedVT != ExtractFrom.getValueType()) 6614 return SDValue(); 6615 6616 // Constant mul/udiv/shift amount from the RHS of the shift's LHS op. 6617 ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1)); 6618 // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op. 6619 ConstantSDNode *ExtractFromCst = 6620 isConstOrConstSplat(ExtractFrom.getOperand(1)); 6621 // TODO: We should be able to handle non-uniform constant vectors for these values 6622 // Check that we have constant values. 6623 if (!OppShiftCst || !OppShiftCst->getAPIntValue() || 6624 !OppLHSCst || !OppLHSCst->getAPIntValue() || 6625 !ExtractFromCst || !ExtractFromCst->getAPIntValue()) 6626 return SDValue(); 6627 6628 // Compute the shift amount we need to extract to complete the rotate. 6629 const unsigned VTWidth = ShiftedVT.getScalarSizeInBits(); 6630 if (OppShiftCst->getAPIntValue().ugt(VTWidth)) 6631 return SDValue(); 6632 APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue(); 6633 // Normalize the bitwidth of the two mul/udiv/shift constant operands. 
6634 APInt ExtractFromAmt = ExtractFromCst->getAPIntValue(); 6635 APInt OppLHSAmt = OppLHSCst->getAPIntValue(); 6636 zeroExtendToMatch(ExtractFromAmt, OppLHSAmt); 6637 6638 // Now try extract the needed shift from the ExtractFrom op and see if the 6639 // result matches up with the existing shift's LHS op. 6640 if (IsMulOrDiv) { 6641 // Op to extract from is a mul or udiv by a constant. 6642 // Check: 6643 // c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0 6644 // c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0 6645 const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(), 6646 NeededShiftAmt.getZExtValue()); 6647 APInt ResultAmt; 6648 APInt Rem; 6649 APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem); 6650 if (Rem != 0 || ResultAmt != OppLHSAmt) 6651 return SDValue(); 6652 } else { 6653 // Op to extract from is a shift by a constant. 6654 // Check: 6655 // c2 - (bitwidth(op0 v c0) - c1) == c0 6656 if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc( 6657 ExtractFromAmt.getBitWidth())) 6658 return SDValue(); 6659 } 6660 6661 // Return the expanded shift op that should allow a rotate to be formed. 6662 EVT ShiftVT = OppShift.getOperand(1).getValueType(); 6663 EVT ResVT = ExtractFrom.getValueType(); 6664 SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT); 6665 return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode); 6666 } 6667 6668 // Return true if we can prove that, whenever Neg and Pos are both in the 6669 // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that 6670 // for two opposing shifts shift1 and shift2 and a value X with OpBits bits: 6671 // 6672 // (or (shift1 X, Neg), (shift2 X, Pos)) 6673 // 6674 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate 6675 // in direction shift1 by Neg. The range [0, EltSize) means that we only need 6676 // to consider shift amounts with defined behavior. 6677 // 6678 // The IsRotate flag should be set when the LHS of both shifts is the same. 6679 // Otherwise if matching a general funnel shift, it should be clear. 6680 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, 6681 SelectionDAG &DAG, bool IsRotate) { 6682 // If EltSize is a power of 2 then: 6683 // 6684 // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1) 6685 // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize). 6686 // 6687 // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check 6688 // for the stronger condition: 6689 // 6690 // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A] 6691 // 6692 // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1) 6693 // we can just replace Neg with Neg' for the rest of the function. 6694 // 6695 // In other cases we check for the even stronger condition: 6696 // 6697 // Neg == EltSize - Pos [B] 6698 // 6699 // for all Neg and Pos. Note that the (or ...) then invokes undefined 6700 // behavior if Pos == 0 (and consequently Neg == EltSize). 6701 // 6702 // We could actually use [A] whenever EltSize is a power of 2, but the 6703 // only extra cases that it would match are those uninteresting ones 6704 // where Neg and Pos are never in range at the same time. E.g. for 6705 // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) 6706 // as well as (sub 32, Pos), but: 6707 // 6708 // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) 6709 // 6710 // always invokes undefined behavior for 32-bit X. 
6711 // 6712 // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise. 6713 // 6714 // NOTE: We can only do this when matching an AND and not a general 6715 // funnel shift. 6716 unsigned MaskLoBits = 0; 6717 if (IsRotate && Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) { 6718 if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) { 6719 KnownBits Known = DAG.computeKnownBits(Neg.getOperand(0)); 6720 unsigned Bits = Log2_64(EltSize); 6721 if (NegC->getAPIntValue().getActiveBits() <= Bits && 6722 ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) { 6723 Neg = Neg.getOperand(0); 6724 MaskLoBits = Bits; 6725 } 6726 } 6727 } 6728 6729 // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. 6730 if (Neg.getOpcode() != ISD::SUB) 6731 return false; 6732 ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); 6733 if (!NegC) 6734 return false; 6735 SDValue NegOp1 = Neg.getOperand(1); 6736 6737 // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with 6738 // Pos'. The truncation is redundant for the purpose of the equality. 6739 if (MaskLoBits && Pos.getOpcode() == ISD::AND) { 6740 if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) { 6741 KnownBits Known = DAG.computeKnownBits(Pos.getOperand(0)); 6742 if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits && 6743 ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >= 6744 MaskLoBits)) 6745 Pos = Pos.getOperand(0); 6746 } 6747 } 6748 6749 // The condition we need is now: 6750 // 6751 // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask 6752 // 6753 // If NegOp1 == Pos then we need: 6754 // 6755 // EltSize & Mask == NegC & Mask 6756 // 6757 // (because "x & Mask" is a truncation and distributes through subtraction). 6758 // 6759 // We also need to account for a potential truncation of NegOp1 if the amount 6760 // has already been legalized to a shift amount type. 6761 APInt Width; 6762 if ((Pos == NegOp1) || 6763 (NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0))) 6764 Width = NegC->getAPIntValue(); 6765 6766 // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. 6767 // Then the condition we want to prove becomes: 6768 // 6769 // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask 6770 // 6771 // which, again because "x & Mask" is a truncation, becomes: 6772 // 6773 // NegC & Mask == (EltSize - PosC) & Mask 6774 // EltSize & Mask == (NegC + PosC) & Mask 6775 else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) { 6776 if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) 6777 Width = PosC->getAPIntValue() + NegC->getAPIntValue(); 6778 else 6779 return false; 6780 } else 6781 return false; 6782 6783 // Now we just need to check that EltSize & Mask == Width & Mask. 6784 if (MaskLoBits) 6785 // EltSize & Mask is 0 since Mask is EltSize - 1. 6786 return Width.getLoBits(MaskLoBits) == 0; 6787 return Width == EltSize; 6788 } 6789 6790 // A subroutine of MatchRotate used once we have found an OR of two opposite 6791 // shifts of Shifted. If Neg == <operand size> - Pos then the OR reduces 6792 // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the 6793 // former being preferred if supported. InnerPos and InnerNeg are Pos and 6794 // Neg with outer conversions stripped away. 
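// For example, if the shift amounts are (zext y) and (zext (sub 32, y)),
// InnerPos and InnerNeg are y and (sub 32, y); matchRotateSub reasons about
// those inner values, while the rotate node itself is built with the original
// (extended) Pos or Neg amount.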
6795 SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, 6796 SDValue Neg, SDValue InnerPos, 6797 SDValue InnerNeg, unsigned PosOpcode, 6798 unsigned NegOpcode, const SDLoc &DL) { 6799 // fold (or (shl x, (*ext y)), 6800 // (srl x, (*ext (sub 32, y)))) -> 6801 // (rotl x, y) or (rotr x, (sub 32, y)) 6802 // 6803 // fold (or (shl x, (*ext (sub 32, y))), 6804 // (srl x, (*ext y))) -> 6805 // (rotr x, y) or (rotl x, (sub 32, y)) 6806 EVT VT = Shifted.getValueType(); 6807 if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG, 6808 /*IsRotate*/ true)) { 6809 bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); 6810 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, 6811 HasPos ? Pos : Neg); 6812 } 6813 6814 return SDValue(); 6815 } 6816 6817 // A subroutine of MatchRotate used once we have found an OR of two opposite 6818 // shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces 6819 // to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the 6820 // former being preferred if supported. InnerPos and InnerNeg are Pos and 6821 // Neg with outer conversions stripped away. 6822 // TODO: Merge with MatchRotatePosNeg. 6823 SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, 6824 SDValue Neg, SDValue InnerPos, 6825 SDValue InnerNeg, unsigned PosOpcode, 6826 unsigned NegOpcode, const SDLoc &DL) { 6827 EVT VT = N0.getValueType(); 6828 unsigned EltBits = VT.getScalarSizeInBits(); 6829 6830 // fold (or (shl x0, (*ext y)), 6831 // (srl x1, (*ext (sub 32, y)))) -> 6832 // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y)) 6833 // 6834 // fold (or (shl x0, (*ext (sub 32, y))), 6835 // (srl x1, (*ext y))) -> 6836 // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y)) 6837 if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) { 6838 bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); 6839 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1, 6840 HasPos ? Pos : Neg); 6841 } 6842 6843 // Matching the shift+xor cases, we can't easily use the xor'd shift amount 6844 // so for now just use the PosOpcode case if its legal. 6845 // TODO: When can we use the NegOpcode case? 6846 if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) { 6847 auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) { 6848 if (Op.getOpcode() != BinOpc) 6849 return false; 6850 ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1)); 6851 return Cst && (Cst->getAPIntValue() == Imm); 6852 }; 6853 6854 // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31))) 6855 // -> (fshl x0, x1, y) 6856 if (IsBinOpImm(N1, ISD::SRL, 1) && 6857 IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) && 6858 InnerPos == InnerNeg.getOperand(0) && 6859 TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) { 6860 return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos); 6861 } 6862 6863 // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y)) 6864 // -> (fshr x0, x1, y) 6865 if (IsBinOpImm(N0, ISD::SHL, 1) && 6866 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) && 6867 InnerNeg == InnerPos.getOperand(0) && 6868 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) { 6869 return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg); 6870 } 6871 6872 // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y)) 6873 // -> (fshr x0, x1, y) 6874 // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization? 
6875 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) && 6876 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) && 6877 InnerNeg == InnerPos.getOperand(0) && 6878 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) { 6879 return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg); 6880 } 6881 } 6882 6883 return SDValue(); 6884 } 6885 6886 // MatchRotate - Handle an 'or' of two operands. If this is one of the many 6887 // idioms for rotate, and if the target supports rotation instructions, generate 6888 // a rot[lr]. This also matches funnel shift patterns, similar to rotation but 6889 // with different shifted sources. 6890 SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { 6891 // Must be a legal type. Expanded 'n promoted things won't work with rotates. 6892 EVT VT = LHS.getValueType(); 6893 if (!TLI.isTypeLegal(VT)) 6894 return SDValue(); 6895 6896 // The target must have at least one rotate/funnel flavor. 6897 bool HasROTL = hasOperation(ISD::ROTL, VT); 6898 bool HasROTR = hasOperation(ISD::ROTR, VT); 6899 bool HasFSHL = hasOperation(ISD::FSHL, VT); 6900 bool HasFSHR = hasOperation(ISD::FSHR, VT); 6901 if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR) 6902 return SDValue(); 6903 6904 // Check for truncated rotate. 6905 if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE && 6906 LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) { 6907 assert(LHS.getValueType() == RHS.getValueType()); 6908 if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { 6909 return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot); 6910 } 6911 } 6912 6913 // Match "(X shl/srl V1) & V2" where V2 may not be present. 6914 SDValue LHSShift; // The shift. 6915 SDValue LHSMask; // AND value if any. 6916 matchRotateHalf(DAG, LHS, LHSShift, LHSMask); 6917 6918 SDValue RHSShift; // The shift. 6919 SDValue RHSMask; // AND value if any. 6920 matchRotateHalf(DAG, RHS, RHSShift, RHSMask); 6921 6922 // If neither side matched a rotate half, bail 6923 if (!LHSShift && !RHSShift) 6924 return SDValue(); 6925 6926 // InstCombine may have combined a constant shl, srl, mul, or udiv with one 6927 // side of the rotate, so try to handle that here. In all cases we need to 6928 // pass the matched shift from the opposite side to compute the opcode and 6929 // needed shift amount to extract. We still want to do this if both sides 6930 // matched a rotate half because one half may be a potential overshift that 6931 // can be broken down (ie if InstCombine merged two shl or srl ops into a 6932 // single one). 6933 6934 // Have LHS side of the rotate, try to extract the needed shift from the RHS. 6935 if (LHSShift) 6936 if (SDValue NewRHSShift = 6937 extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL)) 6938 RHSShift = NewRHSShift; 6939 // Have RHS side of the rotate, try to extract the needed shift from the LHS. 6940 if (RHSShift) 6941 if (SDValue NewLHSShift = 6942 extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL)) 6943 LHSShift = NewLHSShift; 6944 6945 // If a side is still missing, nothing else we can do. 6946 if (!RHSShift || !LHSShift) 6947 return SDValue(); 6948 6949 // At this point we've matched or extracted a shift op on each side. 6950 6951 if (LHSShift.getOpcode() == RHSShift.getOpcode()) 6952 return SDValue(); // Shifts must disagree. 
6953 6954 bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); 6955 if (!IsRotate && !(HasFSHL || HasFSHR)) 6956 return SDValue(); // Requires funnel shift support. 6957 6958 // Canonicalize shl to left side in a shl/srl pair. 6959 if (RHSShift.getOpcode() == ISD::SHL) { 6960 std::swap(LHS, RHS); 6961 std::swap(LHSShift, RHSShift); 6962 std::swap(LHSMask, RHSMask); 6963 } 6964 6965 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 6966 SDValue LHSShiftArg = LHSShift.getOperand(0); 6967 SDValue LHSShiftAmt = LHSShift.getOperand(1); 6968 SDValue RHSShiftArg = RHSShift.getOperand(0); 6969 SDValue RHSShiftAmt = RHSShift.getOperand(1); 6970 6971 // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) 6972 // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) 6973 // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) 6974 // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) 6975 // iff C1+C2 == EltSizeInBits 6976 auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, 6977 ConstantSDNode *RHS) { 6978 return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; 6979 }; 6980 if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { 6981 SDValue Res; 6982 if (IsRotate && (HasROTL || HasROTR)) 6983 Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, 6984 HasROTL ? LHSShiftAmt : RHSShiftAmt); 6985 else 6986 Res = DAG.getNode(HasFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg, 6987 RHSShiftArg, HasFSHL ? LHSShiftAmt : RHSShiftAmt); 6988 6989 // If there is an AND of either shifted operand, apply it to the result. 6990 if (LHSMask.getNode() || RHSMask.getNode()) { 6991 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); 6992 SDValue Mask = AllOnes; 6993 6994 if (LHSMask.getNode()) { 6995 SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); 6996 Mask = DAG.getNode(ISD::AND, DL, VT, Mask, 6997 DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); 6998 } 6999 if (RHSMask.getNode()) { 7000 SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); 7001 Mask = DAG.getNode(ISD::AND, DL, VT, Mask, 7002 DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); 7003 } 7004 7005 Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask); 7006 } 7007 7008 return Res; 7009 } 7010 7011 // If there is a mask here, and we have a variable shift, we can't be sure 7012 // that we're masking out the right stuff. 7013 if (LHSMask.getNode() || RHSMask.getNode()) 7014 return SDValue(); 7015 7016 // If the shift amount is sign/zext/any-extended just peel it off. 
7017 SDValue LExtOp0 = LHSShiftAmt; 7018 SDValue RExtOp0 = RHSShiftAmt; 7019 if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || 7020 LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || 7021 LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || 7022 LHSShiftAmt.getOpcode() == ISD::TRUNCATE) && 7023 (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || 7024 RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || 7025 RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || 7026 RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) { 7027 LExtOp0 = LHSShiftAmt.getOperand(0); 7028 RExtOp0 = RHSShiftAmt.getOperand(0); 7029 } 7030 7031 if (IsRotate && (HasROTL || HasROTR)) { 7032 SDValue TryL = 7033 MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, 7034 RExtOp0, ISD::ROTL, ISD::ROTR, DL); 7035 if (TryL) 7036 return TryL; 7037 7038 SDValue TryR = 7039 MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, 7040 LExtOp0, ISD::ROTR, ISD::ROTL, DL); 7041 if (TryR) 7042 return TryR; 7043 } 7044 7045 SDValue TryL = 7046 MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt, 7047 LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL); 7048 if (TryL) 7049 return TryL; 7050 7051 SDValue TryR = 7052 MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt, 7053 RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL); 7054 if (TryR) 7055 return TryR; 7056 7057 return SDValue(); 7058 } 7059 7060 namespace { 7061 7062 /// Represents known origin of an individual byte in load combine pattern. The 7063 /// value of the byte is either constant zero or comes from memory. 7064 struct ByteProvider { 7065 // For constant zero providers Load is set to nullptr. For memory providers 7066 // Load represents the node which loads the byte from memory. 7067 // ByteOffset is the offset of the byte in the value produced by the load. 7068 LoadSDNode *Load = nullptr; 7069 unsigned ByteOffset = 0; 7070 7071 ByteProvider() = default; 7072 7073 static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { 7074 return ByteProvider(Load, ByteOffset); 7075 } 7076 7077 static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } 7078 7079 bool isConstantZero() const { return !Load; } 7080 bool isMemory() const { return Load; } 7081 7082 bool operator==(const ByteProvider &Other) const { 7083 return Other.Load == Load && Other.ByteOffset == ByteOffset; 7084 } 7085 7086 private: 7087 ByteProvider(LoadSDNode *Load, unsigned ByteOffset) 7088 : Load(Load), ByteOffset(ByteOffset) {} 7089 }; 7090 7091 } // end anonymous namespace 7092 7093 /// Recursively traverses the expression calculating the origin of the requested 7094 /// byte of the given value. Returns None if the provider can't be calculated. 7095 /// 7096 /// For all the values except the root of the expression verifies that the value 7097 /// has exactly one use and if it's not true return None. This way if the origin 7098 /// of the byte is returned it's guaranteed that the values which contribute to 7099 /// the byte are not used outside of this expression. 7100 /// 7101 /// Because the parts of the expression are not allowed to have more than one 7102 /// use this function iterates over trees, not DAGs. So it never visits the same 7103 /// node more than once. 
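/// For example, for (or (zero_extend (load i8 p)), (shl (zero_extend (load i8 q)), 8)),
/// requesting Index 1 finds a constant-zero provider on the first operand and,
/// after peeling the one-byte shift off the second, reports the byte as coming
/// from the load of q at ByteOffset 0.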
7104 static const Optional<ByteProvider> 7105 calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, 7106 bool Root = false) { 7107 // Typical i64 by i8 pattern requires recursion up to 8 calls depth 7108 if (Depth == 10) 7109 return None; 7110 7111 if (!Root && !Op.hasOneUse()) 7112 return None; 7113 7114 assert(Op.getValueType().isScalarInteger() && "can't handle other types"); 7115 unsigned BitWidth = Op.getValueSizeInBits(); 7116 if (BitWidth % 8 != 0) 7117 return None; 7118 unsigned ByteWidth = BitWidth / 8; 7119 assert(Index < ByteWidth && "invalid index requested"); 7120 (void) ByteWidth; 7121 7122 switch (Op.getOpcode()) { 7123 case ISD::OR: { 7124 auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); 7125 if (!LHS) 7126 return None; 7127 auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); 7128 if (!RHS) 7129 return None; 7130 7131 if (LHS->isConstantZero()) 7132 return RHS; 7133 if (RHS->isConstantZero()) 7134 return LHS; 7135 return None; 7136 } 7137 case ISD::SHL: { 7138 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 7139 if (!ShiftOp) 7140 return None; 7141 7142 uint64_t BitShift = ShiftOp->getZExtValue(); 7143 if (BitShift % 8 != 0) 7144 return None; 7145 uint64_t ByteShift = BitShift / 8; 7146 7147 return Index < ByteShift 7148 ? ByteProvider::getConstantZero() 7149 : calculateByteProvider(Op->getOperand(0), Index - ByteShift, 7150 Depth + 1); 7151 } 7152 case ISD::ANY_EXTEND: 7153 case ISD::SIGN_EXTEND: 7154 case ISD::ZERO_EXTEND: { 7155 SDValue NarrowOp = Op->getOperand(0); 7156 unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); 7157 if (NarrowBitWidth % 8 != 0) 7158 return None; 7159 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 7160 7161 if (Index >= NarrowByteWidth) 7162 return Op.getOpcode() == ISD::ZERO_EXTEND 7163 ? Optional<ByteProvider>(ByteProvider::getConstantZero()) 7164 : None; 7165 return calculateByteProvider(NarrowOp, Index, Depth + 1); 7166 } 7167 case ISD::BSWAP: 7168 return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, 7169 Depth + 1); 7170 case ISD::LOAD: { 7171 auto L = cast<LoadSDNode>(Op.getNode()); 7172 if (!L->isSimple() || L->isIndexed()) 7173 return None; 7174 7175 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); 7176 if (NarrowBitWidth % 8 != 0) 7177 return None; 7178 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 7179 7180 if (Index >= NarrowByteWidth) 7181 return L->getExtensionType() == ISD::ZEXTLOAD 7182 ? Optional<ByteProvider>(ByteProvider::getConstantZero()) 7183 : None; 7184 return ByteProvider::getMemory(L, Index); 7185 } 7186 } 7187 7188 return None; 7189 } 7190 7191 static unsigned littleEndianByteAt(unsigned BW, unsigned i) { 7192 return i; 7193 } 7194 7195 static unsigned bigEndianByteAt(unsigned BW, unsigned i) { 7196 return BW - i - 1; 7197 } 7198 7199 // Check if the bytes offsets we are looking at match with either big or 7200 // little endian value loaded. Return true for big endian, false for little 7201 // endian, and None if match failed. 7202 static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets, 7203 int64_t FirstOffset) { 7204 // The endian can be decided only when it is 2 bytes at least. 
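// For example, offsets {0, 1, 2, 3} relative to FirstOffset match the
// little-endian layout and {3, 2, 1, 0} the big-endian one; any mixed ordering
// fails both checks below and yields None.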
7205 unsigned Width = ByteOffsets.size();
7206 if (Width < 2)
7207 return None;
7208
7209 bool BigEndian = true, LittleEndian = true;
7210 for (unsigned i = 0; i < Width; i++) {
7211 int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
7212 LittleEndian &= CurrentByteOffset == littleEndianByteAt(Width, i);
7213 BigEndian &= CurrentByteOffset == bigEndianByteAt(Width, i);
7214 if (!BigEndian && !LittleEndian)
7215 return None;
7216 }
7217
7218 assert((BigEndian != LittleEndian) && "It should be either big endian or "
7219 "little endian");
7220 return BigEndian;
7221 }
7222
7223 static SDValue stripTruncAndExt(SDValue Value) {
7224 switch (Value.getOpcode()) {
7225 case ISD::TRUNCATE:
7226 case ISD::ZERO_EXTEND:
7227 case ISD::SIGN_EXTEND:
7228 case ISD::ANY_EXTEND:
7229 return stripTruncAndExt(Value.getOperand(0));
7230 }
7231 return Value;
7232 }
7233
7234 /// Match a pattern where a wide type scalar value is stored by several narrow
7235 /// stores. Fold it into a single store or a BSWAP and a store if the target
7236 /// supports it.
7237 ///
7238 /// Assuming little endian target:
7239 /// i8 *p = ...
7240 /// i32 val = ...
7241 /// p[0] = (val >> 0) & 0xFF;
7242 /// p[1] = (val >> 8) & 0xFF;
7243 /// p[2] = (val >> 16) & 0xFF;
7244 /// p[3] = (val >> 24) & 0xFF;
7245 /// =>
7246 /// *((i32)p) = val;
7247 ///
7248 /// i8 *p = ...
7249 /// i32 val = ...
7250 /// p[0] = (val >> 24) & 0xFF;
7251 /// p[1] = (val >> 16) & 0xFF;
7252 /// p[2] = (val >> 8) & 0xFF;
7253 /// p[3] = (val >> 0) & 0xFF;
7254 /// =>
7255 /// *((i32)p) = BSWAP(val);
7256 SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
7257 // The matching looks for "store (trunc x)" patterns that appear early but are
7258 // likely to be replaced by truncating store nodes during combining.
7259 // TODO: If there is evidence that running this later would help, this
7260 // limitation could be removed. Legality checks may need to be added
7261 // for the created store and optional bswap/rotate.
7262 if (LegalOperations)
7263 return SDValue();
7264
7265 // Collect all the stores in the chain.
7266 SDValue Chain;
7267 SmallVector<StoreSDNode *, 8> Stores;
7268 for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
7269 // TODO: Allow unordered atomics when wider type is legal (see D66309)
7270 EVT MemVT = Store->getMemoryVT();
7271 if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
7272 !Store->isSimple() || Store->isIndexed())
7273 return SDValue();
7274 Stores.push_back(Store);
7275 Chain = Store->getChain();
7276 }
7277 // There is no reason to continue if we do not have at least a pair of stores.
7278 if (Stores.size() < 2)
7279 return SDValue();
7280
7281 // Handle simple types only.
7282 LLVMContext &Context = *DAG.getContext();
7283 unsigned NumStores = Stores.size();
7284 unsigned NarrowNumBits = N->getMemoryVT().getScalarSizeInBits();
7285 unsigned WideNumBits = NumStores * NarrowNumBits;
7286 EVT WideVT = EVT::getIntegerVT(Context, WideNumBits);
7287 if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64)
7288 return SDValue();
7289
7290 // Check if all bytes of the source value that we are looking at are stored
7291 // to the same base address. Collect offsets from Base address into OffsetMap.
7292 SDValue SourceValue; 7293 SmallVector<int64_t, 8> OffsetMap(NumStores, INT64_MAX); 7294 int64_t FirstOffset = INT64_MAX; 7295 StoreSDNode *FirstStore = nullptr; 7296 Optional<BaseIndexOffset> Base; 7297 for (auto Store : Stores) { 7298 // All the stores store different parts of the CombinedValue. A truncate is 7299 // required to get the partial value. 7300 SDValue Trunc = Store->getValue(); 7301 if (Trunc.getOpcode() != ISD::TRUNCATE) 7302 return SDValue(); 7303 // Other than the first/last part, a shift operation is required to get the 7304 // offset. 7305 int64_t Offset = 0; 7306 SDValue WideVal = Trunc.getOperand(0); 7307 if ((WideVal.getOpcode() == ISD::SRL || WideVal.getOpcode() == ISD::SRA) && 7308 isa<ConstantSDNode>(WideVal.getOperand(1))) { 7309 // The shift amount must be a constant multiple of the narrow type. 7310 // It is translated to the offset address in the wide source value "y". 7311 // 7312 // x = srl y, ShiftAmtC 7313 // i8 z = trunc x 7314 // store z, ... 7315 uint64_t ShiftAmtC = WideVal.getConstantOperandVal(1); 7316 if (ShiftAmtC % NarrowNumBits != 0) 7317 return SDValue(); 7318 7319 Offset = ShiftAmtC / NarrowNumBits; 7320 WideVal = WideVal.getOperand(0); 7321 } 7322 7323 // Stores must share the same source value with different offsets. 7324 // Truncate and extends should be stripped to get the single source value. 7325 if (!SourceValue) 7326 SourceValue = WideVal; 7327 else if (stripTruncAndExt(SourceValue) != stripTruncAndExt(WideVal)) 7328 return SDValue(); 7329 else if (SourceValue.getValueType() != WideVT) { 7330 if (WideVal.getValueType() == WideVT || 7331 WideVal.getScalarValueSizeInBits() > 7332 SourceValue.getScalarValueSizeInBits()) 7333 SourceValue = WideVal; 7334 // Give up if the source value type is smaller than the store size. 7335 if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits()) 7336 return SDValue(); 7337 } 7338 7339 // Stores must share the same base address. 7340 BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG); 7341 int64_t ByteOffsetFromBase = 0; 7342 if (!Base) 7343 Base = Ptr; 7344 else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) 7345 return SDValue(); 7346 7347 // Remember the first store. 7348 if (ByteOffsetFromBase < FirstOffset) { 7349 FirstStore = Store; 7350 FirstOffset = ByteOffsetFromBase; 7351 } 7352 // Map the offset in the store and the offset in the combined value, and 7353 // early return if it has been set before. 7354 if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX) 7355 return SDValue(); 7356 OffsetMap[Offset] = ByteOffsetFromBase; 7357 } 7358 7359 assert(FirstOffset != INT64_MAX && "First byte offset must be set"); 7360 assert(FirstStore && "First store must be set"); 7361 7362 // Check that a store of the wide type is both allowed and fast on the target 7363 const DataLayout &Layout = DAG.getDataLayout(); 7364 bool Fast = false; 7365 bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT, 7366 *FirstStore->getMemOperand(), &Fast); 7367 if (!Allowed || !Fast) 7368 return SDValue(); 7369 7370 // Check if the pieces of the value are going to the expected places in memory 7371 // to merge the stores. 7372 auto checkOffsets = [&](bool MatchLittleEndian) { 7373 if (MatchLittleEndian) { 7374 for (unsigned i = 0; i != NumStores; ++i) 7375 if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset) 7376 return false; 7377 } else { // MatchBigEndian by reversing loop counter. 
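// For example, with four byte-sized pieces a big-endian match requires
// OffsetMap == {FirstOffset + 3, FirstOffset + 2, FirstOffset + 1, FirstOffset}.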
7378 for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j) 7379 if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset) 7380 return false; 7381 } 7382 return true; 7383 }; 7384 7385 // Check if the offsets line up for the native data layout of this target. 7386 bool NeedBswap = false; 7387 bool NeedRotate = false; 7388 if (!checkOffsets(Layout.isLittleEndian())) { 7389 // Special-case: check if byte offsets line up for the opposite endian. 7390 if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian())) 7391 NeedBswap = true; 7392 else if (NumStores == 2 && checkOffsets(Layout.isBigEndian())) 7393 NeedRotate = true; 7394 else 7395 return SDValue(); 7396 } 7397 7398 SDLoc DL(N); 7399 if (WideVT != SourceValue.getValueType()) { 7400 assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits && 7401 "Unexpected store value to merge"); 7402 SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue); 7403 } 7404 7405 // Before legalize we can introduce illegal bswaps/rotates which will be later 7406 // converted to an explicit bswap sequence. This way we end up with a single 7407 // store and byte shuffling instead of several stores and byte shuffling. 7408 if (NeedBswap) { 7409 SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue); 7410 } else if (NeedRotate) { 7411 assert(WideNumBits % 2 == 0 && "Unexpected type for rotate"); 7412 SDValue RotAmt = DAG.getConstant(WideNumBits / 2, DL, WideVT); 7413 SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt); 7414 } 7415 7416 SDValue NewStore = 7417 DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(), 7418 FirstStore->getPointerInfo(), FirstStore->getAlign()); 7419 7420 // Rely on other DAG combine rules to remove the other individual stores. 7421 DAG.ReplaceAllUsesWith(N, NewStore.getNode()); 7422 return NewStore; 7423 } 7424 7425 /// Match a pattern where a wide type scalar value is loaded by several narrow 7426 /// loads and combined by shifts and ors. Fold it into a single load or a load 7427 /// and a BSWAP if the targets supports it. 7428 /// 7429 /// Assuming little endian target: 7430 /// i8 *a = ... 7431 /// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) 7432 /// => 7433 /// i32 val = *((i32)a) 7434 /// 7435 /// i8 *a = ... 7436 /// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] 7437 /// => 7438 /// i32 val = BSWAP(*((i32)a)) 7439 /// 7440 /// TODO: This rule matches complex patterns with OR node roots and doesn't 7441 /// interact well with the worklist mechanism. When a part of the pattern is 7442 /// updated (e.g. one of the loads) its direct users are put into the worklist, 7443 /// but the root node of the pattern which triggers the load combine is not 7444 /// necessarily a direct user of the changed node. For example, once the address 7445 /// of t28 load is reassociated load combine won't be triggered: 7446 /// t25: i32 = add t4, Constant:i32<2> 7447 /// t26: i64 = sign_extend t25 7448 /// t27: i64 = add t2, t26 7449 /// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64 7450 /// t29: i32 = zero_extend t28 7451 /// t32: i32 = shl t29, Constant:i8<8> 7452 /// t33: i32 = or t23, t32 7453 /// As a possible fix visitLoad can check if the load can be a part of a load 7454 /// combine pattern and add corresponding OR roots to the worklist. 
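/// Note that a partially-zero pattern is also accepted. Assuming a little
/// endian target,
///  i32 val = a[0] | (a[1] << 8)
/// only loads two bytes, so it is folded to a zero-extending i16 load of *a.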
7455 SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { 7456 assert(N->getOpcode() == ISD::OR && 7457 "Can only match load combining against OR nodes"); 7458 7459 // Handles simple types only 7460 EVT VT = N->getValueType(0); 7461 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 7462 return SDValue(); 7463 unsigned ByteWidth = VT.getSizeInBits() / 8; 7464 7465 bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); 7466 auto MemoryByteOffset = [&] (ByteProvider P) { 7467 assert(P.isMemory() && "Must be a memory byte provider"); 7468 unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits(); 7469 assert(LoadBitWidth % 8 == 0 && 7470 "can only analyze providers for individual bytes not bit"); 7471 unsigned LoadByteWidth = LoadBitWidth / 8; 7472 return IsBigEndianTarget 7473 ? bigEndianByteAt(LoadByteWidth, P.ByteOffset) 7474 : littleEndianByteAt(LoadByteWidth, P.ByteOffset); 7475 }; 7476 7477 Optional<BaseIndexOffset> Base; 7478 SDValue Chain; 7479 7480 SmallPtrSet<LoadSDNode *, 8> Loads; 7481 Optional<ByteProvider> FirstByteProvider; 7482 int64_t FirstOffset = INT64_MAX; 7483 7484 // Check if all the bytes of the OR we are looking at are loaded from the same 7485 // base address. Collect bytes offsets from Base address in ByteOffsets. 7486 SmallVector<int64_t, 8> ByteOffsets(ByteWidth); 7487 unsigned ZeroExtendedBytes = 0; 7488 for (int i = ByteWidth - 1; i >= 0; --i) { 7489 auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); 7490 if (!P) 7491 return SDValue(); 7492 7493 if (P->isConstantZero()) { 7494 // It's OK for the N most significant bytes to be 0, we can just 7495 // zero-extend the load. 7496 if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i))) 7497 return SDValue(); 7498 continue; 7499 } 7500 assert(P->isMemory() && "provenance should either be memory or zero"); 7501 7502 LoadSDNode *L = P->Load; 7503 assert(L->hasNUsesOfValue(1, 0) && L->isSimple() && 7504 !L->isIndexed() && 7505 "Must be enforced by calculateByteProvider"); 7506 assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); 7507 7508 // All loads must share the same chain 7509 SDValue LChain = L->getChain(); 7510 if (!Chain) 7511 Chain = LChain; 7512 else if (Chain != LChain) 7513 return SDValue(); 7514 7515 // Loads must share the same base address 7516 BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG); 7517 int64_t ByteOffsetFromBase = 0; 7518 if (!Base) 7519 Base = Ptr; 7520 else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) 7521 return SDValue(); 7522 7523 // Calculate the offset of the current byte from the base address 7524 ByteOffsetFromBase += MemoryByteOffset(*P); 7525 ByteOffsets[i] = ByteOffsetFromBase; 7526 7527 // Remember the first byte load 7528 if (ByteOffsetFromBase < FirstOffset) { 7529 FirstByteProvider = P; 7530 FirstOffset = ByteOffsetFromBase; 7531 } 7532 7533 Loads.insert(L); 7534 } 7535 assert(!Loads.empty() && "All the bytes of the value must be loaded from " 7536 "memory, so there must be at least one load which produces the value"); 7537 assert(Base && "Base address of the accessed memory location must be set"); 7538 assert(FirstOffset != INT64_MAX && "First byte offset must be set"); 7539 7540 bool NeedsZext = ZeroExtendedBytes > 0; 7541 7542 EVT MemVT = 7543 EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8); 7544 7545 if (!MemVT.isSimple()) 7546 return SDValue(); 7547 7548 // Before legalize we can introduce too wide illegal loads which will be later 7549 // split into legal sized loads. 
This enables us to combine i64 load by i8 7550 // patterns to a couple of i32 loads on 32 bit targets. 7551 if (LegalOperations && 7552 !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, 7553 MemVT)) 7554 return SDValue(); 7555 7556 // Check if the bytes of the OR we are looking at match with either big or 7557 // little endian value load 7558 Optional<bool> IsBigEndian = isBigEndian( 7559 makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset); 7560 if (!IsBigEndian.hasValue()) 7561 return SDValue(); 7562 7563 assert(FirstByteProvider && "must be set"); 7564 7565 // Ensure that the first byte is loaded from zero offset of the first load. 7566 // So the combined value can be loaded from the first load address. 7567 if (MemoryByteOffset(*FirstByteProvider) != 0) 7568 return SDValue(); 7569 LoadSDNode *FirstLoad = FirstByteProvider->Load; 7570 7571 // The node we are looking at matches with the pattern, check if we can 7572 // replace it with a single (possibly zero-extended) load and bswap + shift if 7573 // needed. 7574 7575 // If the load needs byte swap check if the target supports it 7576 bool NeedsBswap = IsBigEndianTarget != *IsBigEndian; 7577 7578 // Before legalize we can introduce illegal bswaps which will be later 7579 // converted to an explicit bswap sequence. This way we end up with a single 7580 // load and byte shuffling instead of several loads and byte shuffling. 7581 // We do not introduce illegal bswaps when zero-extending as this tends to 7582 // introduce too many arithmetic instructions. 7583 if (NeedsBswap && (LegalOperations || NeedsZext) && 7584 !TLI.isOperationLegal(ISD::BSWAP, VT)) 7585 return SDValue(); 7586 7587 // If we need to bswap and zero extend, we have to insert a shift. Check that 7588 // it is legal. 7589 if (NeedsBswap && NeedsZext && LegalOperations && 7590 !TLI.isOperationLegal(ISD::SHL, VT)) 7591 return SDValue(); 7592 7593 // Check that a load of the wide type is both allowed and fast on the target 7594 bool Fast = false; 7595 bool Allowed = 7596 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, 7597 *FirstLoad->getMemOperand(), &Fast); 7598 if (!Allowed || !Fast) 7599 return SDValue(); 7600 7601 SDValue NewLoad = 7602 DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT, 7603 Chain, FirstLoad->getBasePtr(), 7604 FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign()); 7605 7606 // Transfer chain users from old loads to the new load. 7607 for (LoadSDNode *L : Loads) 7608 DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); 7609 7610 if (!NeedsBswap) 7611 return NewLoad; 7612 7613 SDValue ShiftedLoad = 7614 NeedsZext 7615 ? 
DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad, 7616 DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT, 7617 SDLoc(N), LegalOperations)) 7618 : NewLoad; 7619 return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad); 7620 } 7621 7622 // If the target has andn, bsl, or a similar bit-select instruction, 7623 // we want to unfold masked merge, with canonical pattern of: 7624 // | A | |B| 7625 // ((x ^ y) & m) ^ y 7626 // | D | 7627 // Into: 7628 // (x & m) | (y & ~m) 7629 // If y is a constant, and the 'andn' does not work with immediates, 7630 // we unfold into a different pattern: 7631 // ~(~x & m) & (m | y) 7632 // NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at 7633 // the very least that breaks andnpd / andnps patterns, and because those 7634 // patterns are simplified in IR and shouldn't be created in the DAG 7635 SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) { 7636 assert(N->getOpcode() == ISD::XOR); 7637 7638 // Don't touch 'not' (i.e. where y = -1). 7639 if (isAllOnesOrAllOnesSplat(N->getOperand(1))) 7640 return SDValue(); 7641 7642 EVT VT = N->getValueType(0); 7643 7644 // There are 3 commutable operators in the pattern, 7645 // so we have to deal with 8 possible variants of the basic pattern. 7646 SDValue X, Y, M; 7647 auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) { 7648 if (And.getOpcode() != ISD::AND || !And.hasOneUse()) 7649 return false; 7650 SDValue Xor = And.getOperand(XorIdx); 7651 if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse()) 7652 return false; 7653 SDValue Xor0 = Xor.getOperand(0); 7654 SDValue Xor1 = Xor.getOperand(1); 7655 // Don't touch 'not' (i.e. where y = -1). 7656 if (isAllOnesOrAllOnesSplat(Xor1)) 7657 return false; 7658 if (Other == Xor0) 7659 std::swap(Xor0, Xor1); 7660 if (Other != Xor1) 7661 return false; 7662 X = Xor0; 7663 Y = Xor1; 7664 M = And.getOperand(XorIdx ? 0 : 1); 7665 return true; 7666 }; 7667 7668 SDValue N0 = N->getOperand(0); 7669 SDValue N1 = N->getOperand(1); 7670 if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) && 7671 !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0)) 7672 return SDValue(); 7673 7674 // Don't do anything if the mask is constant. This should not be reachable. 7675 // InstCombine should have already unfolded this pattern, and DAGCombiner 7676 // probably shouldn't produce it, too. 7677 if (isa<ConstantSDNode>(M.getNode())) 7678 return SDValue(); 7679 7680 // We can transform if the target has AndNot 7681 if (!TLI.hasAndNot(M)) 7682 return SDValue(); 7683 7684 SDLoc DL(N); 7685 7686 // If Y is a constant, check that 'andn' works with immediates. 7687 if (!TLI.hasAndNot(Y)) { 7688 assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable."); 7689 // If not, we need to do a bit more work to make sure andn is still used. 
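  // Illustrative sketch of what that fallback produces (same X, Y, M as above):
  //   ((X ^ Y) & M) ^ Y  -->  ~(~X & M) & (M | Y)
  // e.g. for i8 X = 0xAA, Y = 0x0F (an immediate), M = 0xF0, both forms evaluate
  // to 0xAF, but the rewritten form keeps the 'andn' on two variable operands.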
7690 SDValue NotX = DAG.getNOT(DL, X, VT); 7691 SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M); 7692 SDValue NotLHS = DAG.getNOT(DL, LHS, VT); 7693 SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y); 7694 return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS); 7695 } 7696 7697 SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M); 7698 SDValue NotM = DAG.getNOT(DL, M, VT); 7699 SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM); 7700 7701 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); 7702 } 7703 7704 SDValue DAGCombiner::visitXOR(SDNode *N) { 7705 SDValue N0 = N->getOperand(0); 7706 SDValue N1 = N->getOperand(1); 7707 EVT VT = N0.getValueType(); 7708 7709 // fold vector ops 7710 if (VT.isVector()) { 7711 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 7712 return FoldedVOp; 7713 7714 // fold (xor x, 0) -> x, vector edition 7715 if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) 7716 return N1; 7717 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) 7718 return N0; 7719 } 7720 7721 // fold (xor undef, undef) -> 0. This is a common idiom (misuse). 7722 SDLoc DL(N); 7723 if (N0.isUndef() && N1.isUndef()) 7724 return DAG.getConstant(0, DL, VT); 7725 7726 // fold (xor x, undef) -> undef 7727 if (N0.isUndef()) 7728 return N0; 7729 if (N1.isUndef()) 7730 return N1; 7731 7732 // fold (xor c1, c2) -> c1^c2 7733 if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1})) 7734 return C; 7735 7736 // canonicalize constant to RHS 7737 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 7738 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 7739 return DAG.getNode(ISD::XOR, DL, VT, N1, N0); 7740 7741 // fold (xor x, 0) -> x 7742 if (isNullConstant(N1)) 7743 return N0; 7744 7745 if (SDValue NewSel = foldBinOpIntoSelect(N)) 7746 return NewSel; 7747 7748 // reassociate xor 7749 if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags())) 7750 return RXOR; 7751 7752 // fold !(x cc y) -> (x !cc y) 7753 unsigned N0Opcode = N0.getOpcode(); 7754 SDValue LHS, RHS, CC; 7755 if (TLI.isConstTrueVal(N1.getNode()) && 7756 isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/true)) { 7757 ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), 7758 LHS.getValueType()); 7759 if (!LegalOperations || 7760 TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) { 7761 switch (N0Opcode) { 7762 default: 7763 llvm_unreachable("Unhandled SetCC Equivalent!"); 7764 case ISD::SETCC: 7765 return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC); 7766 case ISD::SELECT_CC: 7767 return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2), 7768 N0.getOperand(3), NotCC); 7769 case ISD::STRICT_FSETCC: 7770 case ISD::STRICT_FSETCCS: { 7771 if (N0.hasOneUse()) { 7772 // FIXME Can we handle multiple uses? Could we token factor the chain 7773 // results from the new/old setcc? 7774 SDValue SetCC = 7775 DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, 7776 N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS); 7777 CombineTo(N, SetCC); 7778 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1)); 7779 recursivelyDeleteUnusedNodes(N0.getNode()); 7780 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
7781 } 7782 break; 7783 } 7784 } 7785 } 7786 } 7787 7788 // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y))) 7789 if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() && 7790 isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){ 7791 SDValue V = N0.getOperand(0); 7792 SDLoc DL0(N0); 7793 V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V, 7794 DAG.getConstant(1, DL0, V.getValueType())); 7795 AddToWorklist(V.getNode()); 7796 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V); 7797 } 7798 7799 // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc 7800 if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() && 7801 (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) { 7802 SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1); 7803 if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) { 7804 unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND; 7805 N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00 7806 N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01 7807 AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode()); 7808 return DAG.getNode(NewOpcode, DL, VT, N00, N01); 7809 } 7810 } 7811 // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants 7812 if (isAllOnesConstant(N1) && N0.hasOneUse() && 7813 (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) { 7814 SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1); 7815 if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) { 7816 unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND; 7817 N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00 7818 N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01 7819 AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode()); 7820 return DAG.getNode(NewOpcode, DL, VT, N00, N01); 7821 } 7822 } 7823 7824 // fold (not (neg x)) -> (add X, -1) 7825 // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if 7826 // Y is a constant or the subtract has a single use. 7827 if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB && 7828 isNullConstant(N0.getOperand(0))) { 7829 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), 7830 DAG.getAllOnesConstant(DL, VT)); 7831 } 7832 7833 // fold (not (add X, -1)) -> (neg X) 7834 if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD && 7835 isAllOnesOrAllOnesSplat(N0.getOperand(1))) { 7836 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 7837 N0.getOperand(0)); 7838 } 7839 7840 // fold (xor (and x, y), y) -> (and (not x), y) 7841 if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) { 7842 SDValue X = N0.getOperand(0); 7843 SDValue NotX = DAG.getNOT(SDLoc(X), X, VT); 7844 AddToWorklist(NotX.getNode()); 7845 return DAG.getNode(ISD::AND, DL, VT, NotX, N1); 7846 } 7847 7848 if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) { 7849 ConstantSDNode *XorC = isConstOrConstSplat(N1); 7850 ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1)); 7851 unsigned BitWidth = VT.getScalarSizeInBits(); 7852 if (XorC && ShiftC) { 7853 // Don't crash on an oversized shift. We can not guarantee that a bogus 7854 // shift has been simplified to undef. 7855 uint64_t ShiftAmt = ShiftC->getLimitedValue(); 7856 if (ShiftAmt < BitWidth) { 7857 APInt Ones = APInt::getAllOnesValue(BitWidth); 7858 Ones = N0Opcode == ISD::SHL ? 
Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt); 7859 if (XorC->getAPIntValue() == Ones) { 7860 // If the xor constant is a shifted -1, do a 'not' before the shift: 7861 // xor (X << ShiftC), XorC --> (not X) << ShiftC 7862 // xor (X >> ShiftC), XorC --> (not X) >> ShiftC 7863 SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT); 7864 return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1)); 7865 } 7866 } 7867 } 7868 } 7869 7870 // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) 7871 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { 7872 SDValue A = N0Opcode == ISD::ADD ? N0 : N1; 7873 SDValue S = N0Opcode == ISD::SRA ? N0 : N1; 7874 if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) { 7875 SDValue A0 = A.getOperand(0), A1 = A.getOperand(1); 7876 SDValue S0 = S.getOperand(0); 7877 if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) 7878 if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1))) 7879 if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1)) 7880 return DAG.getNode(ISD::ABS, DL, VT, S0); 7881 } 7882 } 7883 7884 // fold (xor x, x) -> 0 7885 if (N0 == N1) 7886 return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); 7887 7888 // fold (xor (shl 1, x), -1) -> (rotl ~1, x) 7889 // Here is a concrete example of this equivalence: 7890 // i16 x == 14 7891 // i16 shl == 1 << 14 == 16384 == 0b0100000000000000 7892 // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111 7893 // 7894 // => 7895 // 7896 // i16 ~1 == 0b1111111111111110 7897 // i16 rol(~1, 14) == 0b1011111111111111 7898 // 7899 // Some additional tips to help conceptualize this transform: 7900 // - Try to see the operation as placing a single zero in a value of all ones. 7901 // - There exists no value for x which would allow the result to contain zero. 7902 // - Values of x larger than the bitwidth are undefined and do not require a 7903 // consistent result. 7904 // - Pushing the zero left requires shifting one bits in from the right. 7905 // A rotate left of ~1 is a nice way of achieving the desired result. 7906 if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL && 7907 isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) { 7908 return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT), 7909 N0.getOperand(1)); 7910 } 7911 7912 // Simplify: xor (op x...), (op y...) -> (op (xor x, y)) 7913 if (N0Opcode == N1.getOpcode()) 7914 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) 7915 return V; 7916 7917 // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable 7918 if (SDValue MM = unfoldMaskedMerge(N)) 7919 return MM; 7920 7921 // Simplify the expression using non-local knowledge. 7922 if (SimplifyDemandedBits(SDValue(N, 0))) 7923 return SDValue(N, 0); 7924 7925 if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) 7926 return Combined; 7927 7928 return SDValue(); 7929 } 7930 7931 /// If we have a shift-by-constant of a bitwise logic op that itself has a 7932 /// shift-by-constant operand with identical opcode, we may be able to convert 7933 /// that into 2 independent shifts followed by the logic op. This is a 7934 /// throughput improvement. 7935 static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) { 7936 // Match a one-use bitwise logic op. 
7937 SDValue LogicOp = Shift->getOperand(0); 7938 if (!LogicOp.hasOneUse()) 7939 return SDValue(); 7940 7941 unsigned LogicOpcode = LogicOp.getOpcode(); 7942 if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR && 7943 LogicOpcode != ISD::XOR) 7944 return SDValue(); 7945 7946 // Find a matching one-use shift by constant. 7947 unsigned ShiftOpcode = Shift->getOpcode(); 7948 SDValue C1 = Shift->getOperand(1); 7949 ConstantSDNode *C1Node = isConstOrConstSplat(C1); 7950 assert(C1Node && "Expected a shift with constant operand"); 7951 const APInt &C1Val = C1Node->getAPIntValue(); 7952 auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp, 7953 const APInt *&ShiftAmtVal) { 7954 if (V.getOpcode() != ShiftOpcode || !V.hasOneUse()) 7955 return false; 7956 7957 ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1)); 7958 if (!ShiftCNode) 7959 return false; 7960 7961 // Capture the shifted operand and shift amount value. 7962 ShiftOp = V.getOperand(0); 7963 ShiftAmtVal = &ShiftCNode->getAPIntValue(); 7964 7965 // Shift amount types do not have to match their operand type, so check that 7966 // the constants are the same width. 7967 if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth()) 7968 return false; 7969 7970 // The fold is not valid if the sum of the shift values exceeds bitwidth. 7971 if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits())) 7972 return false; 7973 7974 return true; 7975 }; 7976 7977 // Logic ops are commutative, so check each operand for a match. 7978 SDValue X, Y; 7979 const APInt *C0Val; 7980 if (matchFirstShift(LogicOp.getOperand(0), X, C0Val)) 7981 Y = LogicOp.getOperand(1); 7982 else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val)) 7983 Y = LogicOp.getOperand(0); 7984 else 7985 return SDValue(); 7986 7987 // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1) 7988 SDLoc DL(Shift); 7989 EVT VT = Shift->getValueType(0); 7990 EVT ShiftAmtVT = Shift->getOperand(1).getValueType(); 7991 SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT); 7992 SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC); 7993 SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1); 7994 return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2); 7995 } 7996 7997 /// Handle transforms common to the three shifts, when the shift amount is a 7998 /// constant. 7999 /// We are looking for: (shift being one of shl/sra/srl) 8000 /// shift (binop X, C0), C1 8001 /// And want to transform into: 8002 /// binop (shift X, C1), (shift C0, C1) 8003 SDValue DAGCombiner::visitShiftByConstant(SDNode *N) { 8004 assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand"); 8005 8006 // Do not turn a 'not' into a regular xor. 8007 if (isBitwiseNot(N->getOperand(0))) 8008 return SDValue(); 8009 8010 // The inner binop must be one-use, since we want to replace it. 8011 SDValue LHS = N->getOperand(0); 8012 if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level)) 8013 return SDValue(); 8014 8015 // TODO: This is limited to early combining because it may reveal regressions 8016 // otherwise. But since we just checked a target hook to see if this is 8017 // desirable, that should have filtered out cases where this interferes 8018 // with some other pattern matching. 8019 if (!LegalTypes) 8020 if (SDValue R = combineShiftOfShiftedLogic(N, DAG)) 8021 return R; 8022 8023 // We want to pull some binops through shifts, so that we have (and (shift)) 8024 // instead of (shift (and)), likewise for add, or, xor, etc. 
This sort of 8025 // thing happens with address calculations, so it's important to canonicalize 8026 // it. 8027 switch (LHS.getOpcode()) { 8028 default: 8029 return SDValue(); 8030 case ISD::OR: 8031 case ISD::XOR: 8032 case ISD::AND: 8033 break; 8034 case ISD::ADD: 8035 if (N->getOpcode() != ISD::SHL) 8036 return SDValue(); // only shl(add) not sr[al](add). 8037 break; 8038 } 8039 8040 // We require the RHS of the binop to be a constant and not opaque as well. 8041 ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1)); 8042 if (!BinOpCst) 8043 return SDValue(); 8044 8045 // FIXME: disable this unless the input to the binop is a shift by a constant 8046 // or is copy/select. Enable this in other cases when figure out it's exactly 8047 // profitable. 8048 SDValue BinOpLHSVal = LHS.getOperand(0); 8049 bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL || 8050 BinOpLHSVal.getOpcode() == ISD::SRA || 8051 BinOpLHSVal.getOpcode() == ISD::SRL) && 8052 isa<ConstantSDNode>(BinOpLHSVal.getOperand(1)); 8053 bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg || 8054 BinOpLHSVal.getOpcode() == ISD::SELECT; 8055 8056 if (!IsShiftByConstant && !IsCopyOrSelect) 8057 return SDValue(); 8058 8059 if (IsCopyOrSelect && N->hasOneUse()) 8060 return SDValue(); 8061 8062 // Fold the constants, shifting the binop RHS by the shift amount. 8063 SDLoc DL(N); 8064 EVT VT = N->getValueType(0); 8065 SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1), 8066 N->getOperand(1)); 8067 assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!"); 8068 8069 SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0), 8070 N->getOperand(1)); 8071 return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS); 8072 } 8073 8074 SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { 8075 assert(N->getOpcode() == ISD::TRUNCATE); 8076 assert(N->getOperand(0).getOpcode() == ISD::AND); 8077 8078 // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) 8079 EVT TruncVT = N->getValueType(0); 8080 if (N->hasOneUse() && N->getOperand(0).hasOneUse() && 8081 TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) { 8082 SDValue N01 = N->getOperand(0).getOperand(1); 8083 if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) { 8084 SDLoc DL(N); 8085 SDValue N00 = N->getOperand(0).getOperand(0); 8086 SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00); 8087 SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01); 8088 AddToWorklist(Trunc00.getNode()); 8089 AddToWorklist(Trunc01.getNode()); 8090 return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01); 8091 } 8092 } 8093 8094 return SDValue(); 8095 } 8096 8097 SDValue DAGCombiner::visitRotate(SDNode *N) { 8098 SDLoc dl(N); 8099 SDValue N0 = N->getOperand(0); 8100 SDValue N1 = N->getOperand(1); 8101 EVT VT = N->getValueType(0); 8102 unsigned Bitsize = VT.getScalarSizeInBits(); 8103 8104 // fold (rot x, 0) -> x 8105 if (isNullOrNullSplat(N1)) 8106 return N0; 8107 8108 // fold (rot x, c) -> x iff (c % BitSize) == 0 8109 if (isPowerOf2_32(Bitsize) && Bitsize > 1) { 8110 APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1); 8111 if (DAG.MaskedValueIsZero(N1, ModuloMask)) 8112 return N0; 8113 } 8114 8115 // fold (rot x, c) -> (rot x, c % BitSize) 8116 bool OutOfRange = false; 8117 auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) { 8118 OutOfRange |= C->getAPIntValue().uge(Bitsize); 8119 return true; 8120 }; 8121 if (ISD::matchUnaryPredicate(N1, 
MatchOutOfRange) && OutOfRange) { 8122 EVT AmtVT = N1.getValueType(); 8123 SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT); 8124 if (SDValue Amt = 8125 DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits})) 8126 return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt); 8127 } 8128 8129 // rot i16 X, 8 --> bswap X 8130 auto *RotAmtC = isConstOrConstSplat(N1); 8131 if (RotAmtC && RotAmtC->getAPIntValue() == 8 && 8132 VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT)) 8133 return DAG.getNode(ISD::BSWAP, dl, VT, N0); 8134 8135 // Simplify the operands using demanded-bits information. 8136 if (SimplifyDemandedBits(SDValue(N, 0))) 8137 return SDValue(N, 0); 8138 8139 // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). 8140 if (N1.getOpcode() == ISD::TRUNCATE && 8141 N1.getOperand(0).getOpcode() == ISD::AND) { 8142 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) 8143 return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1); 8144 } 8145 8146 unsigned NextOp = N0.getOpcode(); 8147 // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) 8148 if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { 8149 SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); 8150 SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); 8151 if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { 8152 EVT ShiftVT = C1->getValueType(0); 8153 bool SameSide = (N->getOpcode() == NextOp); 8154 unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; 8155 if (SDValue CombinedShift = DAG.FoldConstantArithmetic( 8156 CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) { 8157 SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); 8158 SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( 8159 ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC}); 8160 return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), 8161 CombinedShiftNorm); 8162 } 8163 } 8164 } 8165 return SDValue(); 8166 } 8167 8168 SDValue DAGCombiner::visitSHL(SDNode *N) { 8169 SDValue N0 = N->getOperand(0); 8170 SDValue N1 = N->getOperand(1); 8171 if (SDValue V = DAG.simplifyShift(N0, N1)) 8172 return V; 8173 8174 EVT VT = N0.getValueType(); 8175 EVT ShiftVT = N1.getValueType(); 8176 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 8177 8178 // fold vector ops 8179 if (VT.isVector()) { 8180 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 8181 return FoldedVOp; 8182 8183 BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1); 8184 // If setcc produces all-one true value then: 8185 // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV) 8186 if (N1CV && N1CV->isConstant()) { 8187 if (N0.getOpcode() == ISD::AND) { 8188 SDValue N00 = N0->getOperand(0); 8189 SDValue N01 = N0->getOperand(1); 8190 BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01); 8191 8192 if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC && 8193 TLI.getBooleanContents(N00.getOperand(0).getValueType()) == 8194 TargetLowering::ZeroOrNegativeOneBooleanContent) { 8195 if (SDValue C = 8196 DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1})) 8197 return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C); 8198 } 8199 } 8200 } 8201 } 8202 8203 ConstantSDNode *N1C = isConstOrConstSplat(N1); 8204 8205 // fold (shl c1, c2) -> c1<<c2 8206 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1})) 8207 return C; 8208 8209 if (SDValue NewSel = foldBinOpIntoSelect(N)) 8210 return NewSel; 8211 8212 // if (shl x, c) is known to be zero, return 0 8213 if 
(DAG.MaskedValueIsZero(SDValue(N, 0), 8214 APInt::getAllOnesValue(OpSizeInBits))) 8215 return DAG.getConstant(0, SDLoc(N), VT); 8216 8217 // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). 8218 if (N1.getOpcode() == ISD::TRUNCATE && 8219 N1.getOperand(0).getOpcode() == ISD::AND) { 8220 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) 8221 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); 8222 } 8223 8224 if (SimplifyDemandedBits(SDValue(N, 0))) 8225 return SDValue(N, 0); 8226 8227 // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2)) 8228 if (N0.getOpcode() == ISD::SHL) { 8229 auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, 8230 ConstantSDNode *RHS) { 8231 APInt c1 = LHS->getAPIntValue(); 8232 APInt c2 = RHS->getAPIntValue(); 8233 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); 8234 return (c1 + c2).uge(OpSizeInBits); 8235 }; 8236 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) 8237 return DAG.getConstant(0, SDLoc(N), VT); 8238 8239 auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, 8240 ConstantSDNode *RHS) { 8241 APInt c1 = LHS->getAPIntValue(); 8242 APInt c2 = RHS->getAPIntValue(); 8243 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); 8244 return (c1 + c2).ult(OpSizeInBits); 8245 }; 8246 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { 8247 SDLoc DL(N); 8248 SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); 8249 return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum); 8250 } 8251 } 8252 8253 // fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2)) 8254 // For this to be valid, the second form must not preserve any of the bits 8255 // that are shifted out by the inner shift in the first form. This means 8256 // the outer shift size must be >= the number of bits added by the ext. 8257 // As a corollary, we don't care what kind of ext it is. 
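  // Illustrative example: in shl (zext i16 (shl X, 4) to i32), 20 the outer
  // shift amount (20) covers the 16 bits added by the extension, so it can
  // become shl (zext i16 X to i32), 24; had c1 + c2 reached 32, the whole
  // expression would instead fold to 0.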
8258 if ((N0.getOpcode() == ISD::ZERO_EXTEND || 8259 N0.getOpcode() == ISD::ANY_EXTEND || 8260 N0.getOpcode() == ISD::SIGN_EXTEND) && 8261 N0.getOperand(0).getOpcode() == ISD::SHL) { 8262 SDValue N0Op0 = N0.getOperand(0); 8263 SDValue InnerShiftAmt = N0Op0.getOperand(1); 8264 EVT InnerVT = N0Op0.getValueType(); 8265 uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits(); 8266 8267 auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS, 8268 ConstantSDNode *RHS) { 8269 APInt c1 = LHS->getAPIntValue(); 8270 APInt c2 = RHS->getAPIntValue(); 8271 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); 8272 return c2.uge(OpSizeInBits - InnerBitwidth) && 8273 (c1 + c2).uge(OpSizeInBits); 8274 }; 8275 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange, 8276 /*AllowUndefs*/ false, 8277 /*AllowTypeMismatch*/ true)) 8278 return DAG.getConstant(0, SDLoc(N), VT); 8279 8280 auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS, 8281 ConstantSDNode *RHS) { 8282 APInt c1 = LHS->getAPIntValue(); 8283 APInt c2 = RHS->getAPIntValue(); 8284 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); 8285 return c2.uge(OpSizeInBits - InnerBitwidth) && 8286 (c1 + c2).ult(OpSizeInBits); 8287 }; 8288 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange, 8289 /*AllowUndefs*/ false, 8290 /*AllowTypeMismatch*/ true)) { 8291 SDLoc DL(N); 8292 SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0)); 8293 SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT); 8294 Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1); 8295 return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum); 8296 } 8297 } 8298 8299 // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C)) 8300 // Only fold this if the inner zext has no other uses to avoid increasing 8301 // the total number of instructions. 8302 if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() && 8303 N0.getOperand(0).getOpcode() == ISD::SRL) { 8304 SDValue N0Op0 = N0.getOperand(0); 8305 SDValue InnerShiftAmt = N0Op0.getOperand(1); 8306 8307 auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) { 8308 APInt c1 = LHS->getAPIntValue(); 8309 APInt c2 = RHS->getAPIntValue(); 8310 zeroExtendToMatch(c1, c2); 8311 return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2); 8312 }; 8313 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual, 8314 /*AllowUndefs*/ false, 8315 /*AllowTypeMismatch*/ true)) { 8316 SDLoc DL(N); 8317 EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType(); 8318 SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT); 8319 NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL); 8320 AddToWorklist(NewSHL.getNode()); 8321 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL); 8322 } 8323 } 8324 8325 // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 8326 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 8327 // TODO - support non-uniform vector shift amounts. 
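  // Illustrative examples: shl (srl exact X, 3), 5 --> shl X, 2, while
  // shl (srl exact X, 5), 3 --> srl X, 2 (the 'exact' flag guarantees the inner
  // right shift discarded only zero bits, so no information is lost).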
8328 if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && 8329 N0->getFlags().hasExact()) { 8330 if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { 8331 uint64_t C1 = N0C1->getZExtValue(); 8332 uint64_t C2 = N1C->getZExtValue(); 8333 SDLoc DL(N); 8334 if (C1 <= C2) 8335 return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), 8336 DAG.getConstant(C2 - C1, DL, ShiftVT)); 8337 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), 8338 DAG.getConstant(C1 - C2, DL, ShiftVT)); 8339 } 8340 } 8341 8342 // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or 8343 // (and (srl x, (sub c1, c2), MASK) 8344 // Only fold this if the inner shift has no other uses -- if it does, folding 8345 // this will increase the total number of instructions. 8346 // TODO - drop hasOneUse requirement if c1 == c2? 8347 // TODO - support non-uniform vector shift amounts. 8348 if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() && 8349 TLI.shouldFoldConstantShiftPairToMask(N, Level)) { 8350 if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { 8351 if (N0C1->getAPIntValue().ult(OpSizeInBits)) { 8352 uint64_t c1 = N0C1->getZExtValue(); 8353 uint64_t c2 = N1C->getZExtValue(); 8354 APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); 8355 SDValue Shift; 8356 if (c2 > c1) { 8357 Mask <<= c2 - c1; 8358 SDLoc DL(N); 8359 Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), 8360 DAG.getConstant(c2 - c1, DL, ShiftVT)); 8361 } else { 8362 Mask.lshrInPlace(c1 - c2); 8363 SDLoc DL(N); 8364 Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), 8365 DAG.getConstant(c1 - c2, DL, ShiftVT)); 8366 } 8367 SDLoc DL(N0); 8368 return DAG.getNode(ISD::AND, DL, VT, Shift, 8369 DAG.getConstant(Mask, DL, VT)); 8370 } 8371 } 8372 } 8373 8374 // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) 8375 if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) && 8376 isConstantOrConstantVector(N1, /* No Opaques */ true)) { 8377 SDLoc DL(N); 8378 SDValue AllBits = DAG.getAllOnesConstant(DL, VT); 8379 SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1); 8380 return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask); 8381 } 8382 8383 // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 8384 // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) 8385 // Variant of version done on multiply, except mul by a power of 2 is turned 8386 // into a shift. 
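  // Illustrative example: shl (add X, 5), 3 --> add (shl X, 3), 40, which
  // exposes the scaled constant to later addressing-mode and constant folding.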
8387 if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) && 8388 N0.getNode()->hasOneUse() && 8389 isConstantOrConstantVector(N1, /* No Opaques */ true) && 8390 isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) && 8391 TLI.isDesirableToCommuteWithShift(N, Level)) { 8392 SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); 8393 SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); 8394 AddToWorklist(Shl0.getNode()); 8395 AddToWorklist(Shl1.getNode()); 8396 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1); 8397 } 8398 8399 // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) 8400 if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() && 8401 isConstantOrConstantVector(N1, /* No Opaques */ true) && 8402 isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { 8403 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); 8404 if (isConstantOrConstantVector(Shl)) 8405 return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl); 8406 } 8407 8408 if (N1C && !N1C->isOpaque()) 8409 if (SDValue NewSHL = visitShiftByConstant(N)) 8410 return NewSHL; 8411 8412 // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)). 8413 if (N0.getOpcode() == ISD::VSCALE) 8414 if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) { 8415 const APInt &C0 = N0.getConstantOperandAPInt(0); 8416 const APInt &C1 = NC1->getAPIntValue(); 8417 return DAG.getVScale(SDLoc(N), VT, C0 << C1); 8418 } 8419 8420 // Fold (shl step_vector(C0), C1) to (step_vector(C0 << C1)). 8421 APInt ShlVal; 8422 if (N0.getOpcode() == ISD::STEP_VECTOR) 8423 if (ISD::isConstantSplatVector(N1.getNode(), ShlVal)) { 8424 const APInt &C0 = N0.getConstantOperandAPInt(0); 8425 EVT SVT = N0.getOperand(0).getValueType(); 8426 SDValue NewStep = DAG.getConstant( 8427 C0 << ShlVal.sextOrTrunc(SVT.getSizeInBits()), SDLoc(N), SVT); 8428 return DAG.getStepVector(SDLoc(N), VT, NewStep); 8429 } 8430 8431 return SDValue(); 8432 } 8433 8434 // Transform a right shift of a multiply into a multiply-high. 8435 // Examples: 8436 // (srl (mul (zext i32:$a to i64), (zext i32:$a to i64)), 32) -> (mulhu $a, $b) 8437 // (sra (mul (sext i32:$a to i64), (sext i32:$a to i64)), 32) -> (mulhs $a, $b) 8438 static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG, 8439 const TargetLowering &TLI) { 8440 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && 8441 "SRL or SRA node is required here!"); 8442 8443 // Check the shift amount. Proceed with the transformation if the shift 8444 // amount is constant. 8445 ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1)); 8446 if (!ShiftAmtSrc) 8447 return SDValue(); 8448 8449 SDLoc DL(N); 8450 8451 // The operation feeding into the shift must be a multiply. 8452 SDValue ShiftOperand = N->getOperand(0); 8453 if (ShiftOperand.getOpcode() != ISD::MUL) 8454 return SDValue(); 8455 8456 // Both operands must be equivalent extend nodes. 8457 SDValue LeftOp = ShiftOperand.getOperand(0); 8458 SDValue RightOp = ShiftOperand.getOperand(1); 8459 bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND; 8460 bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND; 8461 8462 if ((!(IsSignExt || IsZeroExt)) || LeftOp.getOpcode() != RightOp.getOpcode()) 8463 return SDValue(); 8464 8465 EVT WideVT1 = LeftOp.getValueType(); 8466 EVT WideVT2 = RightOp.getValueType(); 8467 (void)WideVT2; 8468 // Proceed with the transformation if the wide types match. 
8469 assert((WideVT1 == WideVT2) && 8470 "Cannot have a multiply node with two different operand types."); 8471 8472 EVT NarrowVT = LeftOp.getOperand(0).getValueType(); 8473 // Check that the two extend nodes are the same type. 8474 if (NarrowVT != RightOp.getOperand(0).getValueType()) 8475 return SDValue(); 8476 8477 // Proceed with the transformation if the wide type is twice as large 8478 // as the narrow type. 8479 unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits(); 8480 if (WideVT1.getScalarSizeInBits() != 2 * NarrowVTSize) 8481 return SDValue(); 8482 8483 // Check the shift amount with the narrow type size. 8484 // Proceed with the transformation if the shift amount is the width 8485 // of the narrow type. 8486 unsigned ShiftAmt = ShiftAmtSrc->getZExtValue(); 8487 if (ShiftAmt != NarrowVTSize) 8488 return SDValue(); 8489 8490 // If the operation feeding into the MUL is a sign extend (sext), 8491 // we use mulhs. Othewise, zero extends (zext) use mulhu. 8492 unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU; 8493 8494 // Combine to mulh if mulh is legal/custom for the narrow type on the target. 8495 if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT)) 8496 return SDValue(); 8497 8498 SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), 8499 RightOp.getOperand(0)); 8500 return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1) 8501 : DAG.getZExtOrTrunc(Result, DL, WideVT1)); 8502 } 8503 8504 SDValue DAGCombiner::visitSRA(SDNode *N) { 8505 SDValue N0 = N->getOperand(0); 8506 SDValue N1 = N->getOperand(1); 8507 if (SDValue V = DAG.simplifyShift(N0, N1)) 8508 return V; 8509 8510 EVT VT = N0.getValueType(); 8511 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 8512 8513 // Arithmetic shifting an all-sign-bit value is a no-op. 8514 // fold (sra 0, x) -> 0 8515 // fold (sra -1, x) -> -1 8516 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits) 8517 return N0; 8518 8519 // fold vector ops 8520 if (VT.isVector()) 8521 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 8522 return FoldedVOp; 8523 8524 ConstantSDNode *N1C = isConstOrConstSplat(N1); 8525 8526 // fold (sra c1, c2) -> (sra c1, c2) 8527 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1})) 8528 return C; 8529 8530 if (SDValue NewSel = foldBinOpIntoSelect(N)) 8531 return NewSel; 8532 8533 // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports 8534 // sext_inreg. 8535 if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { 8536 unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue(); 8537 EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits); 8538 if (VT.isVector()) 8539 ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, 8540 VT.getVectorElementCount()); 8541 if (!LegalOperations || 8542 TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) == 8543 TargetLowering::Legal) 8544 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, 8545 N0.getOperand(0), DAG.getValueType(ExtVT)); 8546 // Even if we can't convert to sext_inreg, we might be able to remove 8547 // this shift pair if the input is already sign extended. 8548 if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue()) 8549 return N0.getOperand(0); 8550 } 8551 8552 // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) 8553 // clamp (add c1, c2) to max shift. 
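  // Illustrative example for i8: sra (sra X, 5), 6 --> sra X, 7. Once the summed
  // amount reaches the bit width, every bit is already a copy of the sign bit,
  // so the amount is clamped to BitWidth - 1 rather than producing an
  // out-of-range shift.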
8554 if (N0.getOpcode() == ISD::SRA) { 8555 SDLoc DL(N); 8556 EVT ShiftVT = N1.getValueType(); 8557 EVT ShiftSVT = ShiftVT.getScalarType(); 8558 SmallVector<SDValue, 16> ShiftValues; 8559 8560 auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) { 8561 APInt c1 = LHS->getAPIntValue(); 8562 APInt c2 = RHS->getAPIntValue(); 8563 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); 8564 APInt Sum = c1 + c2; 8565 unsigned ShiftSum = 8566 Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue(); 8567 ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT)); 8568 return true; 8569 }; 8570 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) { 8571 SDValue ShiftValue; 8572 if (VT.isVector()) 8573 ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues); 8574 else 8575 ShiftValue = ShiftValues[0]; 8576 return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue); 8577 } 8578 } 8579 8580 // fold (sra (shl X, m), (sub result_size, n)) 8581 // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for 8582 // result_size - n != m. 8583 // If truncate is free for the target sext(shl) is likely to result in better 8584 // code. 8585 if (N0.getOpcode() == ISD::SHL && N1C) { 8586 // Get the two constanst of the shifts, CN0 = m, CN = n. 8587 const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1)); 8588 if (N01C) { 8589 LLVMContext &Ctx = *DAG.getContext(); 8590 // Determine what the truncate's result bitsize and type would be. 8591 EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue()); 8592 8593 if (VT.isVector()) 8594 TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount()); 8595 8596 // Determine the residual right-shift amount. 8597 int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue(); 8598 8599 // If the shift is not a no-op (in which case this should be just a sign 8600 // extend already), the truncated to type is legal, sign_extend is legal 8601 // on that type, and the truncate to that type is both legal and free, 8602 // perform the transform. 8603 if ((ShiftAmt > 0) && 8604 TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) && 8605 TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) && 8606 TLI.isTruncateFree(VT, TruncVT)) { 8607 SDLoc DL(N); 8608 SDValue Amt = DAG.getConstant(ShiftAmt, DL, 8609 getShiftAmountTy(N0.getOperand(0).getValueType())); 8610 SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, 8611 N0.getOperand(0), Amt); 8612 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, 8613 Shift); 8614 return DAG.getNode(ISD::SIGN_EXTEND, DL, 8615 N->getValueType(0), Trunc); 8616 } 8617 } 8618 } 8619 8620 // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper. 8621 // sra (add (shl X, N1C), AddC), N1C --> 8622 // sext (add (trunc X to (width - N1C)), AddC') 8623 if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C && 8624 N0.getOperand(0).getOpcode() == ISD::SHL && 8625 N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) { 8626 if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) { 8627 SDValue Shl = N0.getOperand(0); 8628 // Determine what the truncate's type would be and ask the target if that 8629 // is a free operation. 
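  // Illustrative example for i64 with N1C == 32, assuming the target reports
  // the i64 -> i32 truncate as free:
  //   sra (add (shl X, 32), AddC), 32
  //     --> sext (add (trunc X to i32), trunc (lshr AddC, 32) to i32)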
8630 LLVMContext &Ctx = *DAG.getContext(); 8631 unsigned ShiftAmt = N1C->getZExtValue(); 8632 EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt); 8633 if (VT.isVector()) 8634 TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount()); 8635 8636 // TODO: The simple type check probably belongs in the default hook 8637 // implementation and/or target-specific overrides (because 8638 // non-simple types likely require masking when legalized), but that 8639 // restriction may conflict with other transforms. 8640 if (TruncVT.isSimple() && isTypeLegal(TruncVT) && 8641 TLI.isTruncateFree(VT, TruncVT)) { 8642 SDLoc DL(N); 8643 SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT); 8644 SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt). 8645 trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT); 8646 SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC); 8647 return DAG.getSExtOrTrunc(Add, DL, VT); 8648 } 8649 } 8650 } 8651 8652 // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). 8653 if (N1.getOpcode() == ISD::TRUNCATE && 8654 N1.getOperand(0).getOpcode() == ISD::AND) { 8655 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) 8656 return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); 8657 } 8658 8659 // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2)) 8660 // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) 8661 // if c1 is equal to the number of bits the trunc removes 8662 // TODO - support non-uniform vector shift amounts. 8663 if (N0.getOpcode() == ISD::TRUNCATE && 8664 (N0.getOperand(0).getOpcode() == ISD::SRL || 8665 N0.getOperand(0).getOpcode() == ISD::SRA) && 8666 N0.getOperand(0).hasOneUse() && 8667 N0.getOperand(0).getOperand(1).hasOneUse() && N1C) { 8668 SDValue N0Op0 = N0.getOperand(0); 8669 if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) { 8670 EVT LargeVT = N0Op0.getValueType(); 8671 unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits; 8672 if (LargeShift->getAPIntValue() == TruncBits) { 8673 SDLoc DL(N); 8674 SDValue Amt = DAG.getConstant(N1C->getZExtValue() + TruncBits, DL, 8675 getShiftAmountTy(LargeVT)); 8676 SDValue SRA = 8677 DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt); 8678 return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA); 8679 } 8680 } 8681 } 8682 8683 // Simplify, based on bits shifted out of the LHS. 8684 if (SimplifyDemandedBits(SDValue(N, 0))) 8685 return SDValue(N, 0); 8686 8687 // If the sign bit is known to be zero, switch this to a SRL. 8688 if (DAG.SignBitIsZero(N0)) 8689 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); 8690 8691 if (N1C && !N1C->isOpaque()) 8692 if (SDValue NewSRA = visitShiftByConstant(N)) 8693 return NewSRA; 8694 8695 // Try to transform this shift into a multiply-high if 8696 // it matches the appropriate pattern detected in combineShiftToMULH. 
8697 if (SDValue MULH = combineShiftToMULH(N, DAG, TLI)) 8698 return MULH; 8699 8700 return SDValue(); 8701 } 8702 8703 SDValue DAGCombiner::visitSRL(SDNode *N) { 8704 SDValue N0 = N->getOperand(0); 8705 SDValue N1 = N->getOperand(1); 8706 if (SDValue V = DAG.simplifyShift(N0, N1)) 8707 return V; 8708 8709 EVT VT = N0.getValueType(); 8710 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 8711 8712 // fold vector ops 8713 if (VT.isVector()) 8714 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 8715 return FoldedVOp; 8716 8717 ConstantSDNode *N1C = isConstOrConstSplat(N1); 8718 8719 // fold (srl c1, c2) -> c1 >>u c2 8720 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1})) 8721 return C; 8722 8723 if (SDValue NewSel = foldBinOpIntoSelect(N)) 8724 return NewSel; 8725 8726 // if (srl x, c) is known to be zero, return 0 8727 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), 8728 APInt::getAllOnesValue(OpSizeInBits))) 8729 return DAG.getConstant(0, SDLoc(N), VT); 8730 8731 // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2)) 8732 if (N0.getOpcode() == ISD::SRL) { 8733 auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, 8734 ConstantSDNode *RHS) { 8735 APInt c1 = LHS->getAPIntValue(); 8736 APInt c2 = RHS->getAPIntValue(); 8737 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); 8738 return (c1 + c2).uge(OpSizeInBits); 8739 }; 8740 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) 8741 return DAG.getConstant(0, SDLoc(N), VT); 8742 8743 auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, 8744 ConstantSDNode *RHS) { 8745 APInt c1 = LHS->getAPIntValue(); 8746 APInt c2 = RHS->getAPIntValue(); 8747 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); 8748 return (c1 + c2).ult(OpSizeInBits); 8749 }; 8750 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { 8751 SDLoc DL(N); 8752 EVT ShiftVT = N1.getValueType(); 8753 SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); 8754 return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum); 8755 } 8756 } 8757 8758 if (N1C && N0.getOpcode() == ISD::TRUNCATE && 8759 N0.getOperand(0).getOpcode() == ISD::SRL) { 8760 SDValue InnerShift = N0.getOperand(0); 8761 // TODO - support non-uniform vector shift amounts. 8762 if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) { 8763 uint64_t c1 = N001C->getZExtValue(); 8764 uint64_t c2 = N1C->getZExtValue(); 8765 EVT InnerShiftVT = InnerShift.getValueType(); 8766 EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType(); 8767 uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); 8768 // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2))) 8769 // This is only valid if the OpSizeInBits + c1 = size of inner shift. 
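  // Illustrative example: srl (trunc i64 (srl X, 32) to i32), 8
  //   --> trunc i64 (srl X, 40) to i32, because the truncate dropped exactly
  // the 32 high bits that the inner shift had already vacated.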
8770 if (c1 + OpSizeInBits == InnerShiftSize) { 8771 SDLoc DL(N); 8772 if (c1 + c2 >= InnerShiftSize) 8773 return DAG.getConstant(0, DL, VT); 8774 SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT); 8775 SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT, 8776 InnerShift.getOperand(0), NewShiftAmt); 8777 return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift); 8778 } 8779 // In the more general case, we can clear the high bits after the shift: 8780 // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask) 8781 if (N0.hasOneUse() && InnerShift.hasOneUse() && 8782 c1 + c2 < InnerShiftSize) { 8783 SDLoc DL(N); 8784 SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT); 8785 SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT, 8786 InnerShift.getOperand(0), NewShiftAmt); 8787 SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize, 8788 OpSizeInBits - c2), 8789 DL, InnerShiftVT); 8790 SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask); 8791 return DAG.getNode(ISD::TRUNCATE, DL, VT, And); 8792 } 8793 } 8794 } 8795 8796 // fold (srl (shl x, c), c) -> (and x, cst2) 8797 // TODO - (srl (shl x, c1), c2). 8798 if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && 8799 isConstantOrConstantVector(N1, /* NoOpaques */ true)) { 8800 SDLoc DL(N); 8801 SDValue Mask = 8802 DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1); 8803 AddToWorklist(Mask.getNode()); 8804 return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask); 8805 } 8806 8807 // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask) 8808 // TODO - support non-uniform vector shift amounts. 8809 if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { 8810 // Shifting in all undef bits? 8811 EVT SmallVT = N0.getOperand(0).getValueType(); 8812 unsigned BitSize = SmallVT.getScalarSizeInBits(); 8813 if (N1C->getAPIntValue().uge(BitSize)) 8814 return DAG.getUNDEF(VT); 8815 8816 if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) { 8817 uint64_t ShiftAmt = N1C->getZExtValue(); 8818 SDLoc DL0(N0); 8819 SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT, 8820 N0.getOperand(0), 8821 DAG.getConstant(ShiftAmt, DL0, 8822 getShiftAmountTy(SmallVT))); 8823 AddToWorklist(SmallShift.getNode()); 8824 APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt); 8825 SDLoc DL(N); 8826 return DAG.getNode(ISD::AND, DL, VT, 8827 DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift), 8828 DAG.getConstant(Mask, DL, VT)); 8829 } 8830 } 8831 8832 // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign 8833 // bit, which is unmodified by sra. 8834 if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) { 8835 if (N0.getOpcode() == ISD::SRA) 8836 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1); 8837 } 8838 8839 // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit). 8840 if (N1C && N0.getOpcode() == ISD::CTLZ && 8841 N1C->getAPIntValue() == Log2_32(OpSizeInBits)) { 8842 KnownBits Known = DAG.computeKnownBits(N0.getOperand(0)); 8843 8844 // If any of the input bits are KnownOne, then the input couldn't be all 8845 // zeros, thus the result of the srl will always be zero. 8846 if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT); 8847 8848 // If all of the bits input the to ctlz node are known to be zero, then 8849 // the result of the ctlz is "32" and the result of the shift is one. 
8850 APInt UnknownBits = ~Known.Zero; 8851 if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT); 8852 8853 // Otherwise, check to see if there is exactly one bit input to the ctlz. 8854 if (UnknownBits.isPowerOf2()) { 8855 // Okay, we know that only that the single bit specified by UnknownBits 8856 // could be set on input to the CTLZ node. If this bit is set, the SRL 8857 // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair 8858 // to an SRL/XOR pair, which is likely to simplify more. 8859 unsigned ShAmt = UnknownBits.countTrailingZeros(); 8860 SDValue Op = N0.getOperand(0); 8861 8862 if (ShAmt) { 8863 SDLoc DL(N0); 8864 Op = DAG.getNode(ISD::SRL, DL, VT, Op, 8865 DAG.getConstant(ShAmt, DL, 8866 getShiftAmountTy(Op.getValueType()))); 8867 AddToWorklist(Op.getNode()); 8868 } 8869 8870 SDLoc DL(N); 8871 return DAG.getNode(ISD::XOR, DL, VT, 8872 Op, DAG.getConstant(1, DL, VT)); 8873 } 8874 } 8875 8876 // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). 8877 if (N1.getOpcode() == ISD::TRUNCATE && 8878 N1.getOperand(0).getOpcode() == ISD::AND) { 8879 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) 8880 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); 8881 } 8882 8883 // fold operands of srl based on knowledge that the low bits are not 8884 // demanded. 8885 if (SimplifyDemandedBits(SDValue(N, 0))) 8886 return SDValue(N, 0); 8887 8888 if (N1C && !N1C->isOpaque()) 8889 if (SDValue NewSRL = visitShiftByConstant(N)) 8890 return NewSRL; 8891 8892 // Attempt to convert a srl of a load into a narrower zero-extending load. 8893 if (SDValue NarrowLoad = ReduceLoadWidth(N)) 8894 return NarrowLoad; 8895 8896 // Here is a common situation. We want to optimize: 8897 // 8898 // %a = ... 8899 // %b = and i32 %a, 2 8900 // %c = srl i32 %b, 1 8901 // brcond i32 %c ... 8902 // 8903 // into 8904 // 8905 // %a = ... 8906 // %b = and %a, 2 8907 // %c = setcc eq %b, 0 8908 // brcond %c ... 8909 // 8910 // However when after the source operand of SRL is optimized into AND, the SRL 8911 // itself may not be optimized further. Look for it and add the BRCOND into 8912 // the worklist. 8913 if (N->hasOneUse()) { 8914 SDNode *Use = *N->use_begin(); 8915 if (Use->getOpcode() == ISD::BRCOND) 8916 AddToWorklist(Use); 8917 else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) { 8918 // Also look pass the truncate. 8919 Use = *Use->use_begin(); 8920 if (Use->getOpcode() == ISD::BRCOND) 8921 AddToWorklist(Use); 8922 } 8923 } 8924 8925 // Try to transform this shift into a multiply-high if 8926 // it matches the appropriate pattern detected in combineShiftToMULH. 8927 if (SDValue MULH = combineShiftToMULH(N, DAG, TLI)) 8928 return MULH; 8929 8930 return SDValue(); 8931 } 8932 8933 SDValue DAGCombiner::visitFunnelShift(SDNode *N) { 8934 EVT VT = N->getValueType(0); 8935 SDValue N0 = N->getOperand(0); 8936 SDValue N1 = N->getOperand(1); 8937 SDValue N2 = N->getOperand(2); 8938 bool IsFSHL = N->getOpcode() == ISD::FSHL; 8939 unsigned BitWidth = VT.getScalarSizeInBits(); 8940 8941 // fold (fshl N0, N1, 0) -> N0 8942 // fold (fshr N0, N1, 0) -> N1 8943 if (isPowerOf2_32(BitWidth)) 8944 if (DAG.MaskedValueIsZero( 8945 N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1))) 8946 return IsFSHL ? N0 : N1; 8947 8948 auto IsUndefOrZero = [](SDValue V) { 8949 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true); 8950 }; 8951 8952 // TODO - support non-uniform vector shift amounts. 
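  // Illustrative example for i32: fshl X, Y, 8 == or (shl X, 8), (srl Y, 24),
  // so a zero/undef X degenerates to srl Y, 24 and a zero/undef Y degenerates
  // to shl X, 8, which is exactly what the constant-amount cases below produce.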
8953 if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) { 8954 EVT ShAmtTy = N2.getValueType(); 8955 8956 // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) 8957 if (Cst->getAPIntValue().uge(BitWidth)) { 8958 uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth); 8959 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1, 8960 DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy)); 8961 } 8962 8963 unsigned ShAmt = Cst->getZExtValue(); 8964 if (ShAmt == 0) 8965 return IsFSHL ? N0 : N1; 8966 8967 // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C) 8968 // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C) 8969 // fold fshl(N0, undef_or_zero, C) -> shl(N0, C) 8970 // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C) 8971 if (IsUndefOrZero(N0)) 8972 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, 8973 DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt, 8974 SDLoc(N), ShAmtTy)); 8975 if (IsUndefOrZero(N1)) 8976 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, 8977 DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt, 8978 SDLoc(N), ShAmtTy)); 8979 8980 // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. 8981 // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. 8982 // TODO - bigendian support once we have test coverage. 8983 // TODO - can we merge this with CombineConseutiveLoads/MatchLoadCombine? 8984 // TODO - permit LHS EXTLOAD if extensions are shifted out. 8985 if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() && 8986 !DAG.getDataLayout().isBigEndian()) { 8987 auto *LHS = dyn_cast<LoadSDNode>(N0); 8988 auto *RHS = dyn_cast<LoadSDNode>(N1); 8989 if (LHS && RHS && LHS->isSimple() && RHS->isSimple() && 8990 LHS->getAddressSpace() == RHS->getAddressSpace() && 8991 (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) && 8992 ISD::isNON_EXTLoad(LHS)) { 8993 if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) { 8994 SDLoc DL(RHS); 8995 uint64_t PtrOff = 8996 IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8); 8997 Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff); 8998 bool Fast = false; 8999 if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 9000 RHS->getAddressSpace(), NewAlign, 9001 RHS->getMemOperand()->getFlags(), &Fast) && 9002 Fast) { 9003 SDValue NewPtr = DAG.getMemBasePlusOffset( 9004 RHS->getBasePtr(), TypeSize::Fixed(PtrOff), DL); 9005 AddToWorklist(NewPtr.getNode()); 9006 SDValue Load = DAG.getLoad( 9007 VT, DL, RHS->getChain(), NewPtr, 9008 RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign, 9009 RHS->getMemOperand()->getFlags(), RHS->getAAInfo()); 9010 // Replace the old load's chain with the new load's chain. 9011 WorklistRemover DeadNodes(*this); 9012 DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1)); 9013 return Load; 9014 } 9015 } 9016 } 9017 } 9018 } 9019 9020 // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2) 9021 // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2) 9022 // iff We know the shift amount is in range. 9023 // TODO: when is it worth doing SUB(BW, N2) as well? 
9024 if (isPowerOf2_32(BitWidth)) { 9025 APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1); 9026 if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) 9027 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2); 9028 if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) 9029 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2); 9030 } 9031 9032 // fold (fshl N0, N0, N2) -> (rotl N0, N2) 9033 // fold (fshr N0, N0, N2) -> (rotr N0, N2) 9034 // TODO: Investigate flipping this rotate if only one is legal, if funnel shift 9035 // is legal as well we might be better off avoiding non-constant (BW - N2). 9036 unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR; 9037 if (N0 == N1 && hasOperation(RotOpc, VT)) 9038 return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2); 9039 9040 // Simplify, based on bits shifted out of N0/N1. 9041 if (SimplifyDemandedBits(SDValue(N, 0))) 9042 return SDValue(N, 0); 9043 9044 return SDValue(); 9045 } 9046 9047 SDValue DAGCombiner::visitABS(SDNode *N) { 9048 SDValue N0 = N->getOperand(0); 9049 EVT VT = N->getValueType(0); 9050 9051 // fold (abs c1) -> c2 9052 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9053 return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); 9054 // fold (abs (abs x)) -> (abs x) 9055 if (N0.getOpcode() == ISD::ABS) 9056 return N0; 9057 // fold (abs x) -> x iff not-negative 9058 if (DAG.SignBitIsZero(N0)) 9059 return N0; 9060 return SDValue(); 9061 } 9062 9063 SDValue DAGCombiner::visitBSWAP(SDNode *N) { 9064 SDValue N0 = N->getOperand(0); 9065 EVT VT = N->getValueType(0); 9066 9067 // fold (bswap c1) -> c2 9068 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9069 return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0); 9070 // fold (bswap (bswap x)) -> x 9071 if (N0.getOpcode() == ISD::BSWAP) 9072 return N0->getOperand(0); 9073 return SDValue(); 9074 } 9075 9076 SDValue DAGCombiner::visitBITREVERSE(SDNode *N) { 9077 SDValue N0 = N->getOperand(0); 9078 EVT VT = N->getValueType(0); 9079 9080 // fold (bitreverse c1) -> c2 9081 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9082 return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0); 9083 // fold (bitreverse (bitreverse x)) -> x 9084 if (N0.getOpcode() == ISD::BITREVERSE) 9085 return N0.getOperand(0); 9086 return SDValue(); 9087 } 9088 9089 SDValue DAGCombiner::visitCTLZ(SDNode *N) { 9090 SDValue N0 = N->getOperand(0); 9091 EVT VT = N->getValueType(0); 9092 9093 // fold (ctlz c1) -> c2 9094 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9095 return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0); 9096 9097 // If the value is known never to be zero, switch to the undef version. 9098 if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) { 9099 if (DAG.isKnownNeverZero(N0)) 9100 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); 9101 } 9102 9103 return SDValue(); 9104 } 9105 9106 SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) { 9107 SDValue N0 = N->getOperand(0); 9108 EVT VT = N->getValueType(0); 9109 9110 // fold (ctlz_zero_undef c1) -> c2 9111 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9112 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); 9113 return SDValue(); 9114 } 9115 9116 SDValue DAGCombiner::visitCTTZ(SDNode *N) { 9117 SDValue N0 = N->getOperand(0); 9118 EVT VT = N->getValueType(0); 9119 9120 // fold (cttz c1) -> c2 9121 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9122 return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0); 9123 9124 // If the value is known never to be zero, switch to the undef version. 
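  // (CTTZ_ZERO_UNDEF may be cheaper because the target need not define a result
  // for a zero input; that is safe here since the operand is proven non-zero.)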
9125 if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) { 9126 if (DAG.isKnownNeverZero(N0)) 9127 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); 9128 } 9129 9130 return SDValue(); 9131 } 9132 9133 SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { 9134 SDValue N0 = N->getOperand(0); 9135 EVT VT = N->getValueType(0); 9136 9137 // fold (cttz_zero_undef c1) -> c2 9138 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9139 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); 9140 return SDValue(); 9141 } 9142 9143 SDValue DAGCombiner::visitCTPOP(SDNode *N) { 9144 SDValue N0 = N->getOperand(0); 9145 EVT VT = N->getValueType(0); 9146 9147 // fold (ctpop c1) -> c2 9148 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 9149 return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0); 9150 return SDValue(); 9151 } 9152 9153 // FIXME: This should be checking for no signed zeros on individual operands, as 9154 // well as no nans. 9155 static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS, 9156 SDValue RHS, 9157 const TargetLowering &TLI) { 9158 const TargetOptions &Options = DAG.getTarget().Options; 9159 EVT VT = LHS.getValueType(); 9160 9161 return Options.NoSignedZerosFPMath && VT.isFloatingPoint() && 9162 TLI.isProfitableToCombineMinNumMaxNum(VT) && 9163 DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS); 9164 } 9165 9166 /// Generate Min/Max node 9167 static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, 9168 SDValue RHS, SDValue True, SDValue False, 9169 ISD::CondCode CC, const TargetLowering &TLI, 9170 SelectionDAG &DAG) { 9171 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) 9172 return SDValue(); 9173 9174 EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); 9175 switch (CC) { 9176 case ISD::SETOLT: 9177 case ISD::SETOLE: 9178 case ISD::SETLT: 9179 case ISD::SETLE: 9180 case ISD::SETULT: 9181 case ISD::SETULE: { 9182 // Since it's known never nan to get here already, either fminnum or 9183 // fminnum_ieee are OK. Try the ieee version first, since it's fminnum is 9184 // expanded in terms of it. 9185 unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; 9186 if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT)) 9187 return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS); 9188 9189 unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; 9190 if (TLI.isOperationLegalOrCustom(Opcode, TransformVT)) 9191 return DAG.getNode(Opcode, DL, VT, LHS, RHS); 9192 return SDValue(); 9193 } 9194 case ISD::SETOGT: 9195 case ISD::SETOGE: 9196 case ISD::SETGT: 9197 case ISD::SETGE: 9198 case ISD::SETUGT: 9199 case ISD::SETUGE: { 9200 unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE; 9201 if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT)) 9202 return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS); 9203 9204 unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM; 9205 if (TLI.isOperationLegalOrCustom(Opcode, TransformVT)) 9206 return DAG.getNode(Opcode, DL, VT, LHS, RHS); 9207 return SDValue(); 9208 } 9209 default: 9210 return SDValue(); 9211 } 9212 } 9213 9214 /// If a (v)select has a condition value that is a sign-bit test, try to smear 9215 /// the condition operand sign-bit across the value width and use it as a mask. 
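/// For example, for i32:
///   X > -1 ? C1 : -1  -->  or  (sra X, 31), C1
///   X <  0 ? C1 :  0  -->  and (sra X, 31), C1
/// The arithmetic shift turns the sign bit of X into an all-ones or all-zeros
/// mask, which then produces either the constant or the -1/0 value.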
9216 static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) { 9217 SDValue Cond = N->getOperand(0); 9218 SDValue C1 = N->getOperand(1); 9219 SDValue C2 = N->getOperand(2); 9220 assert(isConstantOrConstantVector(C1) && isConstantOrConstantVector(C2) && 9221 "Expected select-of-constants"); 9222 9223 EVT VT = N->getValueType(0); 9224 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() || 9225 VT != Cond.getOperand(0).getValueType()) 9226 return SDValue(); 9227 9228 // The inverted-condition + commuted-select variants of these patterns are 9229 // canonicalized to these forms in IR. 9230 SDValue X = Cond.getOperand(0); 9231 SDValue CondC = Cond.getOperand(1); 9232 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 9233 if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) && 9234 isAllOnesOrAllOnesSplat(C2)) { 9235 // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1 9236 SDLoc DL(N); 9237 SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT); 9238 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC); 9239 return DAG.getNode(ISD::OR, DL, VT, Sra, C1); 9240 } 9241 if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) { 9242 // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1 9243 SDLoc DL(N); 9244 SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT); 9245 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC); 9246 return DAG.getNode(ISD::AND, DL, VT, Sra, C1); 9247 } 9248 return SDValue(); 9249 } 9250 9251 SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { 9252 SDValue Cond = N->getOperand(0); 9253 SDValue N1 = N->getOperand(1); 9254 SDValue N2 = N->getOperand(2); 9255 EVT VT = N->getValueType(0); 9256 EVT CondVT = Cond.getValueType(); 9257 SDLoc DL(N); 9258 9259 if (!VT.isInteger()) 9260 return SDValue(); 9261 9262 auto *C1 = dyn_cast<ConstantSDNode>(N1); 9263 auto *C2 = dyn_cast<ConstantSDNode>(N2); 9264 if (!C1 || !C2) 9265 return SDValue(); 9266 9267 // Only do this before legalization to avoid conflicting with target-specific 9268 // transforms in the other direction (create a select from a zext/sext). There 9269 // is also a target-independent combine here in DAGCombiner in the other 9270 // direction for (select Cond, -1, 0) when the condition is not i1. 9271 if (CondVT == MVT::i1 && !LegalOperations) { 9272 if (C1->isNullValue() && C2->isOne()) { 9273 // select Cond, 0, 1 --> zext (!Cond) 9274 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); 9275 if (VT != MVT::i1) 9276 NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond); 9277 return NotCond; 9278 } 9279 if (C1->isNullValue() && C2->isAllOnesValue()) { 9280 // select Cond, 0, -1 --> sext (!Cond) 9281 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); 9282 if (VT != MVT::i1) 9283 NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond); 9284 return NotCond; 9285 } 9286 if (C1->isOne() && C2->isNullValue()) { 9287 // select Cond, 1, 0 --> zext (Cond) 9288 if (VT != MVT::i1) 9289 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); 9290 return Cond; 9291 } 9292 if (C1->isAllOnesValue() && C2->isNullValue()) { 9293 // select Cond, -1, 0 --> sext (Cond) 9294 if (VT != MVT::i1) 9295 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); 9296 return Cond; 9297 } 9298 9299 // Use a target hook because some targets may prefer to transform in the 9300 // other direction. 9301 if (TLI.convertSelectOfConstantsToMath(VT)) { 9302 // For any constants that differ by 1, we can transform the select into an 9303 // extend and add. 
9304 const APInt &C1Val = C1->getAPIntValue(); 9305 const APInt &C2Val = C2->getAPIntValue(); 9306 if (C1Val - 1 == C2Val) { 9307 // select Cond, C1, C1-1 --> add (zext Cond), C1-1 9308 if (VT != MVT::i1) 9309 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); 9310 return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); 9311 } 9312 if (C1Val + 1 == C2Val) { 9313 // select Cond, C1, C1+1 --> add (sext Cond), C1+1 9314 if (VT != MVT::i1) 9315 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); 9316 return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); 9317 } 9318 9319 // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2) 9320 if (C1Val.isPowerOf2() && C2Val.isNullValue()) { 9321 if (VT != MVT::i1) 9322 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); 9323 SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT); 9324 return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC); 9325 } 9326 9327 if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG)) 9328 return V; 9329 } 9330 9331 return SDValue(); 9332 } 9333 9334 // fold (select Cond, 0, 1) -> (xor Cond, 1) 9335 // We can't do this reliably if integer based booleans have different contents 9336 // to floating point based booleans. This is because we can't tell whether we 9337 // have an integer-based boolean or a floating-point-based boolean unless we 9338 // can find the SETCC that produced it and inspect its operands. This is 9339 // fairly easy if C is the SETCC node, but it can potentially be 9340 // undiscoverable (or not reasonably discoverable). For example, it could be 9341 // in another basic block or it could require searching a complicated 9342 // expression. 9343 if (CondVT.isInteger() && 9344 TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) == 9345 TargetLowering::ZeroOrOneBooleanContent && 9346 TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) == 9347 TargetLowering::ZeroOrOneBooleanContent && 9348 C1->isNullValue() && C2->isOne()) { 9349 SDValue NotCond = 9350 DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT)); 9351 if (VT.bitsEq(CondVT)) 9352 return NotCond; 9353 return DAG.getZExtOrTrunc(NotCond, DL, VT); 9354 } 9355 9356 return SDValue(); 9357 } 9358 9359 static SDValue foldBoolSelectToLogic(SDNode *N, SelectionDAG &DAG) { 9360 assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT) && 9361 "Expected a (v)select"); 9362 SDValue Cond = N->getOperand(0); 9363 SDValue T = N->getOperand(1), F = N->getOperand(2); 9364 EVT VT = N->getValueType(0); 9365 if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1) 9366 return SDValue(); 9367 9368 // select Cond, Cond, F --> or Cond, F 9369 // select Cond, 1, F --> or Cond, F 9370 if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true)) 9371 return DAG.getNode(ISD::OR, SDLoc(N), VT, Cond, F); 9372 9373 // select Cond, T, Cond --> and Cond, T 9374 // select Cond, T, 0 --> and Cond, T 9375 if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true)) 9376 return DAG.getNode(ISD::AND, SDLoc(N), VT, Cond, T); 9377 9378 // select Cond, T, 1 --> or (not Cond), T 9379 if (isOneOrOneSplat(F, /* AllowUndefs */ true)) { 9380 SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT); 9381 return DAG.getNode(ISD::OR, SDLoc(N), VT, NotCond, T); 9382 } 9383 9384 // select Cond, 0, F --> and (not Cond), F 9385 if (isNullOrNullSplat(T, /* AllowUndefs */ true)) { 9386 SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT); 9387 return DAG.getNode(ISD::AND, SDLoc(N), VT, NotCond, F); 9388 } 9389 9390 return SDValue(); 9391 } 9392 9393 SDValue 
DAGCombiner::visitSELECT(SDNode *N) { 9394 SDValue N0 = N->getOperand(0); 9395 SDValue N1 = N->getOperand(1); 9396 SDValue N2 = N->getOperand(2); 9397 EVT VT = N->getValueType(0); 9398 EVT VT0 = N0.getValueType(); 9399 SDLoc DL(N); 9400 SDNodeFlags Flags = N->getFlags(); 9401 9402 if (SDValue V = DAG.simplifySelect(N0, N1, N2)) 9403 return V; 9404 9405 if (SDValue V = foldSelectOfConstants(N)) 9406 return V; 9407 9408 if (SDValue V = foldBoolSelectToLogic(N, DAG)) 9409 return V; 9410 9411 // If we can fold this based on the true/false value, do so. 9412 if (SimplifySelectOps(N, N1, N2)) 9413 return SDValue(N, 0); // Don't revisit N. 9414 9415 if (VT0 == MVT::i1) { 9416 // The code in this block deals with the following 2 equivalences: 9417 // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) 9418 // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) 9419 // The target can specify its preferred form with the 9420 // shouldNormalizeToSelectSequence() callback. However we always transform 9421 // to the right anyway if we find the inner select exists in the DAG anyway 9422 // and we always transform to the left side if we know that we can further 9423 // optimize the combination of the conditions. 9424 bool normalizeToSequence = 9425 TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); 9426 // select (and Cond0, Cond1), X, Y 9427 // -> select Cond0, (select Cond1, X, Y), Y 9428 if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { 9429 SDValue Cond0 = N0->getOperand(0); 9430 SDValue Cond1 = N0->getOperand(1); 9431 SDValue InnerSelect = 9432 DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags); 9433 if (normalizeToSequence || !InnerSelect.use_empty()) 9434 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, 9435 InnerSelect, N2, Flags); 9436 // Cleanup on failure. 9437 if (InnerSelect.use_empty()) 9438 recursivelyDeleteUnusedNodes(InnerSelect.getNode()); 9439 } 9440 // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) 9441 if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { 9442 SDValue Cond0 = N0->getOperand(0); 9443 SDValue Cond1 = N0->getOperand(1); 9444 SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(), 9445 Cond1, N1, N2, Flags); 9446 if (normalizeToSequence || !InnerSelect.use_empty()) 9447 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1, 9448 InnerSelect, Flags); 9449 // Cleanup on failure. 9450 if (InnerSelect.use_empty()) 9451 recursivelyDeleteUnusedNodes(InnerSelect.getNode()); 9452 } 9453 9454 // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y 9455 if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) { 9456 SDValue N1_0 = N1->getOperand(0); 9457 SDValue N1_1 = N1->getOperand(1); 9458 SDValue N1_2 = N1->getOperand(2); 9459 if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { 9460 // Create the actual and node if we can generate good code for it. 9461 if (!normalizeToSequence) { 9462 SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); 9463 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, 9464 N2, Flags); 9465 } 9466 // Otherwise see if we can optimize the "and" to a better pattern. 
9467 if (SDValue Combined = visitANDLike(N0, N1_0, N)) { 9468 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1, 9469 N2, Flags); 9470 } 9471 } 9472 } 9473 // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y 9474 if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) { 9475 SDValue N2_0 = N2->getOperand(0); 9476 SDValue N2_1 = N2->getOperand(1); 9477 SDValue N2_2 = N2->getOperand(2); 9478 if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { 9479 // Create the actual or node if we can generate good code for it. 9480 if (!normalizeToSequence) { 9481 SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); 9482 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, 9483 N2_2, Flags); 9484 } 9485 // Otherwise see if we can optimize to a better pattern. 9486 if (SDValue Combined = visitORLike(N0, N2_0, N)) 9487 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1, 9488 N2_2, Flags); 9489 } 9490 } 9491 } 9492 9493 // select (not Cond), N1, N2 -> select Cond, N2, N1 9494 if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { 9495 SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1); 9496 SelectOp->setFlags(Flags); 9497 return SelectOp; 9498 } 9499 9500 // Fold selects based on a setcc into other things, such as min/max/abs. 9501 if (N0.getOpcode() == ISD::SETCC) { 9502 SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1); 9503 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); 9504 9505 // select (fcmp lt x, y), x, y -> fminnum x, y 9506 // select (fcmp gt x, y), x, y -> fmaxnum x, y 9507 // 9508 // This is OK if we don't care what happens if either operand is a NaN. 9509 if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI)) 9510 if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2, 9511 CC, TLI, DAG)) 9512 return FMinMax; 9513 9514 // Use 'unsigned add with overflow' to optimize an unsigned saturating add. 9515 // This is conservatively limited to pre-legal-operations to give targets 9516 // a chance to reverse the transform if they want to do that. Also, it is 9517 // unlikely that the pattern would be formed late, so it's probably not 9518 // worth going through the other checks. 
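// For example, with i8 and C = 10 (so ~C = -11 = 245), the pattern is:
//   %a = add i8 %x, 10
//   %c = icmp ugt i8 %x, -11
//   %r = select i1 %c, i8 -1, i8 %a
// i.e. an add of 10 that saturates at 255, which is exactly uaddo + select.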
9519 if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) && 9520 CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) && 9521 N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) { 9522 auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1)); 9523 auto *NotC = dyn_cast<ConstantSDNode>(Cond1); 9524 if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) { 9525 // select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) --> 9526 // uaddo Cond0, C; select uaddo.1, -1, uaddo.0 9527 // 9528 // The IR equivalent of this transform would have this form: 9529 // %a = add %x, C 9530 // %c = icmp ugt %x, ~C 9531 // %r = select %c, -1, %a 9532 // => 9533 // %u = call {iN,i1} llvm.uadd.with.overflow(%x, C) 9534 // %u0 = extractvalue %u, 0 9535 // %u1 = extractvalue %u, 1 9536 // %r = select %u1, -1, %u0 9537 SDVTList VTs = DAG.getVTList(VT, VT0); 9538 SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1)); 9539 return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0)); 9540 } 9541 } 9542 9543 if (TLI.isOperationLegal(ISD::SELECT_CC, VT) || 9544 (!LegalOperations && 9545 TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) { 9546 // Any flags available in a select/setcc fold will be on the setcc as they 9547 // migrated from fcmp 9548 Flags = N0.getNode()->getFlags(); 9549 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, 9550 N2, N0.getOperand(2)); 9551 SelectNode->setFlags(Flags); 9552 return SelectNode; 9553 } 9554 9555 return SimplifySelect(DL, N0, N1, N2); 9556 } 9557 9558 return SDValue(); 9559 } 9560 9561 // This function assumes all the vselect's arguments are CONCAT_VECTOR 9562 // nodes and that the condition is a BV of ConstantSDNodes (or undefs). 9563 static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { 9564 SDLoc DL(N); 9565 SDValue Cond = N->getOperand(0); 9566 SDValue LHS = N->getOperand(1); 9567 SDValue RHS = N->getOperand(2); 9568 EVT VT = N->getValueType(0); 9569 int NumElems = VT.getVectorNumElements(); 9570 assert(LHS.getOpcode() == ISD::CONCAT_VECTORS && 9571 RHS.getOpcode() == ISD::CONCAT_VECTORS && 9572 Cond.getOpcode() == ISD::BUILD_VECTOR); 9573 9574 // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about 9575 // binary ones here. 9576 if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2) 9577 return SDValue(); 9578 9579 // We're sure we have an even number of elements due to the 9580 // concat_vectors we have as arguments to vselect. 9581 // Skip BV elements until we find one that's not an UNDEF 9582 // After we find an UNDEF element, keep looping until we get to half the 9583 // length of the BV and see if all the non-undef nodes are the same. 
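// Equivalently: each half of the condition build_vector must be a splat of a
// single constant (undef elements allowed), so each half of the result can be
// taken wholesale from one operand of the corresponding concat_vectors.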
9584 ConstantSDNode *BottomHalf = nullptr; 9585 for (int i = 0; i < NumElems / 2; ++i) { 9586 if (Cond->getOperand(i)->isUndef()) 9587 continue; 9588 9589 if (BottomHalf == nullptr) 9590 BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i)); 9591 else if (Cond->getOperand(i).getNode() != BottomHalf) 9592 return SDValue(); 9593 } 9594 9595 // Do the same for the second half of the BuildVector 9596 ConstantSDNode *TopHalf = nullptr; 9597 for (int i = NumElems / 2; i < NumElems; ++i) { 9598 if (Cond->getOperand(i)->isUndef()) 9599 continue; 9600 9601 if (TopHalf == nullptr) 9602 TopHalf = cast<ConstantSDNode>(Cond.getOperand(i)); 9603 else if (Cond->getOperand(i).getNode() != TopHalf) 9604 return SDValue(); 9605 } 9606 9607 assert(TopHalf && BottomHalf && 9608 "One half of the selector was all UNDEFs and the other was all the " 9609 "same value. This should have been addressed before this function."); 9610 return DAG.getNode( 9611 ISD::CONCAT_VECTORS, DL, VT, 9612 BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0), 9613 TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); 9614 } 9615 9616 bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) { 9617 if (!isNullConstant(BasePtr) || Index.getOpcode() != ISD::ADD) 9618 return false; 9619 9620 // For now we check only the LHS of the add. 9621 SDValue LHS = Index.getOperand(0); 9622 SDValue SplatVal = DAG.getSplatValue(LHS); 9623 if (!SplatVal) 9624 return false; 9625 9626 BasePtr = SplatVal; 9627 Index = Index.getOperand(1); 9628 return true; 9629 } 9630 9631 // Fold sext/zext of index into index type. 9632 bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index, 9633 bool Scaled, SelectionDAG &DAG) { 9634 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9635 9636 if (Index.getOpcode() == ISD::ZERO_EXTEND) { 9637 SDValue Op = Index.getOperand(0); 9638 MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED); 9639 if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) { 9640 Index = Op; 9641 return true; 9642 } 9643 } 9644 9645 if (Index.getOpcode() == ISD::SIGN_EXTEND) { 9646 SDValue Op = Index.getOperand(0); 9647 MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED); 9648 if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) { 9649 Index = Op; 9650 return true; 9651 } 9652 } 9653 9654 return false; 9655 } 9656 9657 SDValue DAGCombiner::visitMSCATTER(SDNode *N) { 9658 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N); 9659 SDValue Mask = MSC->getMask(); 9660 SDValue Chain = MSC->getChain(); 9661 SDValue Index = MSC->getIndex(); 9662 SDValue Scale = MSC->getScale(); 9663 SDValue StoreVal = MSC->getValue(); 9664 SDValue BasePtr = MSC->getBasePtr(); 9665 SDLoc DL(N); 9666 9667 // Zap scatters with a zero mask. 
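// A scatter whose mask is known all-false stores no elements, so its only
// remaining effect is its chain.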
9668 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
9669 return Chain;
9670
9671 if (refineUniformBase(BasePtr, Index, DAG)) {
9672 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
9673 return DAG.getMaskedScatter(
9674 DAG.getVTList(MVT::Other), StoreVal.getValueType(), DL, Ops,
9675 MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore());
9676 }
9677
9678 if (refineIndexType(MSC, Index, MSC->isIndexScaled(), DAG)) {
9679 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
9680 return DAG.getMaskedScatter(
9681 DAG.getVTList(MVT::Other), StoreVal.getValueType(), DL, Ops,
9682 MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore());
9683 }
9684
9685 return SDValue();
9686 }
9687
9688 SDValue DAGCombiner::visitMSTORE(SDNode *N) {
9689 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
9690 SDValue Mask = MST->getMask();
9691 SDValue Chain = MST->getChain();
9692 SDLoc DL(N);
9693
9694 // Zap masked stores with a zero mask.
9695 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
9696 return Chain;
9697
9698 // If this is a masked store with an all-ones mask, we can use an unmasked store.
9699 // FIXME: Can we do this for indexed, compressing, or truncating stores?
9700 if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) &&
9701 MST->isUnindexed() && !MST->isCompressingStore() &&
9702 !MST->isTruncatingStore())
9703 return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(),
9704 MST->getBasePtr(), MST->getMemOperand());
9705
9706 // Try transforming N to an indexed store.
9707 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
9708 return SDValue(N, 0);
9709
9710 return SDValue();
9711 }
9712
9713 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
9714 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
9715 SDValue Mask = MGT->getMask();
9716 SDValue Chain = MGT->getChain();
9717 SDValue Index = MGT->getIndex();
9718 SDValue Scale = MGT->getScale();
9719 SDValue PassThru = MGT->getPassThru();
9720 SDValue BasePtr = MGT->getBasePtr();
9721 SDLoc DL(N);
9722
9723 // Zap gathers with a zero mask.
9724 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
9725 return CombineTo(N, PassThru, MGT->getChain());
9726
9727 if (refineUniformBase(BasePtr, Index, DAG)) {
9728 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
9729 return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
9730 PassThru.getValueType(), DL, Ops,
9731 MGT->getMemOperand(), MGT->getIndexType(),
9732 MGT->getExtensionType());
9733 }
9734
9735 if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
9736 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
9737 return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
9738 PassThru.getValueType(), DL, Ops,
9739 MGT->getMemOperand(), MGT->getIndexType(),
9740 MGT->getExtensionType());
9741 }
9742
9743 return SDValue();
9744 }
9745
9746 SDValue DAGCombiner::visitMLOAD(SDNode *N) {
9747 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
9748 SDValue Mask = MLD->getMask();
9749 SDLoc DL(N);
9750
9751 // Zap masked loads with a zero mask.
9752 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
9753 return CombineTo(N, MLD->getPassThru(), MLD->getChain());
9754
9755 // If this is a masked load with an all-ones mask, we can use an unmasked load.
9756 // FIXME: Can we do this for indexed, expanding, or extending loads?
9757 if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && 9758 MLD->isUnindexed() && !MLD->isExpandingLoad() && 9759 MLD->getExtensionType() == ISD::NON_EXTLOAD) { 9760 SDValue NewLd = DAG.getLoad(N->getValueType(0), SDLoc(N), MLD->getChain(), 9761 MLD->getBasePtr(), MLD->getMemOperand()); 9762 return CombineTo(N, NewLd, NewLd.getValue(1)); 9763 } 9764 9765 // Try transforming N to an indexed load. 9766 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) 9767 return SDValue(N, 0); 9768 9769 return SDValue(); 9770 } 9771 9772 /// A vector select of 2 constant vectors can be simplified to math/logic to 9773 /// avoid a variable select instruction and possibly avoid constant loads. 9774 SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) { 9775 SDValue Cond = N->getOperand(0); 9776 SDValue N1 = N->getOperand(1); 9777 SDValue N2 = N->getOperand(2); 9778 EVT VT = N->getValueType(0); 9779 if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 || 9780 !TLI.convertSelectOfConstantsToMath(VT) || 9781 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) || 9782 !ISD::isBuildVectorOfConstantSDNodes(N2.getNode())) 9783 return SDValue(); 9784 9785 // Check if we can use the condition value to increment/decrement a single 9786 // constant value. This simplifies a select to an add and removes a constant 9787 // load/materialization from the general case. 9788 bool AllAddOne = true; 9789 bool AllSubOne = true; 9790 unsigned Elts = VT.getVectorNumElements(); 9791 for (unsigned i = 0; i != Elts; ++i) { 9792 SDValue N1Elt = N1.getOperand(i); 9793 SDValue N2Elt = N2.getOperand(i); 9794 if (N1Elt.isUndef() || N2Elt.isUndef()) 9795 continue; 9796 if (N1Elt.getValueType() != N2Elt.getValueType()) 9797 continue; 9798 9799 const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue(); 9800 const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue(); 9801 if (C1 != C2 + 1) 9802 AllAddOne = false; 9803 if (C1 != C2 - 1) 9804 AllSubOne = false; 9805 } 9806 9807 // Further simplifications for the extra-special cases where the constants are 9808 // all 0 or all -1 should be implemented as folds of these patterns. 9809 SDLoc DL(N); 9810 if (AllAddOne || AllSubOne) { 9811 // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C 9812 // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C 9813 auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; 9814 SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond); 9815 return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2); 9816 } 9817 9818 // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C) 9819 APInt Pow2C; 9820 if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() && 9821 isNullOrNullSplat(N2)) { 9822 SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT); 9823 SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT); 9824 return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC); 9825 } 9826 9827 if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG)) 9828 return V; 9829 9830 // The general case for select-of-constants: 9831 // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2 9832 // ...but that only makes sense if a vselect is slower than 2 logic ops, so 9833 // leave that to a machine-specific pass. 
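// (If Cond is true, sext(Cond) is all-ones, so (all-ones & (C1^C2)) ^ C2 == C1;
// if Cond is false, the 'and' is zero and the 'xor' yields C2.)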
9834 return SDValue(); 9835 } 9836 9837 SDValue DAGCombiner::visitVSELECT(SDNode *N) { 9838 SDValue N0 = N->getOperand(0); 9839 SDValue N1 = N->getOperand(1); 9840 SDValue N2 = N->getOperand(2); 9841 EVT VT = N->getValueType(0); 9842 SDLoc DL(N); 9843 9844 if (SDValue V = DAG.simplifySelect(N0, N1, N2)) 9845 return V; 9846 9847 if (SDValue V = foldBoolSelectToLogic(N, DAG)) 9848 return V; 9849 9850 // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1 9851 if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) 9852 return DAG.getSelect(DL, VT, F, N2, N1); 9853 9854 // Canonicalize integer abs. 9855 // vselect (setg[te] X, 0), X, -X -> 9856 // vselect (setgt X, -1), X, -X -> 9857 // vselect (setl[te] X, 0), -X, X -> 9858 // Y = sra (X, size(X)-1); xor (add (X, Y), Y) 9859 if (N0.getOpcode() == ISD::SETCC) { 9860 SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); 9861 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); 9862 bool isAbs = false; 9863 bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); 9864 9865 if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) || 9866 (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) && 9867 N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1)) 9868 isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode()); 9869 else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) && 9870 N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1)) 9871 isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); 9872 9873 if (isAbs) { 9874 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) 9875 return DAG.getNode(ISD::ABS, DL, VT, LHS); 9876 9877 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS, 9878 DAG.getConstant(VT.getScalarSizeInBits() - 1, 9879 DL, getShiftAmountTy(VT))); 9880 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift); 9881 AddToWorklist(Shift.getNode()); 9882 AddToWorklist(Add.getNode()); 9883 return DAG.getNode(ISD::XOR, DL, VT, Add, Shift); 9884 } 9885 9886 // vselect x, y (fcmp lt x, y) -> fminnum x, y 9887 // vselect x, y (fcmp gt x, y) -> fmaxnum x, y 9888 // 9889 // This is OK if we don't care about what happens if either operand is a 9890 // NaN. 9891 // 9892 if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) { 9893 if (SDValue FMinMax = 9894 combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG)) 9895 return FMinMax; 9896 } 9897 9898 // If this select has a condition (setcc) with narrower operands than the 9899 // select, try to widen the compare to match the select width. 9900 // TODO: This should be extended to handle any constant. 9901 // TODO: This could be extended to handle non-loading patterns, but that 9902 // requires thorough testing to avoid regressions. 9903 if (isNullOrNullSplat(RHS)) { 9904 EVT NarrowVT = LHS.getValueType(); 9905 EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger(); 9906 EVT SetCCVT = getSetCCResultType(LHS.getValueType()); 9907 unsigned SetCCWidth = SetCCVT.getScalarSizeInBits(); 9908 unsigned WideWidth = WideVT.getScalarSizeInBits(); 9909 bool IsSigned = isSignedIntSetCC(CC); 9910 auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 9911 if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() && 9912 SetCCWidth != 1 && SetCCWidth < WideWidth && 9913 TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) && 9914 TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) { 9915 // Both compare operands can be widened for free. 
The LHS can use an 9916 // extended load, and the RHS is a constant: 9917 // vselect (ext (setcc load(X), C)), N1, N2 --> 9918 // vselect (setcc extload(X), C'), N1, N2 9919 auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 9920 SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS); 9921 SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS); 9922 EVT WideSetCCVT = getSetCCResultType(WideVT); 9923 SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC); 9924 return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2); 9925 } 9926 } 9927 9928 // Match VSELECTs into add with unsigned saturation. 9929 if (hasOperation(ISD::UADDSAT, VT)) { 9930 // Check if one of the arms of the VSELECT is vector with all bits set. 9931 // If it's on the left side invert the predicate to simplify logic below. 9932 SDValue Other; 9933 ISD::CondCode SatCC = CC; 9934 if (ISD::isBuildVectorAllOnes(N1.getNode())) { 9935 Other = N2; 9936 SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType()); 9937 } else if (ISD::isBuildVectorAllOnes(N2.getNode())) { 9938 Other = N1; 9939 } 9940 9941 if (Other && Other.getOpcode() == ISD::ADD) { 9942 SDValue CondLHS = LHS, CondRHS = RHS; 9943 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); 9944 9945 // Canonicalize condition operands. 9946 if (SatCC == ISD::SETUGE) { 9947 std::swap(CondLHS, CondRHS); 9948 SatCC = ISD::SETULE; 9949 } 9950 9951 // We can test against either of the addition operands. 9952 // x <= x+y ? x+y : ~0 --> uaddsat x, y 9953 // x+y >= x ? x+y : ~0 --> uaddsat x, y 9954 if (SatCC == ISD::SETULE && Other == CondRHS && 9955 (OpLHS == CondLHS || OpRHS == CondLHS)) 9956 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); 9957 9958 if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) && 9959 CondLHS == OpLHS) { 9960 // If the RHS is a constant we have to reverse the const 9961 // canonicalization. 9962 // x >= ~C ? x+C : ~0 --> uaddsat x, C 9963 auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { 9964 return Cond->getAPIntValue() == ~Op->getAPIntValue(); 9965 }; 9966 if (SatCC == ISD::SETULE && 9967 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT)) 9968 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); 9969 } 9970 } 9971 } 9972 9973 // Match VSELECTs into sub with unsigned saturation. 9974 if (hasOperation(ISD::USUBSAT, VT)) { 9975 // Check if one of the arms of the VSELECT is a zero vector. If it's on 9976 // the left side invert the predicate to simplify logic below. 9977 SDValue Other; 9978 ISD::CondCode SatCC = CC; 9979 if (ISD::isBuildVectorAllZeros(N1.getNode())) { 9980 Other = N2; 9981 SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType()); 9982 } else if (ISD::isBuildVectorAllZeros(N2.getNode())) { 9983 Other = N1; 9984 } 9985 9986 if (Other && Other.getNumOperands() == 2) { 9987 SDValue CondRHS = RHS; 9988 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); 9989 9990 if (Other.getOpcode() == ISD::SUB && 9991 LHS.getOpcode() == ISD::ZERO_EXTEND && LHS.getOperand(0) == OpLHS && 9992 OpRHS.getOpcode() == ISD::TRUNCATE && OpRHS.getOperand(0) == RHS) { 9993 // Look for a general sub with unsigned saturation first. 9994 // zext(x) >= y ? x - trunc(y) : 0 9995 // --> usubsat(x,trunc(umin(y,SatLimit))) 9996 // zext(x) > y ? 
x - trunc(y) : 0 9997 // --> usubsat(x,trunc(umin(y,SatLimit))) 9998 if (SatCC == ISD::SETUGE || SatCC == ISD::SETUGT) 9999 return getTruncatedUSUBSAT(VT, LHS.getValueType(), LHS, RHS, DAG, 10000 DL); 10001 } 10002 10003 if (OpLHS == LHS) { 10004 // Look for a general sub with unsigned saturation first. 10005 // x >= y ? x-y : 0 --> usubsat x, y 10006 // x > y ? x-y : 0 --> usubsat x, y 10007 if ((SatCC == ISD::SETUGE || SatCC == ISD::SETUGT) && 10008 Other.getOpcode() == ISD::SUB && OpRHS == CondRHS) 10009 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); 10010 10011 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) { 10012 if (isa<BuildVectorSDNode>(CondRHS)) { 10013 // If the RHS is a constant we have to reverse the const 10014 // canonicalization. 10015 // x > C-1 ? x+-C : 0 --> usubsat x, C 10016 auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { 10017 return (!Op && !Cond) || 10018 (Op && Cond && 10019 Cond->getAPIntValue() == (-Op->getAPIntValue() - 1)); 10020 }; 10021 if (SatCC == ISD::SETUGT && Other.getOpcode() == ISD::ADD && 10022 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT, 10023 /*AllowUndefs*/ true)) { 10024 OpRHS = DAG.getNode(ISD::SUB, DL, VT, 10025 DAG.getConstant(0, DL, VT), OpRHS); 10026 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); 10027 } 10028 10029 // Another special case: If C was a sign bit, the sub has been 10030 // canonicalized into a xor. 10031 // FIXME: Would it be better to use computeKnownBits to determine 10032 // whether it's safe to decanonicalize the xor? 10033 // x s< 0 ? x^C : 0 --> usubsat x, C 10034 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { 10035 if (SatCC == ISD::SETLT && Other.getOpcode() == ISD::XOR && 10036 ISD::isBuildVectorAllZeros(CondRHS.getNode()) && 10037 OpRHSConst->getAPIntValue().isSignMask()) { 10038 // Note that we have to rebuild the RHS constant here to 10039 // ensure we don't rely on particular values of undef lanes. 10040 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT); 10041 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); 10042 } 10043 } 10044 } 10045 } 10046 } 10047 } 10048 } 10049 } 10050 10051 if (SimplifySelectOps(N, N1, N2)) 10052 return SDValue(N, 0); // Don't revisit N. 10053 10054 // Fold (vselect all_ones, N1, N2) -> N1 10055 if (ISD::isConstantSplatVectorAllOnes(N0.getNode())) 10056 return N1; 10057 // Fold (vselect all_zeros, N1, N2) -> N2 10058 if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) 10059 return N2; 10060 10061 // The ConvertSelectToConcatVector function is assuming both the above 10062 // checks for (vselect (build_vector all{ones,zeros) ...) have been made 10063 // and addressed. 
10064 if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
10065 N2.getOpcode() == ISD::CONCAT_VECTORS &&
10066 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
10067 if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
10068 return CV;
10069 }
10070
10071 if (SDValue V = foldVSelectOfConstants(N))
10072 return V;
10073
10074 return SDValue();
10075 }
10076
10077 SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
10078 SDValue N0 = N->getOperand(0);
10079 SDValue N1 = N->getOperand(1);
10080 SDValue N2 = N->getOperand(2);
10081 SDValue N3 = N->getOperand(3);
10082 SDValue N4 = N->getOperand(4);
10083 ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
10084
10085 // fold select_cc lhs, rhs, x, x, cc -> x
10086 if (N2 == N3)
10087 return N2;
10088
10089 // Determine if the condition we're dealing with is constant
10090 if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
10091 CC, SDLoc(N), false)) {
10092 AddToWorklist(SCC.getNode());
10093
10094 if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
10095 if (!SCCC->isNullValue())
10096 return N2; // cond always true -> true val
10097 else
10098 return N3; // cond always false -> false val
10099 } else if (SCC->isUndef()) {
10100 // When the condition is UNDEF, just return the first operand. This is
10101 // consistent with DAG creation: no setcc node is created in this case.
10102 return N2;
10103 } else if (SCC.getOpcode() == ISD::SETCC) {
10104 // Fold to a simpler select_cc
10105 SDValue SelectOp = DAG.getNode(
10106 ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0),
10107 SCC.getOperand(1), N2, N3, SCC.getOperand(2));
10108 SelectOp->setFlags(SCC->getFlags());
10109 return SelectOp;
10110 }
10111 }
10112
10113 // If we can fold this based on the true/false value, do so.
10114 if (SimplifySelectOps(N, N2, N3))
10115 return SDValue(N, 0); // Don't revisit N.
10116
10117 // fold select_cc into other things, such as min/max/abs
10118 return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
10119 }
10120
10121 SDValue DAGCombiner::visitSETCC(SDNode *N) {
10122 // setcc is very commonly used as an argument to brcond. This pattern
10123 // also lends itself to numerous combines and, as a result, it is desirable
10124 // to keep the argument to a brcond as a setcc as much as possible.
10125 bool PreferSetCC =
10126 N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
10127
10128 SDValue Combined = SimplifySetCC(
10129 N->getValueType(0), N->getOperand(0), N->getOperand(1),
10130 cast<CondCodeSDNode>(N->getOperand(2))->get(), SDLoc(N), !PreferSetCC);
10131
10132 if (!Combined)
10133 return SDValue();
10134
10135 // If we prefer to have a setcc, and we don't, we'll try our best to
10136 // recreate one using rebuildSetCC.
10137 if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
10138 SDValue NewSetCC = rebuildSetCC(Combined);
10139
10140 // We don't have anything interesting to combine to.
10141 if (NewSetCC.getNode() == N)
10142 return SDValue();
10143
10144 if (NewSetCC)
10145 return NewSetCC;
10146 }
10147
10148 return Combined;
10149 }
10150
10151 SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
10152 SDValue LHS = N->getOperand(0);
10153 SDValue RHS = N->getOperand(1);
10154 SDValue Carry = N->getOperand(2);
10155 SDValue Cond = N->getOperand(3);
10156
10157 // If Carry is false, fold to a regular SETCC.
10158 if (isNullConstant(Carry))
10159 return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
10160
10161 return SDValue();
10162 }
10163
10164 /// Check that N satisfies all of the following:
10165 /// N is used once.
10166 /// N is a load.
10167 /// The load is compatible with ExtOpcode, meaning that if the load has an
10168 /// explicit zero/sign extension, ExtOpcode must be the matching extension;
10169 /// otherwise any extension is compatible.
10170 /// Returns true if all of the above hold.
10171 static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) {
10172 if (!N.hasOneUse())
10173 return false;
10174
10175 if (!isa<LoadSDNode>(N))
10176 return false;
10177
10178 LoadSDNode *Load = cast<LoadSDNode>(N);
10179 ISD::LoadExtType LoadExt = Load->getExtensionType();
10180 if (LoadExt == ISD::NON_EXTLOAD || LoadExt == ISD::EXTLOAD)
10181 return true;
10182
10183 // Now LoadExt is either SEXTLOAD or ZEXTLOAD; ExtOpcode must have the same
10184 // extension.
10185 if ((LoadExt == ISD::SEXTLOAD && ExtOpcode != ISD::SIGN_EXTEND) ||
10186 (LoadExt == ISD::ZEXTLOAD && ExtOpcode != ISD::ZERO_EXTEND))
10187 return false;
10188
10189 return true;
10190 }
10191
10192 /// Fold
10193 /// (sext (select c, load x, load y)) -> (select c, sextload x, sextload y)
10194 /// (zext (select c, load x, load y)) -> (select c, zextload x, zextload y)
10195 /// (aext (select c, load x, load y)) -> (select c, extload x, extload y)
10196 /// This function is called by the DAGCombiner when visiting sext/zext/aext
10197 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
10198 static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
10199 SelectionDAG &DAG) {
10200 unsigned Opcode = N->getOpcode();
10201 SDValue N0 = N->getOperand(0);
10202 EVT VT = N->getValueType(0);
10203 SDLoc DL(N);
10204
10205 assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
10206 Opcode == ISD::ANY_EXTEND) &&
10207 "Expected EXTEND dag node in input!");
10208
10209 if (!(N0->getOpcode() == ISD::SELECT || N0->getOpcode() == ISD::VSELECT) ||
10210 !N0.hasOneUse())
10211 return SDValue();
10212
10213 SDValue Op1 = N0->getOperand(1);
10214 SDValue Op2 = N0->getOperand(2);
10215 if (!isCompatibleLoad(Op1, Opcode) || !isCompatibleLoad(Op2, Opcode))
10216 return SDValue();
10217
10218 auto ExtLoadOpcode = ISD::EXTLOAD;
10219 if (Opcode == ISD::SIGN_EXTEND)
10220 ExtLoadOpcode = ISD::SEXTLOAD;
10221 else if (Opcode == ISD::ZERO_EXTEND)
10222 ExtLoadOpcode = ISD::ZEXTLOAD;
10223
10224 LoadSDNode *Load1 = cast<LoadSDNode>(Op1);
10225 LoadSDNode *Load2 = cast<LoadSDNode>(Op2);
10226 if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) ||
10227 !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()))
10228 return SDValue();
10229
10230 SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1);
10231 SDValue Ext2 = DAG.getNode(Opcode, DL, VT, Op2);
10232 return DAG.getSelect(DL, VT, N0->getOperand(0), Ext1, Ext2);
10233 }
10234
10235 /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
10236 /// a build_vector of constants.
10237 /// This function is called by the DAGCombiner when visiting sext/zext/aext
10238 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
10239 /// Vector extends are not folded if operations are legal; this is to
10240 /// avoid introducing illegal build_vector dag nodes.
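/// For example, (v4i32 (zext (v4i16 build_vector <1, 2, undef, 4>))) becomes
/// (v4i32 build_vector <1, 2, 0, 4>); for zero-extends, undef lanes are
/// materialized as zero so the upper bits remain known zero.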
10241 static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, 10242 SelectionDAG &DAG, bool LegalTypes) { 10243 unsigned Opcode = N->getOpcode(); 10244 SDValue N0 = N->getOperand(0); 10245 EVT VT = N->getValueType(0); 10246 SDLoc DL(N); 10247 10248 assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || 10249 Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || 10250 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) 10251 && "Expected EXTEND dag node in input!"); 10252 10253 // fold (sext c1) -> c1 10254 // fold (zext c1) -> c1 10255 // fold (aext c1) -> c1 10256 if (isa<ConstantSDNode>(N0)) 10257 return DAG.getNode(Opcode, DL, VT, N0); 10258 10259 // fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2) 10260 // fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2) 10261 // fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2) 10262 if (N0->getOpcode() == ISD::SELECT) { 10263 SDValue Op1 = N0->getOperand(1); 10264 SDValue Op2 = N0->getOperand(2); 10265 if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) && 10266 (Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) { 10267 // For any_extend, choose sign extension of the constants to allow a 10268 // possible further transform to sign_extend_inreg.i.e. 10269 // 10270 // t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0> 10271 // t2: i64 = any_extend t1 10272 // --> 10273 // t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0> 10274 // --> 10275 // t4: i64 = sign_extend_inreg t3 10276 unsigned FoldOpc = Opcode; 10277 if (FoldOpc == ISD::ANY_EXTEND) 10278 FoldOpc = ISD::SIGN_EXTEND; 10279 return DAG.getSelect(DL, VT, N0->getOperand(0), 10280 DAG.getNode(FoldOpc, DL, VT, Op1), 10281 DAG.getNode(FoldOpc, DL, VT, Op2)); 10282 } 10283 } 10284 10285 // fold (sext (build_vector AllConstants) -> (build_vector AllConstants) 10286 // fold (zext (build_vector AllConstants) -> (build_vector AllConstants) 10287 // fold (aext (build_vector AllConstants) -> (build_vector AllConstants) 10288 EVT SVT = VT.getScalarType(); 10289 if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) && 10290 ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))) 10291 return SDValue(); 10292 10293 // We can fold this node into a build_vector. 10294 unsigned VTBits = SVT.getSizeInBits(); 10295 unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits(); 10296 SmallVector<SDValue, 8> Elts; 10297 unsigned NumElts = VT.getVectorNumElements(); 10298 10299 // For zero-extensions, UNDEF elements still guarantee to have the upper 10300 // bits set to zero. 10301 bool IsZext = 10302 Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG; 10303 10304 for (unsigned i = 0; i != NumElts; ++i) { 10305 SDValue Op = N0.getOperand(i); 10306 if (Op.isUndef()) { 10307 Elts.push_back(IsZext ? DAG.getConstant(0, DL, SVT) : DAG.getUNDEF(SVT)); 10308 continue; 10309 } 10310 10311 SDLoc DL(Op); 10312 // Get the constant value and if needed trunc it to the size of the type. 10313 // Nodes like build_vector might have constants wider than the scalar type. 
10314 APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits); 10315 if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG) 10316 Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT)); 10317 else 10318 Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT)); 10319 } 10320 10321 return DAG.getBuildVector(VT, DL, Elts); 10322 } 10323 10324 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this: 10325 // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))" 10326 // transformation. Returns true if extension are possible and the above 10327 // mentioned transformation is profitable. 10328 static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0, 10329 unsigned ExtOpc, 10330 SmallVectorImpl<SDNode *> &ExtendNodes, 10331 const TargetLowering &TLI) { 10332 bool HasCopyToRegUses = false; 10333 bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType()); 10334 for (SDNode::use_iterator UI = N0.getNode()->use_begin(), 10335 UE = N0.getNode()->use_end(); 10336 UI != UE; ++UI) { 10337 SDNode *User = *UI; 10338 if (User == N) 10339 continue; 10340 if (UI.getUse().getResNo() != N0.getResNo()) 10341 continue; 10342 // FIXME: Only extend SETCC N, N and SETCC N, c for now. 10343 if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) { 10344 ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get(); 10345 if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC)) 10346 // Sign bits will be lost after a zext. 10347 return false; 10348 bool Add = false; 10349 for (unsigned i = 0; i != 2; ++i) { 10350 SDValue UseOp = User->getOperand(i); 10351 if (UseOp == N0) 10352 continue; 10353 if (!isa<ConstantSDNode>(UseOp)) 10354 return false; 10355 Add = true; 10356 } 10357 if (Add) 10358 ExtendNodes.push_back(User); 10359 continue; 10360 } 10361 // If truncates aren't free and there are users we can't 10362 // extend, it isn't worthwhile. 10363 if (!isTruncFree) 10364 return false; 10365 // Remember if this value is live-out. 10366 if (User->getOpcode() == ISD::CopyToReg) 10367 HasCopyToRegUses = true; 10368 } 10369 10370 if (HasCopyToRegUses) { 10371 bool BothLiveOut = false; 10372 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 10373 UI != UE; ++UI) { 10374 SDUse &Use = UI.getUse(); 10375 if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) { 10376 BothLiveOut = true; 10377 break; 10378 } 10379 } 10380 if (BothLiveOut) 10381 // Both unextended and extended values are live out. There had better be 10382 // a good reason for the transformation. 10383 return ExtendNodes.size(); 10384 } 10385 return true; 10386 } 10387 10388 void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, 10389 SDValue OrigLoad, SDValue ExtLoad, 10390 ISD::NodeType ExtType) { 10391 // Extend SetCC uses if necessary. 10392 SDLoc DL(ExtLoad); 10393 for (SDNode *SetCC : SetCCs) { 10394 SmallVector<SDValue, 4> Ops; 10395 10396 for (unsigned j = 0; j != 2; ++j) { 10397 SDValue SOp = SetCC->getOperand(j); 10398 if (SOp == OrigLoad) 10399 Ops.push_back(ExtLoad); 10400 else 10401 Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp)); 10402 } 10403 10404 Ops.push_back(SetCC->getOperand(2)); 10405 CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops)); 10406 } 10407 } 10408 10409 // FIXME: Bring more similar combines here, common to sext/zext (maybe aext?). 
10410 SDValue DAGCombiner::CombineExtLoad(SDNode *N) { 10411 SDValue N0 = N->getOperand(0); 10412 EVT DstVT = N->getValueType(0); 10413 EVT SrcVT = N0.getValueType(); 10414 10415 assert((N->getOpcode() == ISD::SIGN_EXTEND || 10416 N->getOpcode() == ISD::ZERO_EXTEND) && 10417 "Unexpected node type (not an extend)!"); 10418 10419 // fold (sext (load x)) to multiple smaller sextloads; same for zext. 10420 // For example, on a target with legal v4i32, but illegal v8i32, turn: 10421 // (v8i32 (sext (v8i16 (load x)))) 10422 // into: 10423 // (v8i32 (concat_vectors (v4i32 (sextload x)), 10424 // (v4i32 (sextload (x + 16))))) 10425 // Where uses of the original load, i.e.: 10426 // (v8i16 (load x)) 10427 // are replaced with: 10428 // (v8i16 (truncate 10429 // (v8i32 (concat_vectors (v4i32 (sextload x)), 10430 // (v4i32 (sextload (x + 16))))))) 10431 // 10432 // This combine is only applicable to illegal, but splittable, vectors. 10433 // All legal types, and illegal non-vector types, are handled elsewhere. 10434 // This combine is controlled by TargetLowering::isVectorLoadExtDesirable. 10435 // 10436 if (N0->getOpcode() != ISD::LOAD) 10437 return SDValue(); 10438 10439 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 10440 10441 if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) || 10442 !N0.hasOneUse() || !LN0->isSimple() || 10443 !DstVT.isVector() || !DstVT.isPow2VectorType() || 10444 !TLI.isVectorLoadExtDesirable(SDValue(N, 0))) 10445 return SDValue(); 10446 10447 SmallVector<SDNode *, 4> SetCCs; 10448 if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI)) 10449 return SDValue(); 10450 10451 ISD::LoadExtType ExtType = 10452 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 10453 10454 // Try to split the vector types to get down to legal types. 10455 EVT SplitSrcVT = SrcVT; 10456 EVT SplitDstVT = DstVT; 10457 while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) && 10458 SplitSrcVT.getVectorNumElements() > 1) { 10459 SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first; 10460 SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first; 10461 } 10462 10463 if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) 10464 return SDValue(); 10465 10466 assert(!DstVT.isScalableVector() && "Unexpected scalable vector type"); 10467 10468 SDLoc DL(N); 10469 const unsigned NumSplits = 10470 DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements(); 10471 const unsigned Stride = SplitSrcVT.getStoreSize(); 10472 SmallVector<SDValue, 4> Loads; 10473 SmallVector<SDValue, 4> Chains; 10474 10475 SDValue BasePtr = LN0->getBasePtr(); 10476 for (unsigned Idx = 0; Idx < NumSplits; Idx++) { 10477 const unsigned Offset = Idx * Stride; 10478 const Align Align = commonAlignment(LN0->getAlign(), Offset); 10479 10480 SDValue SplitLoad = DAG.getExtLoad( 10481 ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr, 10482 LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align, 10483 LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); 10484 10485 BasePtr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(Stride), DL); 10486 10487 Loads.push_back(SplitLoad.getValue(0)); 10488 Chains.push_back(SplitLoad.getValue(1)); 10489 } 10490 10491 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 10492 SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); 10493 10494 // Simplify TF. 
10495 AddToWorklist(NewChain.getNode()); 10496 10497 CombineTo(N, NewValue); 10498 10499 // Replace uses of the original load (before extension) 10500 // with a truncate of the concatenated sextloaded vectors. 10501 SDValue Trunc = 10502 DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue); 10503 ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode()); 10504 CombineTo(N0.getNode(), Trunc, NewChain); 10505 return SDValue(N, 0); // Return N so it doesn't get rechecked! 10506 } 10507 10508 // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) -> 10509 // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst)) 10510 SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) { 10511 assert(N->getOpcode() == ISD::ZERO_EXTEND); 10512 EVT VT = N->getValueType(0); 10513 EVT OrigVT = N->getOperand(0).getValueType(); 10514 if (TLI.isZExtFree(OrigVT, VT)) 10515 return SDValue(); 10516 10517 // and/or/xor 10518 SDValue N0 = N->getOperand(0); 10519 if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || 10520 N0.getOpcode() == ISD::XOR) || 10521 N0.getOperand(1).getOpcode() != ISD::Constant || 10522 (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))) 10523 return SDValue(); 10524 10525 // shl/shr 10526 SDValue N1 = N0->getOperand(0); 10527 if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) || 10528 N1.getOperand(1).getOpcode() != ISD::Constant || 10529 (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT))) 10530 return SDValue(); 10531 10532 // load 10533 if (!isa<LoadSDNode>(N1.getOperand(0))) 10534 return SDValue(); 10535 LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0)); 10536 EVT MemVT = Load->getMemoryVT(); 10537 if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) || 10538 Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed()) 10539 return SDValue(); 10540 10541 10542 // If the shift op is SHL, the logic op must be AND, otherwise the result 10543 // will be wrong. 10544 if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND) 10545 return SDValue(); 10546 10547 if (!N0.hasOneUse() || !N1.hasOneUse()) 10548 return SDValue(); 10549 10550 SmallVector<SDNode*, 4> SetCCs; 10551 if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0), 10552 ISD::ZERO_EXTEND, SetCCs, TLI)) 10553 return SDValue(); 10554 10555 // Actually do the transformation. 10556 SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT, 10557 Load->getChain(), Load->getBasePtr(), 10558 Load->getMemoryVT(), Load->getMemOperand()); 10559 10560 SDLoc DL1(N1); 10561 SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad, 10562 N1.getOperand(1)); 10563 10564 APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); 10565 SDLoc DL0(N0); 10566 SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift, 10567 DAG.getConstant(Mask, DL0, VT)); 10568 10569 ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND); 10570 CombineTo(N, And); 10571 if (SDValue(Load, 0).hasOneUse()) { 10572 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1)); 10573 } else { 10574 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load), 10575 Load->getValueType(0), ExtLoad); 10576 CombineTo(Load, Trunc, ExtLoad.getValue(1)); 10577 } 10578 10579 // N0 is dead at this point. 10580 recursivelyDeleteUnusedNodes(N0.getNode()); 10581 10582 return SDValue(N,0); // Return N so it doesn't get rechecked! 
10583 } 10584 10585 /// If we're narrowing or widening the result of a vector select and the final 10586 /// size is the same size as a setcc (compare) feeding the select, then try to 10587 /// apply the cast operation to the select's operands because matching vector 10588 /// sizes for a select condition and other operands should be more efficient. 10589 SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) { 10590 unsigned CastOpcode = Cast->getOpcode(); 10591 assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND || 10592 CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND || 10593 CastOpcode == ISD::FP_ROUND) && 10594 "Unexpected opcode for vector select narrowing/widening"); 10595 10596 // We only do this transform before legal ops because the pattern may be 10597 // obfuscated by target-specific operations after legalization. Do not create 10598 // an illegal select op, however, because that may be difficult to lower. 10599 EVT VT = Cast->getValueType(0); 10600 if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) 10601 return SDValue(); 10602 10603 SDValue VSel = Cast->getOperand(0); 10604 if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() || 10605 VSel.getOperand(0).getOpcode() != ISD::SETCC) 10606 return SDValue(); 10607 10608 // Does the setcc have the same vector size as the casted select? 10609 SDValue SetCC = VSel.getOperand(0); 10610 EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType()); 10611 if (SetCCVT.getSizeInBits() != VT.getSizeInBits()) 10612 return SDValue(); 10613 10614 // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B) 10615 SDValue A = VSel.getOperand(1); 10616 SDValue B = VSel.getOperand(2); 10617 SDValue CastA, CastB; 10618 SDLoc DL(Cast); 10619 if (CastOpcode == ISD::FP_ROUND) { 10620 // FP_ROUND (fptrunc) has an extra flag operand to pass along. 10621 CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1)); 10622 CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1)); 10623 } else { 10624 CastA = DAG.getNode(CastOpcode, DL, VT, A); 10625 CastB = DAG.getNode(CastOpcode, DL, VT, B); 10626 } 10627 return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB); 10628 } 10629 10630 // fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x))) 10631 // fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x))) 10632 static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner, 10633 const TargetLowering &TLI, EVT VT, 10634 bool LegalOperations, SDNode *N, 10635 SDValue N0, ISD::LoadExtType ExtLoadType) { 10636 SDNode *N0Node = N0.getNode(); 10637 bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? 
ISD::isSEXTLoad(N0Node) 10638 : ISD::isZEXTLoad(N0Node); 10639 if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) || 10640 !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse()) 10641 return SDValue(); 10642 10643 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 10644 EVT MemVT = LN0->getMemoryVT(); 10645 if ((LegalOperations || !LN0->isSimple() || 10646 VT.isVector()) && 10647 !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT)) 10648 return SDValue(); 10649 10650 SDValue ExtLoad = 10651 DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(), 10652 LN0->getBasePtr(), MemVT, LN0->getMemOperand()); 10653 Combiner.CombineTo(N, ExtLoad); 10654 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); 10655 if (LN0->use_empty()) 10656 Combiner.recursivelyDeleteUnusedNodes(LN0); 10657 return SDValue(N, 0); // Return N so it doesn't get rechecked! 10658 } 10659 10660 // fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x))) 10661 // Only generate vector extloads when 1) they're legal, and 2) they are 10662 // deemed desirable by the target. 10663 static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, 10664 const TargetLowering &TLI, EVT VT, 10665 bool LegalOperations, SDNode *N, SDValue N0, 10666 ISD::LoadExtType ExtLoadType, 10667 ISD::NodeType ExtOpc) { 10668 if (!ISD::isNON_EXTLoad(N0.getNode()) || 10669 !ISD::isUNINDEXEDLoad(N0.getNode()) || 10670 ((LegalOperations || VT.isVector() || 10671 !cast<LoadSDNode>(N0)->isSimple()) && 10672 !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType()))) 10673 return {}; 10674 10675 bool DoXform = true; 10676 SmallVector<SDNode *, 4> SetCCs; 10677 if (!N0.hasOneUse()) 10678 DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI); 10679 if (VT.isVector()) 10680 DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); 10681 if (!DoXform) 10682 return {}; 10683 10684 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 10685 SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(), 10686 LN0->getBasePtr(), N0.getValueType(), 10687 LN0->getMemOperand()); 10688 Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc); 10689 // If the load value is used only by N, replace it via CombineTo N. 10690 bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); 10691 Combiner.CombineTo(N, ExtLoad); 10692 if (NoReplaceTrunc) { 10693 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); 10694 Combiner.recursivelyDeleteUnusedNodes(LN0); 10695 } else { 10696 SDValue Trunc = 10697 DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); 10698 Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1)); 10699 } 10700 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
10701 } 10702 10703 static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG, 10704 const TargetLowering &TLI, EVT VT, 10705 SDNode *N, SDValue N0, 10706 ISD::LoadExtType ExtLoadType, 10707 ISD::NodeType ExtOpc) { 10708 if (!N0.hasOneUse()) 10709 return SDValue(); 10710 10711 MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0); 10712 if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD) 10713 return SDValue(); 10714 10715 if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0))) 10716 return SDValue(); 10717 10718 if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0))) 10719 return SDValue(); 10720 10721 SDLoc dl(Ld); 10722 SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru()); 10723 SDValue NewLoad = DAG.getMaskedLoad( 10724 VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(), 10725 PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(), 10726 ExtLoadType, Ld->isExpandingLoad()); 10727 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); 10728 return NewLoad; 10729 } 10730 10731 static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG, 10732 bool LegalOperations) { 10733 assert((N->getOpcode() == ISD::SIGN_EXTEND || 10734 N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext"); 10735 10736 SDValue SetCC = N->getOperand(0); 10737 if (LegalOperations || SetCC.getOpcode() != ISD::SETCC || 10738 !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1) 10739 return SDValue(); 10740 10741 SDValue X = SetCC.getOperand(0); 10742 SDValue Ones = SetCC.getOperand(1); 10743 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); 10744 EVT VT = N->getValueType(0); 10745 EVT XVT = X.getValueType(); 10746 // setge X, C is canonicalized to setgt, so we do not need to match that 10747 // pattern. The setlt sibling is folded in SimplifySelectCC() because it does 10748 // not require the 'not' op. 10749 if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) { 10750 // Invert and smear/shift the sign bit: 10751 // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1) 10752 // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1) 10753 SDLoc DL(N); 10754 unsigned ShCt = VT.getSizeInBits() - 1; 10755 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10756 if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) { 10757 SDValue NotX = DAG.getNOT(DL, X, VT); 10758 SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT); 10759 auto ShiftOpcode = 10760 N->getOpcode() == ISD::SIGN_EXTEND ? 
ISD::SRA : ISD::SRL; 10761 return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount); 10762 } 10763 } 10764 return SDValue(); 10765 } 10766 10767 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { 10768 SDValue N0 = N->getOperand(0); 10769 EVT VT = N->getValueType(0); 10770 SDLoc DL(N); 10771 10772 if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) 10773 return Res; 10774 10775 // fold (sext (sext x)) -> (sext x) 10776 // fold (sext (aext x)) -> (sext x) 10777 if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) 10778 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)); 10779 10780 if (N0.getOpcode() == ISD::TRUNCATE) { 10781 // fold (sext (truncate (load x))) -> (sext (smaller load x)) 10782 // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n))) 10783 if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { 10784 SDNode *oye = N0.getOperand(0).getNode(); 10785 if (NarrowLoad.getNode() != N0.getNode()) { 10786 CombineTo(N0.getNode(), NarrowLoad); 10787 // CombineTo deleted the truncate, if needed, but not what's under it. 10788 AddToWorklist(oye); 10789 } 10790 return SDValue(N, 0); // Return N so it doesn't get rechecked! 10791 } 10792 10793 // See if the value being truncated is already sign extended. If so, just 10794 // eliminate the trunc/sext pair. 10795 SDValue Op = N0.getOperand(0); 10796 unsigned OpBits = Op.getScalarValueSizeInBits(); 10797 unsigned MidBits = N0.getScalarValueSizeInBits(); 10798 unsigned DestBits = VT.getScalarSizeInBits(); 10799 unsigned NumSignBits = DAG.ComputeNumSignBits(Op); 10800 10801 if (OpBits == DestBits) { 10802 // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign 10803 // bits, it is already ready. 10804 if (NumSignBits > DestBits-MidBits) 10805 return Op; 10806 } else if (OpBits < DestBits) { 10807 // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign 10808 // bits, just sext from i32. 10809 if (NumSignBits > OpBits-MidBits) 10810 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op); 10811 } else { 10812 // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign 10813 // bits, just truncate to i32. 10814 if (NumSignBits > OpBits-MidBits) 10815 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); 10816 } 10817 10818 // fold (sext (truncate x)) -> (sextinreg x). 10819 if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, 10820 N0.getValueType())) { 10821 if (OpBits < DestBits) 10822 Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op); 10823 else if (OpBits > DestBits) 10824 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); 10825 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, 10826 DAG.getValueType(N0.getValueType())); 10827 } 10828 } 10829 10830 // Try to simplify (sext (load x)). 10831 if (SDValue foldedExt = 10832 tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0, 10833 ISD::SEXTLOAD, ISD::SIGN_EXTEND)) 10834 return foldedExt; 10835 10836 if (SDValue foldedExt = 10837 tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD, 10838 ISD::SIGN_EXTEND)) 10839 return foldedExt; 10840 10841 // fold (sext (load x)) to multiple smaller sextloads. 10842 // Only on illegal but splittable vectors. 10843 if (SDValue ExtLoad = CombineExtLoad(N)) 10844 return ExtLoad; 10845 10846 // Try to simplify (sext (sextload x)). 
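// Illustrative sketch (i8/i32/i64 are example types, not taken from any
// particular target): for a single-use extending load feeding the sext,
//   t1: i32 = sextload<i8> [x]
//   t2: i64 = sign_extend t1
// the helper below rebuilds the load directly at the wider result type,
//   t2: i64 = sextload<i8> [x]
// provided the i8 -> i64 sextload is legal for the target (or we are still
// before operation legalization and the load is simple).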
10847 if (SDValue foldedExt = tryToFoldExtOfExtload( 10848 DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD)) 10849 return foldedExt; 10850 10851 // fold (sext (and/or/xor (load x), cst)) -> 10852 // (and/or/xor (sextload x), (sext cst)) 10853 if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || 10854 N0.getOpcode() == ISD::XOR) && 10855 isa<LoadSDNode>(N0.getOperand(0)) && 10856 N0.getOperand(1).getOpcode() == ISD::Constant && 10857 (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { 10858 LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0)); 10859 EVT MemVT = LN00->getMemoryVT(); 10860 if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) && 10861 LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) { 10862 SmallVector<SDNode*, 4> SetCCs; 10863 bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0), 10864 ISD::SIGN_EXTEND, SetCCs, TLI); 10865 if (DoXform) { 10866 SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT, 10867 LN00->getChain(), LN00->getBasePtr(), 10868 LN00->getMemoryVT(), 10869 LN00->getMemOperand()); 10870 APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits()); 10871 SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, 10872 ExtLoad, DAG.getConstant(Mask, DL, VT)); 10873 ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND); 10874 bool NoReplaceTruncAnd = !N0.hasOneUse(); 10875 bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse(); 10876 CombineTo(N, And); 10877 // If N0 has multiple uses, change other uses as well. 10878 if (NoReplaceTruncAnd) { 10879 SDValue TruncAnd = 10880 DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); 10881 CombineTo(N0.getNode(), TruncAnd); 10882 } 10883 if (NoReplaceTrunc) { 10884 DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1)); 10885 } else { 10886 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00), 10887 LN00->getValueType(0), ExtLoad); 10888 CombineTo(LN00, Trunc, ExtLoad.getValue(1)); 10889 } 10890 return SDValue(N,0); // Return N so it doesn't get rechecked! 10891 } 10892 } 10893 } 10894 10895 if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations)) 10896 return V; 10897 10898 if (N0.getOpcode() == ISD::SETCC) { 10899 SDValue N00 = N0.getOperand(0); 10900 SDValue N01 = N0.getOperand(1); 10901 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); 10902 EVT N00VT = N00.getValueType(); 10903 10904 // sext(setcc) -> sext_in_reg(vsetcc) for vectors. 10905 // Only do this before legalize for now. 10906 if (VT.isVector() && !LegalOperations && 10907 TLI.getBooleanContents(N00VT) == 10908 TargetLowering::ZeroOrNegativeOneBooleanContent) { 10909 // On some architectures (such as SSE/NEON/etc) the SETCC result type is 10910 // of the same size as the compared operands. Only optimize sext(setcc()) 10911 // if this is the case. 10912 EVT SVT = getSetCCResultType(N00VT); 10913 10914 // If we already have the desired type, don't change it. 10915 if (SVT != N0.getValueType()) { 10916 // We know that the # elements of the results is the same as the 10917 // # elements of the compare (and the # elements of the compare result 10918 // for that matter). Check to see that they are the same size. If so, 10919 // we know that the element size of the sext'd result matches the 10920 // element size of the compare operands. 
10921 if (VT.getSizeInBits() == SVT.getSizeInBits()) 10922 return DAG.getSetCC(DL, VT, N00, N01, CC); 10923 10924 // If the desired elements are smaller or larger than the source 10925 // elements, we can use a matching integer vector type and then 10926 // truncate/sign extend. 10927 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); 10928 if (SVT == MatchingVecType) { 10929 SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); 10930 return DAG.getSExtOrTrunc(VsetCC, DL, VT); 10931 } 10932 } 10933 } 10934 10935 // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) 10936 // Here, T can be 1 or -1, depending on the type of the setcc and 10937 // getBooleanContents(). 10938 unsigned SetCCWidth = N0.getScalarValueSizeInBits(); 10939 10940 // To determine the "true" side of the select, we need to know the high bit 10941 // of the value returned by the setcc if it evaluates to true. 10942 // If the type of the setcc is i1, then the true case of the select is just 10943 // sext(i1 1), that is, -1. 10944 // If the type of the setcc is larger (say, i8) then the value of the high 10945 // bit depends on getBooleanContents(), so ask TLI for a real "true" value 10946 // of the appropriate width. 10947 SDValue ExtTrueVal = (SetCCWidth == 1) 10948 ? DAG.getAllOnesConstant(DL, VT) 10949 : DAG.getBoolConstant(true, DL, VT, N00VT); 10950 SDValue Zero = DAG.getConstant(0, DL, VT); 10951 if (SDValue SCC = 10952 SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true)) 10953 return SCC; 10954 10955 if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) { 10956 EVT SetCCVT = getSetCCResultType(N00VT); 10957 // Don't do this transform for i1 because there's a select transform 10958 // that would reverse it. 10959 // TODO: We should not do this transform at all without a target hook 10960 // because a sext is likely cheaper than a select? 10961 if (SetCCVT.getScalarSizeInBits() != 1 && 10962 (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) { 10963 SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC); 10964 return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero); 10965 } 10966 } 10967 } 10968 10969 // fold (sext x) -> (zext x) if the sign bit is known zero. 
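// Illustrative example (i32/i64 chosen for exposition): if X = (srl Y, 1)
// then bit 31 of X is known zero, so (i64 (sext X)) equals (i64 (zext X)),
// and the zext form may enable further zero-extend based folds.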
10970 if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && 10971 DAG.SignBitIsZero(N0)) 10972 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0); 10973 10974 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) 10975 return NewVSel; 10976 10977 // Eliminate this sign extend by doing a negation in the destination type: 10978 // sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64) 10979 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && 10980 isNullOrNullSplat(N0.getOperand(0)) && 10981 N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND && 10982 TLI.isOperationLegalOrCustom(ISD::SUB, VT)) { 10983 SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT); 10984 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext); 10985 } 10986 // Eliminate this sign extend by doing a decrement in the destination type: 10987 // sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1) 10988 if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && 10989 isAllOnesOrAllOnesSplat(N0.getOperand(1)) && 10990 N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && 10991 TLI.isOperationLegalOrCustom(ISD::ADD, VT)) { 10992 SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT); 10993 return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); 10994 } 10995 10996 // fold sext (not i1 X) -> add (zext i1 X), -1 10997 // TODO: This could be extended to handle bool vectors. 10998 if (N0.getValueType() == MVT::i1 && isBitwiseNot(N0) && N0.hasOneUse() && 10999 (!LegalOperations || (TLI.isOperationLegal(ISD::ZERO_EXTEND, VT) && 11000 TLI.isOperationLegal(ISD::ADD, VT)))) { 11001 // If we can eliminate the 'not', the sext form should be better 11002 if (SDValue NewXor = visitXOR(N0.getNode())) { 11003 // Returning N0 is a form of in-visit replacement that may have 11004 // invalidated N0. 11005 if (NewXor.getNode() == N0.getNode()) { 11006 // Return SDValue here as the xor should have already been replaced in 11007 // this sext. 11008 return SDValue(); 11009 } else { 11010 // Return a new sext with the new xor. 11011 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewXor); 11012 } 11013 } 11014 11015 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); 11016 return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); 11017 } 11018 11019 if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) 11020 return Res; 11021 11022 return SDValue(); 11023 } 11024 11025 // isTruncateOf - If N is a truncate of some other value, return true, record 11026 // the value being truncated in Op and which of Op's bits are zero/one in Known. 11027 // This function computes KnownBits to avoid a duplicated call to 11028 // computeKnownBits in the caller. 
11029 static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, 11030 KnownBits &Known) { 11031 if (N->getOpcode() == ISD::TRUNCATE) { 11032 Op = N->getOperand(0); 11033 Known = DAG.computeKnownBits(Op); 11034 return true; 11035 } 11036 11037 if (N.getOpcode() != ISD::SETCC || 11038 N.getValueType().getScalarType() != MVT::i1 || 11039 cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE) 11040 return false; 11041 11042 SDValue Op0 = N->getOperand(0); 11043 SDValue Op1 = N->getOperand(1); 11044 assert(Op0.getValueType() == Op1.getValueType()); 11045 11046 if (isNullOrNullSplat(Op0)) 11047 Op = Op1; 11048 else if (isNullOrNullSplat(Op1)) 11049 Op = Op0; 11050 else 11051 return false; 11052 11053 Known = DAG.computeKnownBits(Op); 11054 11055 return (Known.Zero | 1).isAllOnesValue(); 11056 } 11057 11058 /// Given an extending node with a pop-count operand, if the target does not 11059 /// support a pop-count in the narrow source type but does support it in the 11060 /// destination type, widen the pop-count to the destination type. 11061 static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) { 11062 assert((Extend->getOpcode() == ISD::ZERO_EXTEND || 11063 Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op"); 11064 11065 SDValue CtPop = Extend->getOperand(0); 11066 if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse()) 11067 return SDValue(); 11068 11069 EVT VT = Extend->getValueType(0); 11070 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11071 if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) || 11072 !TLI.isOperationLegalOrCustom(ISD::CTPOP, VT)) 11073 return SDValue(); 11074 11075 // zext (ctpop X) --> ctpop (zext X) 11076 SDLoc DL(Extend); 11077 SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT); 11078 return DAG.getNode(ISD::CTPOP, DL, VT, NewZext); 11079 } 11080 11081 SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { 11082 SDValue N0 = N->getOperand(0); 11083 EVT VT = N->getValueType(0); 11084 11085 if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) 11086 return Res; 11087 11088 // fold (zext (zext x)) -> (zext x) 11089 // fold (zext (aext x)) -> (zext x) 11090 if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) 11091 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, 11092 N0.getOperand(0)); 11093 11094 // fold (zext (truncate x)) -> (zext x) or 11095 // (zext (truncate x)) -> (truncate x) 11096 // This is valid when the truncated bits of x are already zero. 11097 SDValue Op; 11098 KnownBits Known; 11099 if (isTruncateOf(DAG, N0, Op, Known)) { 11100 APInt TruncatedBits = 11101 (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ? 
11102 APInt(Op.getScalarValueSizeInBits(), 0) : 11103 APInt::getBitsSet(Op.getScalarValueSizeInBits(), 11104 N0.getScalarValueSizeInBits(), 11105 std::min(Op.getScalarValueSizeInBits(), 11106 VT.getScalarSizeInBits())); 11107 if (TruncatedBits.isSubsetOf(Known.Zero)) 11108 return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); 11109 } 11110 11111 // fold (zext (truncate x)) -> (and x, mask) 11112 if (N0.getOpcode() == ISD::TRUNCATE) { 11113 // fold (zext (truncate (load x))) -> (zext (smaller load x)) 11114 // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n))) 11115 if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { 11116 SDNode *oye = N0.getOperand(0).getNode(); 11117 if (NarrowLoad.getNode() != N0.getNode()) { 11118 CombineTo(N0.getNode(), NarrowLoad); 11119 // CombineTo deleted the truncate, if needed, but not what's under it. 11120 AddToWorklist(oye); 11121 } 11122 return SDValue(N, 0); // Return N so it doesn't get rechecked! 11123 } 11124 11125 EVT SrcVT = N0.getOperand(0).getValueType(); 11126 EVT MinVT = N0.getValueType(); 11127 11128 // Try to mask before the extension to avoid having to generate a larger mask, 11129 // possibly over several sub-vectors. 11130 if (SrcVT.bitsLT(VT) && VT.isVector()) { 11131 if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) && 11132 TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) { 11133 SDValue Op = N0.getOperand(0); 11134 Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT); 11135 AddToWorklist(Op.getNode()); 11136 SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT); 11137 // Transfer the debug info; the new node is equivalent to N0. 11138 DAG.transferDbgValues(N0, ZExtOrTrunc); 11139 return ZExtOrTrunc; 11140 } 11141 } 11142 11143 if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { 11144 SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); 11145 AddToWorklist(Op.getNode()); 11146 SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT); 11147 // We may safely transfer the debug info describing the truncate node over 11148 // to the equivalent and operation. 11149 DAG.transferDbgValues(N0, And); 11150 return And; 11151 } 11152 } 11153 11154 // Fold (zext (and (trunc x), cst)) -> (and x, cst), 11155 // if either of the casts is not free. 11156 if (N0.getOpcode() == ISD::AND && 11157 N0.getOperand(0).getOpcode() == ISD::TRUNCATE && 11158 N0.getOperand(1).getOpcode() == ISD::Constant && 11159 (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), 11160 N0.getValueType()) || 11161 !TLI.isZExtFree(N0.getValueType(), VT))) { 11162 SDValue X = N0.getOperand(0).getOperand(0); 11163 X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); 11164 APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); 11165 SDLoc DL(N); 11166 return DAG.getNode(ISD::AND, DL, VT, 11167 X, DAG.getConstant(Mask, DL, VT)); 11168 } 11169 11170 // Try to simplify (zext (load x)). 11171 if (SDValue foldedExt = 11172 tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0, 11173 ISD::ZEXTLOAD, ISD::ZERO_EXTEND)) 11174 return foldedExt; 11175 11176 if (SDValue foldedExt = 11177 tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD, 11178 ISD::ZERO_EXTEND)) 11179 return foldedExt; 11180 11181 // fold (zext (load x)) to multiple smaller zextloads. 11182 // Only on illegal but splittable vectors. 
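// Illustrative example (vector types chosen for exposition, mirroring the
// sign-extend example documented at CombineExtLoad): on a target with
// legal v4i32 but illegal v8i32,
//   (v8i32 (zext (v8i16 (load x))))
// is split into a concat_vectors of two v4i32 zextloads, the second one
// reading from x advanced by the store size of the low v4i16 half.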
11183 if (SDValue ExtLoad = CombineExtLoad(N)) 11184 return ExtLoad; 11185 11186 // fold (zext (and/or/xor (load x), cst)) -> 11187 // (and/or/xor (zextload x), (zext cst)) 11188 // Unless (and (load x) cst) will match as a zextload already and has 11189 // additional users. 11190 if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || 11191 N0.getOpcode() == ISD::XOR) && 11192 isa<LoadSDNode>(N0.getOperand(0)) && 11193 N0.getOperand(1).getOpcode() == ISD::Constant && 11194 (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { 11195 LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0)); 11196 EVT MemVT = LN00->getMemoryVT(); 11197 if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) && 11198 LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) { 11199 bool DoXform = true; 11200 SmallVector<SDNode*, 4> SetCCs; 11201 if (!N0.hasOneUse()) { 11202 if (N0.getOpcode() == ISD::AND) { 11203 auto *AndC = cast<ConstantSDNode>(N0.getOperand(1)); 11204 EVT LoadResultTy = AndC->getValueType(0); 11205 EVT ExtVT; 11206 if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT)) 11207 DoXform = false; 11208 } 11209 } 11210 if (DoXform) 11211 DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0), 11212 ISD::ZERO_EXTEND, SetCCs, TLI); 11213 if (DoXform) { 11214 SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT, 11215 LN00->getChain(), LN00->getBasePtr(), 11216 LN00->getMemoryVT(), 11217 LN00->getMemOperand()); 11218 APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); 11219 SDLoc DL(N); 11220 SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, 11221 ExtLoad, DAG.getConstant(Mask, DL, VT)); 11222 ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND); 11223 bool NoReplaceTruncAnd = !N0.hasOneUse(); 11224 bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse(); 11225 CombineTo(N, And); 11226 // If N0 has multiple uses, change other uses as well. 11227 if (NoReplaceTruncAnd) { 11228 SDValue TruncAnd = 11229 DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); 11230 CombineTo(N0.getNode(), TruncAnd); 11231 } 11232 if (NoReplaceTrunc) { 11233 DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1)); 11234 } else { 11235 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00), 11236 LN00->getValueType(0), ExtLoad); 11237 CombineTo(LN00, Trunc, ExtLoad.getValue(1)); 11238 } 11239 return SDValue(N,0); // Return N so it doesn't get rechecked! 11240 } 11241 } 11242 } 11243 11244 // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) -> 11245 // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst)) 11246 if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N)) 11247 return ZExtLoad; 11248 11249 // Try to simplify (zext (zextload x)). 11250 if (SDValue foldedExt = tryToFoldExtOfExtload( 11251 DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD)) 11252 return foldedExt; 11253 11254 if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations)) 11255 return V; 11256 11257 if (N0.getOpcode() == ISD::SETCC) { 11258 // Only do this before legalize for now. 11259 if (!LegalOperations && VT.isVector() && 11260 N0.getValueType().getVectorElementType() == MVT::i1) { 11261 EVT N00VT = N0.getOperand(0).getValueType(); 11262 if (getSetCCResultType(N00VT) == N0.getValueType()) 11263 return SDValue(); 11264 11265 // We know that the # elements of the results is the same as the # 11266 // elements of the compare (and the # elements of the compare result for 11267 // that matter). Check to see that they are the same size. 
If so, we know 11268 // that the element size of the sext'd result matches the element size of 11269 // the compare operands. 11270 SDLoc DL(N); 11271 if (VT.getSizeInBits() == N00VT.getSizeInBits()) { 11272 // zext(setcc) -> zext_in_reg(vsetcc) for vectors. 11273 SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0), 11274 N0.getOperand(1), N0.getOperand(2)); 11275 return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType()); 11276 } 11277 11278 // If the desired elements are smaller or larger than the source 11279 // elements we can use a matching integer vector type and then 11280 // truncate/any extend followed by zext_in_reg. 11281 EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); 11282 SDValue VsetCC = 11283 DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0), 11284 N0.getOperand(1), N0.getOperand(2)); 11285 return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL, 11286 N0.getValueType()); 11287 } 11288 11289 // zext(setcc x,y,cc) -> zext(select x, y, true, false, cc) 11290 SDLoc DL(N); 11291 EVT N0VT = N0.getValueType(); 11292 EVT N00VT = N0.getOperand(0).getValueType(); 11293 if (SDValue SCC = SimplifySelectCC( 11294 DL, N0.getOperand(0), N0.getOperand(1), 11295 DAG.getBoolConstant(true, DL, N0VT, N00VT), 11296 DAG.getBoolConstant(false, DL, N0VT, N00VT), 11297 cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) 11298 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SCC); 11299 } 11300 11301 // (zext (shl (zext x), cst)) -> (shl (zext x), cst) 11302 if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) && 11303 isa<ConstantSDNode>(N0.getOperand(1)) && 11304 N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && 11305 N0.hasOneUse()) { 11306 SDValue ShAmt = N0.getOperand(1); 11307 if (N0.getOpcode() == ISD::SHL) { 11308 SDValue InnerZExt = N0.getOperand(0); 11309 // If the original shl may be shifting out bits, do not perform this 11310 // transformation. 11311 unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() - 11312 InnerZExt.getOperand(0).getValueSizeInBits(); 11313 if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits)) 11314 return SDValue(); 11315 } 11316 11317 SDLoc DL(N); 11318 11319 // Ensure that the shift amount is wide enough for the shifted value. 
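// The amount needs at least Log2_32_Ceil(bit width of VT) bits so that any
// in-range amount 0 .. VT-bits-1 is representable; e.g. shifting an i64
// value needs a 6-bit amount. If the narrow amount type is too small,
// zero-extend it (i32 is always wide enough for this purpose).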
11320 if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits()) 11321 ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt); 11322 11323 return DAG.getNode(N0.getOpcode(), DL, VT, 11324 DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)), 11325 ShAmt); 11326 } 11327 11328 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) 11329 return NewVSel; 11330 11331 if (SDValue NewCtPop = widenCtPop(N, DAG)) 11332 return NewCtPop; 11333 11334 if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) 11335 return Res; 11336 11337 return SDValue(); 11338 } 11339 11340 SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { 11341 SDValue N0 = N->getOperand(0); 11342 EVT VT = N->getValueType(0); 11343 11344 if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) 11345 return Res; 11346 11347 // fold (aext (aext x)) -> (aext x) 11348 // fold (aext (zext x)) -> (zext x) 11349 // fold (aext (sext x)) -> (sext x) 11350 if (N0.getOpcode() == ISD::ANY_EXTEND || 11351 N0.getOpcode() == ISD::ZERO_EXTEND || 11352 N0.getOpcode() == ISD::SIGN_EXTEND) 11353 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); 11354 11355 // fold (aext (truncate (load x))) -> (aext (smaller load x)) 11356 // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) 11357 if (N0.getOpcode() == ISD::TRUNCATE) { 11358 if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { 11359 SDNode *oye = N0.getOperand(0).getNode(); 11360 if (NarrowLoad.getNode() != N0.getNode()) { 11361 CombineTo(N0.getNode(), NarrowLoad); 11362 // CombineTo deleted the truncate, if needed, but not what's under it. 11363 AddToWorklist(oye); 11364 } 11365 return SDValue(N, 0); // Return N so it doesn't get rechecked! 11366 } 11367 } 11368 11369 // fold (aext (truncate x)) 11370 if (N0.getOpcode() == ISD::TRUNCATE) 11371 return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); 11372 11373 // Fold (aext (and (trunc x), cst)) -> (and x, cst) 11374 // if the trunc is not free. 11375 if (N0.getOpcode() == ISD::AND && 11376 N0.getOperand(0).getOpcode() == ISD::TRUNCATE && 11377 N0.getOperand(1).getOpcode() == ISD::Constant && 11378 !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), 11379 N0.getValueType())) { 11380 SDLoc DL(N); 11381 SDValue X = N0.getOperand(0).getOperand(0); 11382 X = DAG.getAnyExtOrTrunc(X, DL, VT); 11383 APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); 11384 return DAG.getNode(ISD::AND, DL, VT, 11385 X, DAG.getConstant(Mask, DL, VT)); 11386 } 11387 11388 // fold (aext (load x)) -> (aext (truncate (extload x))) 11389 // None of the supported targets knows how to perform load and any_ext 11390 // on vectors in one instruction, so attempt to fold to zext instead. 11391 if (VT.isVector()) { 11392 // Try to simplify (zext (load x)). 
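// Illustrative example (types chosen for exposition): a vector any_extend
// such as
//   (v4i32 (aext (v4i16 (load x))))
// is formed as a zero-extending load,
//   (v4i32 (zextload<v4i16> x))
// since any_extend leaves the high bits unspecified and zero is a valid
// choice for them.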
11393 if (SDValue foldedExt = 11394 tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0, 11395 ISD::ZEXTLOAD, ISD::ZERO_EXTEND)) 11396 return foldedExt; 11397 } else if (ISD::isNON_EXTLoad(N0.getNode()) && 11398 ISD::isUNINDEXEDLoad(N0.getNode()) && 11399 TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { 11400 bool DoXform = true; 11401 SmallVector<SDNode *, 4> SetCCs; 11402 if (!N0.hasOneUse()) 11403 DoXform = 11404 ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI); 11405 if (DoXform) { 11406 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 11407 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, 11408 LN0->getChain(), LN0->getBasePtr(), 11409 N0.getValueType(), LN0->getMemOperand()); 11410 ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND); 11411 // If the load value is used only by N, replace it via CombineTo N. 11412 bool NoReplaceTrunc = N0.hasOneUse(); 11413 CombineTo(N, ExtLoad); 11414 if (NoReplaceTrunc) { 11415 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); 11416 recursivelyDeleteUnusedNodes(LN0); 11417 } else { 11418 SDValue Trunc = 11419 DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); 11420 CombineTo(LN0, Trunc, ExtLoad.getValue(1)); 11421 } 11422 return SDValue(N, 0); // Return N so it doesn't get rechecked! 11423 } 11424 } 11425 11426 // fold (aext (zextload x)) -> (aext (truncate (zextload x))) 11427 // fold (aext (sextload x)) -> (aext (truncate (sextload x))) 11428 // fold (aext ( extload x)) -> (aext (truncate (extload x))) 11429 if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) && 11430 ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { 11431 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 11432 ISD::LoadExtType ExtType = LN0->getExtensionType(); 11433 EVT MemVT = LN0->getMemoryVT(); 11434 if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { 11435 SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), 11436 VT, LN0->getChain(), LN0->getBasePtr(), 11437 MemVT, LN0->getMemOperand()); 11438 CombineTo(N, ExtLoad); 11439 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); 11440 recursivelyDeleteUnusedNodes(LN0); 11441 return SDValue(N, 0); // Return N so it doesn't get rechecked! 11442 } 11443 } 11444 11445 if (N0.getOpcode() == ISD::SETCC) { 11446 // For vectors: 11447 // aext(setcc) -> vsetcc 11448 // aext(setcc) -> truncate(vsetcc) 11449 // aext(setcc) -> aext(vsetcc) 11450 // Only do this before legalize for now. 11451 if (VT.isVector() && !LegalOperations) { 11452 EVT N00VT = N0.getOperand(0).getValueType(); 11453 if (getSetCCResultType(N00VT) == N0.getValueType()) 11454 return SDValue(); 11455 11456 // We know that the # elements of the results is the same as the 11457 // # elements of the compare (and the # elements of the compare result 11458 // for that matter). Check to see that they are the same size. If so, 11459 // we know that the element size of the sext'd result matches the 11460 // element size of the compare operands. 
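// Illustrative example (v4i32 chosen for exposition): for
//   (v4i32 (aext (v4i1 (setcc v4i32 a, b, cc))))
// a target whose setcc result type for v4i32 is v4i32 can emit the vector
// setcc directly; any_extend places no constraint on the high bits, so the
// setcc's natural boolean encoding is acceptable as-is.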
11461 if (VT.getSizeInBits() == N00VT.getSizeInBits()) 11462 return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), 11463 N0.getOperand(1), 11464 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 11465 11466 // If the desired elements are smaller or larger than the source 11467 // elements we can use a matching integer vector type and then 11468 // truncate/any extend 11469 EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); 11470 SDValue VsetCC = 11471 DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), 11472 N0.getOperand(1), 11473 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 11474 return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT); 11475 } 11476 11477 // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc 11478 SDLoc DL(N); 11479 if (SDValue SCC = SimplifySelectCC( 11480 DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), 11481 DAG.getConstant(0, DL, VT), 11482 cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) 11483 return SCC; 11484 } 11485 11486 if (SDValue NewCtPop = widenCtPop(N, DAG)) 11487 return NewCtPop; 11488 11489 if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) 11490 return Res; 11491 11492 return SDValue(); 11493 } 11494 11495 SDValue DAGCombiner::visitAssertExt(SDNode *N) { 11496 unsigned Opcode = N->getOpcode(); 11497 SDValue N0 = N->getOperand(0); 11498 SDValue N1 = N->getOperand(1); 11499 EVT AssertVT = cast<VTSDNode>(N1)->getVT(); 11500 11501 // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt) 11502 if (N0.getOpcode() == Opcode && 11503 AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) 11504 return N0; 11505 11506 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && 11507 N0.getOperand(0).getOpcode() == Opcode) { 11508 // We have an assert, truncate, assert sandwich. Make one stronger assert 11509 // by asserting on the smallest asserted type to the larger source type. 11510 // This eliminates the later assert: 11511 // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN 11512 // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN 11513 SDValue BigA = N0.getOperand(0); 11514 EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); 11515 assert(BigA_AssertVT.bitsLE(N0.getValueType()) && 11516 "Asserting zero/sign-extended bits to a type larger than the " 11517 "truncated destination does not provide information"); 11518 11519 SDLoc DL(N); 11520 EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT; 11521 SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT); 11522 SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), 11523 BigA.getOperand(0), MinAssertVTVal); 11524 return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert); 11525 } 11526 11527 // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller 11528 // than X. Just move the AssertZext in front of the truncate and drop the 11529 // AssertSExt. 
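// Illustrative example (i32/i16/i8 chosen for exposition):
//   (AssertZext (truncate (AssertSext X:i32, i16) to i16), i8)
// The AssertZext says bits 8..15 of the truncated value are zero; combined
// with the i16 sign-assert that forces bits 8..31 of X to zero, so it is
// sound to assert zext-from-i8 on X itself and truncate afterwards:
//   (truncate (AssertZext X:i32, i8) to i16)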
11530 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && 11531 N0.getOperand(0).getOpcode() == ISD::AssertSext && 11532 Opcode == ISD::AssertZext) { 11533 SDValue BigA = N0.getOperand(0); 11534 EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); 11535 assert(BigA_AssertVT.bitsLE(N0.getValueType()) && 11536 "Asserting zero/sign-extended bits to a type larger than the " 11537 "truncated destination does not provide information"); 11538 11539 if (AssertVT.bitsLT(BigA_AssertVT)) { 11540 SDLoc DL(N); 11541 SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), 11542 BigA.getOperand(0), N1); 11543 return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert); 11544 } 11545 } 11546 11547 return SDValue(); 11548 } 11549 11550 SDValue DAGCombiner::visitAssertAlign(SDNode *N) { 11551 SDLoc DL(N); 11552 11553 Align AL = cast<AssertAlignSDNode>(N)->getAlign(); 11554 SDValue N0 = N->getOperand(0); 11555 11556 // Fold (assertalign (assertalign x, AL0), AL1) -> 11557 // (assertalign x, max(AL0, AL1)) 11558 if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0)) 11559 return DAG.getAssertAlign(DL, N0.getOperand(0), 11560 std::max(AL, AAN->getAlign())); 11561 11562 // In rare cases, there are trivial arithmetic ops in source operands. Sink 11563 // this assert down to source operands so that those arithmetic ops could be 11564 // exposed to the DAG combining. 11565 switch (N0.getOpcode()) { 11566 default: 11567 break; 11568 case ISD::ADD: 11569 case ISD::SUB: { 11570 unsigned AlignShift = Log2(AL); 11571 SDValue LHS = N0.getOperand(0); 11572 SDValue RHS = N0.getOperand(1); 11573 unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros(); 11574 unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros(); 11575 if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) { 11576 if (LHSAlignShift < AlignShift) 11577 LHS = DAG.getAssertAlign(DL, LHS, AL); 11578 if (RHSAlignShift < AlignShift) 11579 RHS = DAG.getAssertAlign(DL, RHS, AL); 11580 return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS); 11581 } 11582 break; 11583 } 11584 } 11585 11586 return SDValue(); 11587 } 11588 11589 /// If the result of a wider load is shifted to right of N bits and then 11590 /// truncated to a narrower type and where N is a multiple of number of bits of 11591 /// the narrower type, transform it to a narrower load from address + N / num of 11592 /// bits of new type. Also narrow the load if the result is masked with an AND 11593 /// to effectively produce a smaller type. If the result is to be extended, also 11594 /// fold the extension to form a extending load. 11595 SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { 11596 unsigned Opc = N->getOpcode(); 11597 11598 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 11599 SDValue N0 = N->getOperand(0); 11600 EVT VT = N->getValueType(0); 11601 EVT ExtVT = VT; 11602 11603 // This transformation isn't valid for vector loads. 11604 if (VT.isVector()) 11605 return SDValue(); 11606 11607 unsigned ShAmt = 0; 11608 bool HasShiftedOffset = false; 11609 // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then 11610 // extended to VT. 11611 if (Opc == ISD::SIGN_EXTEND_INREG) { 11612 ExtType = ISD::SEXTLOAD; 11613 ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT(); 11614 } else if (Opc == ISD::SRL) { 11615 // Another special-case: SRL is basically zero-extending a narrower value, 11616 // or it maybe shifting a higher subword, half or byte into the lowest 11617 // bits. 
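// Illustrative example (i32/i8 chosen for exposition):
//   (srl (i32 (load x)), 24)
// only ever exposes the most significant byte of the loaded value, so it
// can be narrowed to an i8 zextload of that byte; the pointer adjustment
// for big- versus little-endian layouts is applied further below.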
11618 ExtType = ISD::ZEXTLOAD; 11619 N0 = SDValue(N, 0); 11620 11621 auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0)); 11622 auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11623 if (!N01 || !LN0) 11624 return SDValue(); 11625 11626 uint64_t ShiftAmt = N01->getZExtValue(); 11627 uint64_t MemoryWidth = LN0->getMemoryVT().getScalarSizeInBits(); 11628 if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt) 11629 ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt); 11630 else 11631 ExtVT = EVT::getIntegerVT(*DAG.getContext(), 11632 VT.getScalarSizeInBits() - ShiftAmt); 11633 } else if (Opc == ISD::AND) { 11634 // An AND with a constant mask is the same as a truncate + zero-extend. 11635 auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11636 if (!AndC) 11637 return SDValue(); 11638 11639 const APInt &Mask = AndC->getAPIntValue(); 11640 unsigned ActiveBits = 0; 11641 if (Mask.isMask()) { 11642 ActiveBits = Mask.countTrailingOnes(); 11643 } else if (Mask.isShiftedMask()) { 11644 ShAmt = Mask.countTrailingZeros(); 11645 APInt ShiftedMask = Mask.lshr(ShAmt); 11646 ActiveBits = ShiftedMask.countTrailingOnes(); 11647 HasShiftedOffset = true; 11648 } else 11649 return SDValue(); 11650 11651 ExtType = ISD::ZEXTLOAD; 11652 ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); 11653 } 11654 11655 if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { 11656 SDValue SRL = N0; 11657 if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) { 11658 ShAmt = ConstShift->getZExtValue(); 11659 unsigned EVTBits = ExtVT.getScalarSizeInBits(); 11660 // Is the shift amount a multiple of size of VT? 11661 if ((ShAmt & (EVTBits-1)) == 0) { 11662 N0 = N0.getOperand(0); 11663 // Is the load width a multiple of size of VT? 11664 if ((N0.getScalarValueSizeInBits() & (EVTBits - 1)) != 0) 11665 return SDValue(); 11666 } 11667 11668 // At this point, we must have a load or else we can't do the transform. 11669 auto *LN0 = dyn_cast<LoadSDNode>(N0); 11670 if (!LN0) return SDValue(); 11671 11672 // Because a SRL must be assumed to *need* to zero-extend the high bits 11673 // (as opposed to anyext the high bits), we can't combine the zextload 11674 // lowering of SRL and an sextload. 11675 if (LN0->getExtensionType() == ISD::SEXTLOAD) 11676 return SDValue(); 11677 11678 // If the shift amount is larger than the input type then we're not 11679 // accessing any of the loaded bytes. If the load was a zextload/extload 11680 // then the result of the shift+trunc is zero/undef (handled elsewhere). 11681 if (ShAmt >= LN0->getMemoryVT().getSizeInBits()) 11682 return SDValue(); 11683 11684 // If the SRL is only used by a masking AND, we may be able to adjust 11685 // the ExtVT to make the AND redundant. 11686 SDNode *Mask = *(SRL->use_begin()); 11687 if (Mask->getOpcode() == ISD::AND && 11688 isa<ConstantSDNode>(Mask->getOperand(1))) { 11689 const APInt& ShiftMask = Mask->getConstantOperandAPInt(1); 11690 if (ShiftMask.isMask()) { 11691 EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), 11692 ShiftMask.countTrailingOnes()); 11693 // If the mask is smaller, recompute the type. 11694 if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) && 11695 TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT)) 11696 ExtVT = MaskedVT; 11697 } 11698 } 11699 } 11700 } 11701 11702 // If the load is shifted left (and the result isn't shifted back right), 11703 // we can fold the truncate through the shift. 
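// Illustrative example (i64/i32 chosen for exposition):
//   (i32 (truncate (shl (i64 (load x)), 16)))
// Truncation commutes with the left shift here, so the load can be
// narrowed to i32 and "shl 16" re-applied to the narrow value; ShLeftAmt
// records the amount so the shift is rebuilt after the narrowed load is
// created below.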
11704 unsigned ShLeftAmt = 0; 11705 if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() && 11706 ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) { 11707 if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { 11708 ShLeftAmt = N01->getZExtValue(); 11709 N0 = N0.getOperand(0); 11710 } 11711 } 11712 11713 // If we haven't found a load, we can't narrow it. 11714 if (!isa<LoadSDNode>(N0)) 11715 return SDValue(); 11716 11717 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 11718 // Reducing the width of a volatile load is illegal. For atomics, we may be 11719 // able to reduce the width provided we never widen again. (see D66309) 11720 if (!LN0->isSimple() || 11721 !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt)) 11722 return SDValue(); 11723 11724 auto AdjustBigEndianShift = [&](unsigned ShAmt) { 11725 unsigned LVTStoreBits = 11726 LN0->getMemoryVT().getStoreSizeInBits().getFixedSize(); 11727 unsigned EVTStoreBits = ExtVT.getStoreSizeInBits().getFixedSize(); 11728 return LVTStoreBits - EVTStoreBits - ShAmt; 11729 }; 11730 11731 // For big endian targets, we need to adjust the offset to the pointer to 11732 // load the correct bytes. 11733 if (DAG.getDataLayout().isBigEndian()) 11734 ShAmt = AdjustBigEndianShift(ShAmt); 11735 11736 uint64_t PtrOff = ShAmt / 8; 11737 Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff); 11738 SDLoc DL(LN0); 11739 // The original load itself didn't wrap, so an offset within it doesn't. 11740 SDNodeFlags Flags; 11741 Flags.setNoUnsignedWrap(true); 11742 SDValue NewPtr = DAG.getMemBasePlusOffset(LN0->getBasePtr(), 11743 TypeSize::Fixed(PtrOff), DL, Flags); 11744 AddToWorklist(NewPtr.getNode()); 11745 11746 SDValue Load; 11747 if (ExtType == ISD::NON_EXTLOAD) 11748 Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr, 11749 LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign, 11750 LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); 11751 else 11752 Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr, 11753 LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT, 11754 NewAlign, LN0->getMemOperand()->getFlags(), 11755 LN0->getAAInfo()); 11756 11757 // Replace the old load's chain with the new load's chain. 11758 WorklistRemover DeadNodes(*this); 11759 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); 11760 11761 // Shift the result left, if we've swallowed a left shift. 11762 SDValue Result = Load; 11763 if (ShLeftAmt != 0) { 11764 EVT ShImmTy = getShiftAmountTy(Result.getValueType()); 11765 if (!isUIntN(ShImmTy.getScalarSizeInBits(), ShLeftAmt)) 11766 ShImmTy = VT; 11767 // If the shift amount is as large as the result size (but, presumably, 11768 // no larger than the source) then the useful bits of the result are 11769 // zero; we can't simply return the shortened shift, because the result 11770 // of that operation is undefined. 11771 if (ShLeftAmt >= VT.getScalarSizeInBits()) 11772 Result = DAG.getConstant(0, DL, VT); 11773 else 11774 Result = DAG.getNode(ISD::SHL, DL, VT, 11775 Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy)); 11776 } 11777 11778 if (HasShiftedOffset) { 11779 // Recalculate the shift amount after it has been altered to calculate 11780 // the offset. 11781 if (DAG.getDataLayout().isBigEndian()) 11782 ShAmt = AdjustBigEndianShift(ShAmt); 11783 11784 // We're using a shifted mask, so the load now has an offset. 
This means 11785 // that data has been loaded into the lower bytes than it would have been 11786 // before, so we need to shl the loaded data into the correct position in the 11787 // register. 11788 SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT); 11789 Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC); 11790 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); 11791 } 11792 11793 // Return the new loaded value. 11794 return Result; 11795 } 11796 11797 SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { 11798 SDValue N0 = N->getOperand(0); 11799 SDValue N1 = N->getOperand(1); 11800 EVT VT = N->getValueType(0); 11801 EVT ExtVT = cast<VTSDNode>(N1)->getVT(); 11802 unsigned VTBits = VT.getScalarSizeInBits(); 11803 unsigned ExtVTBits = ExtVT.getScalarSizeInBits(); 11804 11805 // sext_vector_inreg(undef) = 0 because the top bit will all be the same. 11806 if (N0.isUndef()) 11807 return DAG.getConstant(0, SDLoc(N), VT); 11808 11809 // fold (sext_in_reg c1) -> c1 11810 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 11811 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); 11812 11813 // If the input is already sign extended, just drop the extension. 11814 if (DAG.ComputeNumSignBits(N0) >= (VTBits - ExtVTBits + 1)) 11815 return N0; 11816 11817 // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2 11818 if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG && 11819 ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT())) 11820 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), 11821 N1); 11822 11823 // fold (sext_in_reg (sext x)) -> (sext x) 11824 // fold (sext_in_reg (aext x)) -> (sext x) 11825 // if x is small enough or if we know that x has more than 1 sign bit and the 11826 // sign_extend_inreg is extending from one of them. 11827 if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { 11828 SDValue N00 = N0.getOperand(0); 11829 unsigned N00Bits = N00.getScalarValueSizeInBits(); 11830 if ((N00Bits <= ExtVTBits || 11831 (N00Bits - DAG.ComputeNumSignBits(N00)) < ExtVTBits) && 11832 (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) 11833 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); 11834 } 11835 11836 // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) 11837 // if x is small enough or if we know that x has more than 1 sign bit and the 11838 // sign_extend_inreg is extending from one of them. 11839 if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || 11840 N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || 11841 N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) { 11842 SDValue N00 = N0.getOperand(0); 11843 unsigned N00Bits = N00.getScalarValueSizeInBits(); 11844 unsigned DstElts = N0.getValueType().getVectorMinNumElements(); 11845 unsigned SrcElts = N00.getValueType().getVectorMinNumElements(); 11846 bool IsZext = N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG; 11847 APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts); 11848 if ((N00Bits == ExtVTBits || 11849 (!IsZext && (N00Bits < ExtVTBits || 11850 (N00Bits - DAG.ComputeNumSignBits(N00, DemandedSrcElts)) < 11851 ExtVTBits))) && 11852 (!LegalOperations || 11853 TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))) 11854 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00); 11855 } 11856 11857 // fold (sext_in_reg (zext x)) -> (sext x) 11858 // iff we are extending the source sign bit. 
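// Illustrative example (i8/i32 chosen for exposition):
//   (sext_in_reg (i32 (zext (i8 X))), i8)
// The zext leaves X in the low 8 bits, and sign-extending in-reg from
// exactly bit 7 reproduces (i32 (sext (i8 X))).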
11859 if (N0.getOpcode() == ISD::ZERO_EXTEND) { 11860 SDValue N00 = N0.getOperand(0); 11861 if (N00.getScalarValueSizeInBits() == ExtVTBits && 11862 (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) 11863 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); 11864 } 11865 11866 // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. 11867 if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1))) 11868 return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT); 11869 11870 // fold operands of sext_in_reg based on knowledge that the top bits are not 11871 // demanded. 11872 if (SimplifyDemandedBits(SDValue(N, 0))) 11873 return SDValue(N, 0); 11874 11875 // fold (sext_in_reg (load x)) -> (smaller sextload x) 11876 // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits)) 11877 if (SDValue NarrowLoad = ReduceLoadWidth(N)) 11878 return NarrowLoad; 11879 11880 // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24) 11881 // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible. 11882 // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above. 11883 if (N0.getOpcode() == ISD::SRL) { 11884 if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1))) 11885 if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) { 11886 // We can turn this into an SRA iff the input to the SRL is already sign 11887 // extended enough. 11888 unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0)); 11889 if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits) 11890 return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0), 11891 N0.getOperand(1)); 11892 } 11893 } 11894 11895 // fold (sext_inreg (extload x)) -> (sextload x) 11896 // If sextload is not supported by target, we can only do the combine when 11897 // load has one use. Doing otherwise can block folding the extload with other 11898 // extends that the target does support. 11899 if (ISD::isEXTLoad(N0.getNode()) && 11900 ISD::isUNINDEXEDLoad(N0.getNode()) && 11901 ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() && 11902 ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() && 11903 N0.hasOneUse()) || 11904 TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { 11905 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 11906 SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, 11907 LN0->getChain(), 11908 LN0->getBasePtr(), ExtVT, 11909 LN0->getMemOperand()); 11910 CombineTo(N, ExtLoad); 11911 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); 11912 AddToWorklist(ExtLoad.getNode()); 11913 return SDValue(N, 0); // Return N so it doesn't get rechecked! 11914 } 11915 // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use 11916 if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && 11917 N0.hasOneUse() && 11918 ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() && 11919 ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) && 11920 TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { 11921 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 11922 SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, 11923 LN0->getChain(), 11924 LN0->getBasePtr(), ExtVT, 11925 LN0->getMemOperand()); 11926 CombineTo(N, ExtLoad); 11927 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); 11928 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
11929 } 11930 11931 // fold (sext_inreg (masked_load x)) -> (sext_masked_load x) 11932 // ignore it if the masked load is already sign extended 11933 if (MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0)) { 11934 if (ExtVT == Ld->getMemoryVT() && N0.hasOneUse() && 11935 Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD && 11936 TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) { 11937 SDValue ExtMaskedLoad = DAG.getMaskedLoad( 11938 VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), 11939 Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(), 11940 Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad()); 11941 CombineTo(N, ExtMaskedLoad); 11942 CombineTo(N0.getNode(), ExtMaskedLoad, ExtMaskedLoad.getValue(1)); 11943 return SDValue(N, 0); // Return N so it doesn't get rechecked! 11944 } 11945 } 11946 11947 // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x) 11948 if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) { 11949 if (SDValue(GN0, 0).hasOneUse() && 11950 ExtVT == GN0->getMemoryVT() && 11951 TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { 11952 SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), 11953 GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; 11954 11955 SDValue ExtLoad = DAG.getMaskedGather( 11956 DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops, 11957 GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD); 11958 11959 CombineTo(N, ExtLoad); 11960 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); 11961 AddToWorklist(ExtLoad.getNode()); 11962 return SDValue(N, 0); // Return N so it doesn't get rechecked! 11963 } 11964 } 11965 11966 // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) 11967 if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) { 11968 if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), 11969 N0.getOperand(1), false)) 11970 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); 11971 } 11972 11973 return SDValue(); 11974 } 11975 11976 SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) { 11977 SDValue N0 = N->getOperand(0); 11978 EVT VT = N->getValueType(0); 11979 11980 // sext_vector_inreg(undef) = 0 because the top bit will all be the same. 11981 if (N0.isUndef()) 11982 return DAG.getConstant(0, SDLoc(N), VT); 11983 11984 if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) 11985 return Res; 11986 11987 if (SimplifyDemandedVectorElts(SDValue(N, 0))) 11988 return SDValue(N, 0); 11989 11990 return SDValue(); 11991 } 11992 11993 SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) { 11994 SDValue N0 = N->getOperand(0); 11995 EVT VT = N->getValueType(0); 11996 11997 // zext_vector_inreg(undef) = 0 because the top bits will be zero. 
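  // Folding undef to an all-zeros constant is safe here: the high bits of
  // every result element would be zero for any choice of the source value.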
11998 if (N0.isUndef()) 11999 return DAG.getConstant(0, SDLoc(N), VT); 12000 12001 if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) 12002 return Res; 12003 12004 if (SimplifyDemandedVectorElts(SDValue(N, 0))) 12005 return SDValue(N, 0); 12006 12007 return SDValue(); 12008 } 12009 12010 SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { 12011 SDValue N0 = N->getOperand(0); 12012 EVT VT = N->getValueType(0); 12013 EVT SrcVT = N0.getValueType(); 12014 bool isLE = DAG.getDataLayout().isLittleEndian(); 12015 12016 // noop truncate 12017 if (SrcVT == VT) 12018 return N0; 12019 12020 // fold (truncate (truncate x)) -> (truncate x) 12021 if (N0.getOpcode() == ISD::TRUNCATE) 12022 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); 12023 12024 // fold (truncate c1) -> c1 12025 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { 12026 SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); 12027 if (C.getNode() != N) 12028 return C; 12029 } 12030 12031 // fold (truncate (ext x)) -> (ext x) or (truncate x) or x 12032 if (N0.getOpcode() == ISD::ZERO_EXTEND || 12033 N0.getOpcode() == ISD::SIGN_EXTEND || 12034 N0.getOpcode() == ISD::ANY_EXTEND) { 12035 // if the source is smaller than the dest, we still need an extend. 12036 if (N0.getOperand(0).getValueType().bitsLT(VT)) 12037 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); 12038 // if the source is larger than the dest, than we just need the truncate. 12039 if (N0.getOperand(0).getValueType().bitsGT(VT)) 12040 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); 12041 // if the source and dest are the same type, we can drop both the extend 12042 // and the truncate. 12043 return N0.getOperand(0); 12044 } 12045 12046 // If this is anyext(trunc), don't fold it, allow ourselves to be folded. 12047 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND)) 12048 return SDValue(); 12049 12050 // Fold extract-and-trunc into a narrow extract. For example: 12051 // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1) 12052 // i32 y = TRUNCATE(i64 x) 12053 // -- becomes -- 12054 // v16i8 b = BITCAST (v2i64 val) 12055 // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8) 12056 // 12057 // Note: We only run this optimization after type legalization (which often 12058 // creates this pattern) and before operation legalization after which 12059 // we need to be more careful about the vector instructions that we generate. 12060 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 12061 LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) { 12062 EVT VecTy = N0.getOperand(0).getValueType(); 12063 EVT ExTy = N0.getValueType(); 12064 EVT TrTy = N->getValueType(0); 12065 12066 auto EltCnt = VecTy.getVectorElementCount(); 12067 unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits(); 12068 auto NewEltCnt = EltCnt * SizeRatio; 12069 12070 EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt); 12071 assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); 12072 12073 SDValue EltNo = N0->getOperand(1); 12074 if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) { 12075 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 12076 int Index = isLE ? 
(Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); 12077 12078 SDLoc DL(N); 12079 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy, 12080 DAG.getBitcast(NVT, N0.getOperand(0)), 12081 DAG.getVectorIdxConstant(Index, DL)); 12082 } 12083 } 12084 12085 // trunc (select c, a, b) -> select c, (trunc a), (trunc b) 12086 if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) { 12087 if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) && 12088 TLI.isTruncateFree(SrcVT, VT)) { 12089 SDLoc SL(N0); 12090 SDValue Cond = N0.getOperand(0); 12091 SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); 12092 SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2)); 12093 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1); 12094 } 12095 } 12096 12097 // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits() 12098 if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() && 12099 (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) && 12100 TLI.isTypeDesirableForOp(ISD::SHL, VT)) { 12101 SDValue Amt = N0.getOperand(1); 12102 KnownBits Known = DAG.computeKnownBits(Amt); 12103 unsigned Size = VT.getScalarSizeInBits(); 12104 if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { 12105 SDLoc SL(N); 12106 EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); 12107 12108 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); 12109 if (AmtVT != Amt.getValueType()) { 12110 Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT); 12111 AddToWorklist(Amt.getNode()); 12112 } 12113 return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt); 12114 } 12115 } 12116 12117 if (SDValue V = foldSubToUSubSat(VT, N0.getNode())) 12118 return V; 12119 12120 // Attempt to pre-truncate BUILD_VECTOR sources. 12121 if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations && 12122 TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) && 12123 // Avoid creating illegal types if running after type legalizer. 12124 (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) { 12125 SDLoc DL(N); 12126 EVT SVT = VT.getScalarType(); 12127 SmallVector<SDValue, 8> TruncOps; 12128 for (const SDValue &Op : N0->op_values()) { 12129 SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op); 12130 TruncOps.push_back(TruncOp); 12131 } 12132 return DAG.getBuildVector(VT, DL, TruncOps); 12133 } 12134 12135 // Fold a series of buildvector, bitcast, and truncate if possible. 12136 // For example fold 12137 // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to 12138 // (2xi32 (buildvector x, y)). 12139 if (Level == AfterLegalizeVectorOps && VT.isVector() && 12140 N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && 12141 N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && 12142 N0.getOperand(0).hasOneUse()) { 12143 SDValue BuildVect = N0.getOperand(0); 12144 EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); 12145 EVT TruncVecEltTy = VT.getVectorElementType(); 12146 12147 // Check that the element types match. 12148 if (BuildVectEltTy == TruncVecEltTy) { 12149 // Now we only need to compute the offset of the truncated elements. 
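      // For the example above, BuildVecNumElts is 4 and TruncVecNumElts is 2,
      // so TruncEltOffset is 2 and we keep build_vector operands 0 and 2
      // (x and y).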
12150 unsigned BuildVecNumElts = BuildVect.getNumOperands(); 12151 unsigned TruncVecNumElts = VT.getVectorNumElements(); 12152 unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts; 12153 12154 assert((BuildVecNumElts % TruncVecNumElts) == 0 && 12155 "Invalid number of elements"); 12156 12157 SmallVector<SDValue, 8> Opnds; 12158 for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) 12159 Opnds.push_back(BuildVect.getOperand(i)); 12160 12161 return DAG.getBuildVector(VT, SDLoc(N), Opnds); 12162 } 12163 } 12164 12165 // See if we can simplify the input to this truncate through knowledge that 12166 // only the low bits are being used. 12167 // For example "trunc (or (shl x, 8), y)" // -> trunc y 12168 // Currently we only perform this optimization on scalars because vectors 12169 // may have different active low bits. 12170 if (!VT.isVector()) { 12171 APInt Mask = 12172 APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits()); 12173 if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask)) 12174 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); 12175 } 12176 12177 // fold (truncate (load x)) -> (smaller load x) 12178 // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) 12179 if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { 12180 if (SDValue Reduced = ReduceLoadWidth(N)) 12181 return Reduced; 12182 12183 // Handle the case where the load remains an extending load even 12184 // after truncation. 12185 if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { 12186 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 12187 if (LN0->isSimple() && LN0->getMemoryVT().bitsLT(VT)) { 12188 SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0), 12189 VT, LN0->getChain(), LN0->getBasePtr(), 12190 LN0->getMemoryVT(), 12191 LN0->getMemOperand()); 12192 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1)); 12193 return NewLoad; 12194 } 12195 } 12196 } 12197 12198 // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)), 12199 // where ... are all 'undef'. 12200 if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { 12201 SmallVector<EVT, 8> VTs; 12202 SDValue V; 12203 unsigned Idx = 0; 12204 unsigned NumDefs = 0; 12205 12206 for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) { 12207 SDValue X = N0.getOperand(i); 12208 if (!X.isUndef()) { 12209 V = X; 12210 Idx = i; 12211 NumDefs++; 12212 } 12213 // Stop if more than one members are non-undef. 12214 if (NumDefs > 1) 12215 break; 12216 12217 VTs.push_back(EVT::getVectorVT(*DAG.getContext(), 12218 VT.getVectorElementType(), 12219 X.getValueType().getVectorElementCount())); 12220 } 12221 12222 if (NumDefs == 0) 12223 return DAG.getUNDEF(VT); 12224 12225 if (NumDefs == 1) { 12226 assert(V.getNode() && "The single defined operand is empty!"); 12227 SmallVector<SDValue, 8> Opnds; 12228 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 12229 if (i != Idx) { 12230 Opnds.push_back(DAG.getUNDEF(VTs[i])); 12231 continue; 12232 } 12233 SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V); 12234 AddToWorklist(NV.getNode()); 12235 Opnds.push_back(NV); 12236 } 12237 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds); 12238 } 12239 } 12240 12241 // Fold truncate of a bitcast of a vector to an extract of the low vector 12242 // element. 12243 // 12244 // e.g. 
trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx 12245 if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) { 12246 SDValue VecSrc = N0.getOperand(0); 12247 EVT VecSrcVT = VecSrc.getValueType(); 12248 if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT && 12249 (!LegalOperations || 12250 TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) { 12251 SDLoc SL(N); 12252 12253 unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1; 12254 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc, 12255 DAG.getVectorIdxConstant(Idx, SL)); 12256 } 12257 } 12258 12259 // Simplify the operands using demanded-bits information. 12260 if (SimplifyDemandedBits(SDValue(N, 0))) 12261 return SDValue(N, 0); 12262 12263 // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) 12264 // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) 12265 // When the adde's carry is not used. 12266 if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) && 12267 N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) && 12268 // We only do for addcarry before legalize operation 12269 ((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) || 12270 TLI.isOperationLegal(N0.getOpcode(), VT))) { 12271 SDLoc SL(N); 12272 auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); 12273 auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); 12274 auto VTs = DAG.getVTList(VT, N0->getValueType(1)); 12275 return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2)); 12276 } 12277 12278 // fold (truncate (extract_subvector(ext x))) -> 12279 // (extract_subvector x) 12280 // TODO: This can be generalized to cover cases where the truncate and extract 12281 // do not fully cancel each other out. 12282 if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) { 12283 SDValue N00 = N0.getOperand(0); 12284 if (N00.getOpcode() == ISD::SIGN_EXTEND || 12285 N00.getOpcode() == ISD::ZERO_EXTEND || 12286 N00.getOpcode() == ISD::ANY_EXTEND) { 12287 if (N00.getOperand(0)->getValueType(0).getVectorElementType() == 12288 VT.getVectorElementType()) 12289 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT, 12290 N00.getOperand(0), N0.getOperand(1)); 12291 } 12292 } 12293 12294 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) 12295 return NewVSel; 12296 12297 // Narrow a suitable binary operation with a non-opaque constant operand by 12298 // moving it ahead of the truncate. This is limited to pre-legalization 12299 // because targets may prefer a wider type during later combines and invert 12300 // this transform. 12301 switch (N0.getOpcode()) { 12302 case ISD::ADD: 12303 case ISD::SUB: 12304 case ISD::MUL: 12305 case ISD::AND: 12306 case ISD::OR: 12307 case ISD::XOR: 12308 if (!LegalOperations && N0.hasOneUse() && 12309 (isConstantOrConstantVector(N0.getOperand(0), true) || 12310 isConstantOrConstantVector(N0.getOperand(1), true))) { 12311 // TODO: We already restricted this to pre-legalization, but for vectors 12312 // we are extra cautious to not create an unsupported operation. 12313 // Target-specific changes are likely needed to avoid regressions here. 
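      // For example, (i32 (trunc (add i64:X, i64:Cst))) becomes
      // (add (i32 (trunc X)), (i32 (trunc Cst))); the low 32 bits of the add
      // depend only on the low 32 bits of its operands.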
      if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
        SDLoc DL(N);
        SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
        SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
        return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
      }
    }
    break;
  case ISD::USUBSAT:
    // Truncate the USUBSAT only if LHS is a known zero-extension; it's not
    // enough to know that the upper bits are zero, we must also ensure that we
    // don't introduce an extra truncate.
    if (!LegalOperations && N0.hasOneUse() &&
        N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
        N0.getOperand(0).getOperand(0).getScalarValueSizeInBits() <=
            VT.getScalarSizeInBits() &&
        hasOperation(N0.getOpcode(), VT)) {
      return getTruncatedUSUBSAT(VT, SrcVT, N0.getOperand(0), N0.getOperand(1),
                                 DAG, SDLoc(N));
    }
    break;
  }

  return SDValue();
}

static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
  SDValue Elt = N->getOperand(i);
  if (Elt.getOpcode() != ISD::MERGE_VALUES)
    return Elt.getNode();
  return Elt.getOperand(Elt.getResNo()).getNode();
}

/// build_pair (load, load) -> load
/// if load locations are consecutive.
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
  assert(N->getOpcode() == ISD::BUILD_PAIR);

  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
  LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));

  // A BUILD_PAIR always has the least significant part in elt 0 and the
  // most significant part in elt 1. So when combining into one large load, we
  // need to consider the endianness.
  if (DAG.getDataLayout().isBigEndian())
    std::swap(LD1, LD2);

  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
      LD1->getAddressSpace() != LD2->getAddressSpace())
    return SDValue();
  EVT LD1VT = LD1->getValueType(0);
  unsigned LD1Bytes = LD1VT.getStoreSize();
  if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
      DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
    Align Alignment = LD1->getAlign();
    Align NewAlign = DAG.getDataLayout().getABITypeAlign(
        VT.getTypeForEVT(*DAG.getContext()));

    if (NewAlign <= Alignment &&
        (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
      return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
                         LD1->getPointerInfo(), Alignment);
  }

  return SDValue();
}

static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
  // On little-endian machines, bitcasting from ppcf128 to i128 swaps the Hi
  // and Lo parts; on big-endian machines it doesn't.
  return DAG.getDataLayout().isBigEndian() ? 1 : 0;
}

static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
                                    const TargetLowering &TLI) {
  // If this is not a bitcast to an FP type or if the target doesn't have
  // IEEE754-compliant FP logic, we're done.
  EVT VT = N->getValueType(0);
  if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
    return SDValue();

  // TODO: Handle cases where the integer constant is a different scalar
  // bitwidth to the FP.
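  // For example, with f32: (bitcast (and (bitcast X to i32), 0x7fffffff)
  // to f32) clears only the sign bit of the integer representation, which is
  // exactly (fabs X) for IEEE754 floats.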
12397 SDValue N0 = N->getOperand(0); 12398 EVT SourceVT = N0.getValueType(); 12399 if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits()) 12400 return SDValue(); 12401 12402 unsigned FPOpcode; 12403 APInt SignMask; 12404 switch (N0.getOpcode()) { 12405 case ISD::AND: 12406 FPOpcode = ISD::FABS; 12407 SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits()); 12408 break; 12409 case ISD::XOR: 12410 FPOpcode = ISD::FNEG; 12411 SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits()); 12412 break; 12413 case ISD::OR: 12414 FPOpcode = ISD::FABS; 12415 SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits()); 12416 break; 12417 default: 12418 return SDValue(); 12419 } 12420 12421 // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X 12422 // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X 12423 // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) -> 12424 // fneg (fabs X) 12425 SDValue LogicOp0 = N0.getOperand(0); 12426 ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true); 12427 if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask && 12428 LogicOp0.getOpcode() == ISD::BITCAST && 12429 LogicOp0.getOperand(0).getValueType() == VT) { 12430 SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0)); 12431 NumFPLogicOpsConv++; 12432 if (N0.getOpcode() == ISD::OR) 12433 return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp); 12434 return FPOp; 12435 } 12436 12437 return SDValue(); 12438 } 12439 12440 SDValue DAGCombiner::visitBITCAST(SDNode *N) { 12441 SDValue N0 = N->getOperand(0); 12442 EVT VT = N->getValueType(0); 12443 12444 if (N0.isUndef()) 12445 return DAG.getUNDEF(VT); 12446 12447 // If the input is a BUILD_VECTOR with all constant elements, fold this now. 12448 // Only do this before legalize types, unless both types are integer and the 12449 // scalar type is legal. Only do this before legalize ops, since the target 12450 // maybe depending on the bitcast. 12451 // First check to see if this is all constant. 12452 // TODO: Support FP bitcasts after legalize types. 12453 if (VT.isVector() && 12454 (!LegalTypes || 12455 (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() && 12456 TLI.isTypeLegal(VT.getVectorElementType()))) && 12457 N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() && 12458 cast<BuildVectorSDNode>(N0)->isConstant()) 12459 return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), 12460 VT.getVectorElementType()); 12461 12462 // If the input is a constant, let getNode fold it. 12463 if (isIntOrFPConstant(N0)) { 12464 // If we can't allow illegal operations, we need to check that this is just 12465 // a fp -> int or int -> conversion and that the resulting operation will 12466 // be legal. 12467 if (!LegalOperations || 12468 (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() && 12469 TLI.isOperationLegal(ISD::ConstantFP, VT)) || 12470 (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() && 12471 TLI.isOperationLegal(ISD::Constant, VT))) { 12472 SDValue C = DAG.getBitcast(VT, N0); 12473 if (C.getNode() != N) 12474 return C; 12475 } 12476 } 12477 12478 // (conv (conv x, t1), t2) -> (conv x, t2) 12479 if (N0.getOpcode() == ISD::BITCAST) 12480 return DAG.getBitcast(VT, N0.getOperand(0)); 12481 12482 // fold (conv (load x)) -> (load (conv*)x) 12483 // If the resultant load doesn't need a higher alignment than the original! 
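  // For example, (f32 bitcast (i32 load p)) can become (f32 load p) when
  // TLI.isLoadBitCastBeneficial() below agrees and the legality checks pass.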
12484 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && 12485 // Do not remove the cast if the types differ in endian layout. 12486 TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) == 12487 TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) && 12488 // If the load is volatile, we only want to change the load type if the 12489 // resulting load is legal. Otherwise we might increase the number of 12490 // memory accesses. We don't care if the original type was legal or not 12491 // as we assume software couldn't rely on the number of accesses of an 12492 // illegal type. 12493 ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) || 12494 TLI.isOperationLegal(ISD::LOAD, VT))) { 12495 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 12496 12497 if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG, 12498 *LN0->getMemOperand())) { 12499 SDValue Load = 12500 DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), 12501 LN0->getPointerInfo(), LN0->getAlign(), 12502 LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); 12503 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); 12504 return Load; 12505 } 12506 } 12507 12508 if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI)) 12509 return V; 12510 12511 // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) 12512 // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) 12513 // 12514 // For ppc_fp128: 12515 // fold (bitcast (fneg x)) -> 12516 // flipbit = signbit 12517 // (xor (bitcast x) (build_pair flipbit, flipbit)) 12518 // 12519 // fold (bitcast (fabs x)) -> 12520 // flipbit = (and (extract_element (bitcast x), 0), signbit) 12521 // (xor (bitcast x) (build_pair flipbit, flipbit)) 12522 // This often reduces constant pool loads. 12523 if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || 12524 (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && 12525 N0.getNode()->hasOneUse() && VT.isInteger() && 12526 !VT.isVector() && !N0.getValueType().isVector()) { 12527 SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0)); 12528 AddToWorklist(NewConv.getNode()); 12529 12530 SDLoc DL(N); 12531 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { 12532 assert(VT.getSizeInBits() == 128); 12533 SDValue SignBit = DAG.getConstant( 12534 APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); 12535 SDValue FlipBit; 12536 if (N0.getOpcode() == ISD::FNEG) { 12537 FlipBit = SignBit; 12538 AddToWorklist(FlipBit.getNode()); 12539 } else { 12540 assert(N0.getOpcode() == ISD::FABS); 12541 SDValue Hi = 12542 DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv, 12543 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), 12544 SDLoc(NewConv))); 12545 AddToWorklist(Hi.getNode()); 12546 FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit); 12547 AddToWorklist(FlipBit.getNode()); 12548 } 12549 SDValue FlipBits = 12550 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); 12551 AddToWorklist(FlipBits.getNode()); 12552 return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); 12553 } 12554 APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); 12555 if (N0.getOpcode() == ISD::FNEG) 12556 return DAG.getNode(ISD::XOR, DL, VT, 12557 NewConv, DAG.getConstant(SignBit, DL, VT)); 12558 assert(N0.getOpcode() == ISD::FABS); 12559 return DAG.getNode(ISD::AND, DL, VT, 12560 NewConv, DAG.getConstant(~SignBit, DL, VT)); 12561 } 12562 12563 // fold (bitconvert (fcopysign cst, x)) -> 12564 // (or (and (bitconvert x), sign), (and cst, 
(not sign))) 12565 // Note that we don't handle (copysign x, cst) because this can always be 12566 // folded to an fneg or fabs. 12567 // 12568 // For ppc_fp128: 12569 // fold (bitcast (fcopysign cst, x)) -> 12570 // flipbit = (and (extract_element 12571 // (xor (bitcast cst), (bitcast x)), 0), 12572 // signbit) 12573 // (xor (bitcast cst) (build_pair flipbit, flipbit)) 12574 if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && 12575 isa<ConstantFPSDNode>(N0.getOperand(0)) && 12576 VT.isInteger() && !VT.isVector()) { 12577 unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits(); 12578 EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth); 12579 if (isTypeLegal(IntXVT)) { 12580 SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1)); 12581 AddToWorklist(X.getNode()); 12582 12583 // If X has a different width than the result/lhs, sext it or truncate it. 12584 unsigned VTWidth = VT.getSizeInBits(); 12585 if (OrigXWidth < VTWidth) { 12586 X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X); 12587 AddToWorklist(X.getNode()); 12588 } else if (OrigXWidth > VTWidth) { 12589 // To get the sign bit in the right place, we have to shift it right 12590 // before truncating. 12591 SDLoc DL(X); 12592 X = DAG.getNode(ISD::SRL, DL, 12593 X.getValueType(), X, 12594 DAG.getConstant(OrigXWidth-VTWidth, DL, 12595 X.getValueType())); 12596 AddToWorklist(X.getNode()); 12597 X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); 12598 AddToWorklist(X.getNode()); 12599 } 12600 12601 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { 12602 APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2); 12603 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); 12604 AddToWorklist(Cst.getNode()); 12605 SDValue X = DAG.getBitcast(VT, N0.getOperand(1)); 12606 AddToWorklist(X.getNode()); 12607 SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X); 12608 AddToWorklist(XorResult.getNode()); 12609 SDValue XorResult64 = DAG.getNode( 12610 ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult, 12611 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), 12612 SDLoc(XorResult))); 12613 AddToWorklist(XorResult64.getNode()); 12614 SDValue FlipBit = 12615 DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64, 12616 DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64)); 12617 AddToWorklist(FlipBit.getNode()); 12618 SDValue FlipBits = 12619 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); 12620 AddToWorklist(FlipBits.getNode()); 12621 return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); 12622 } 12623 APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); 12624 X = DAG.getNode(ISD::AND, SDLoc(X), VT, 12625 X, DAG.getConstant(SignBit, SDLoc(X), VT)); 12626 AddToWorklist(X.getNode()); 12627 12628 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); 12629 Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT, 12630 Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT)); 12631 AddToWorklist(Cst.getNode()); 12632 12633 return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst); 12634 } 12635 } 12636 12637 // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. 12638 if (N0.getOpcode() == ISD::BUILD_PAIR) 12639 if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) 12640 return CombineLD; 12641 12642 // Remove double bitcasts from shuffles - this is often a legacy of 12643 // XformToShuffleWithZero being used to combine bitmaskings (of 12644 // float vectors bitcast to integer vectors) into shuffles. 
12645 // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1) 12646 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() && 12647 N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() && 12648 VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() && 12649 !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) { 12650 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0); 12651 12652 // If operands are a bitcast, peek through if it casts the original VT. 12653 // If operands are a constant, just bitcast back to original VT. 12654 auto PeekThroughBitcast = [&](SDValue Op) { 12655 if (Op.getOpcode() == ISD::BITCAST && 12656 Op.getOperand(0).getValueType() == VT) 12657 return SDValue(Op.getOperand(0)); 12658 if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || 12659 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) 12660 return DAG.getBitcast(VT, Op); 12661 return SDValue(); 12662 }; 12663 12664 // FIXME: If either input vector is bitcast, try to convert the shuffle to 12665 // the result type of this bitcast. This would eliminate at least one 12666 // bitcast. See the transform in InstCombine. 12667 SDValue SV0 = PeekThroughBitcast(N0->getOperand(0)); 12668 SDValue SV1 = PeekThroughBitcast(N0->getOperand(1)); 12669 if (!(SV0 && SV1)) 12670 return SDValue(); 12671 12672 int MaskScale = 12673 VT.getVectorNumElements() / N0.getValueType().getVectorNumElements(); 12674 SmallVector<int, 8> NewMask; 12675 for (int M : SVN->getMask()) 12676 for (int i = 0; i != MaskScale; ++i) 12677 NewMask.push_back(M < 0 ? -1 : M * MaskScale + i); 12678 12679 SDValue LegalShuffle = 12680 TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG); 12681 if (LegalShuffle) 12682 return LegalShuffle; 12683 } 12684 12685 return SDValue(); 12686 } 12687 12688 SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) { 12689 EVT VT = N->getValueType(0); 12690 return CombineConsecutiveLoads(N, VT); 12691 } 12692 12693 SDValue DAGCombiner::visitFREEZE(SDNode *N) { 12694 SDValue N0 = N->getOperand(0); 12695 12696 // (freeze (freeze x)) -> (freeze x) 12697 if (N0.getOpcode() == ISD::FREEZE) 12698 return N0; 12699 12700 // If the input is a constant, return it. 12701 if (isIntOrFPConstant(N0)) 12702 return N0; 12703 12704 return SDValue(); 12705 } 12706 12707 /// We know that BV is a build_vector node with Constant, ConstantFP or Undef 12708 /// operands. DstEltVT indicates the destination element value type. 12709 SDValue DAGCombiner:: 12710 ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { 12711 EVT SrcEltVT = BV->getValueType(0).getVectorElementType(); 12712 12713 // If this is already the right type, we're done. 12714 if (SrcEltVT == DstEltVT) return SDValue(BV, 0); 12715 12716 unsigned SrcBitSize = SrcEltVT.getSizeInBits(); 12717 unsigned DstBitSize = DstEltVT.getSizeInBits(); 12718 12719 // If this is a conversion of N elements of one type to N elements of another 12720 // type, convert each element. This handles FP<->INT cases. 12721 if (SrcBitSize == DstBitSize) { 12722 SmallVector<SDValue, 8> Ops; 12723 for (SDValue Op : BV->op_values()) { 12724 // If the vector element type is not legal, the BUILD_VECTOR operands 12725 // are promoted and implicitly truncated. Make that explicit here. 
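      // For example, if i8 is not a legal type, a v4i8 build_vector may carry
      // promoted i32 operands; truncate them back to SrcEltVT (i8) before
      // bitcasting each element.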
12726 if (Op.getValueType() != SrcEltVT) 12727 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op); 12728 Ops.push_back(DAG.getBitcast(DstEltVT, Op)); 12729 AddToWorklist(Ops.back().getNode()); 12730 } 12731 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, 12732 BV->getValueType(0).getVectorNumElements()); 12733 return DAG.getBuildVector(VT, SDLoc(BV), Ops); 12734 } 12735 12736 // Otherwise, we're growing or shrinking the elements. To avoid having to 12737 // handle annoying details of growing/shrinking FP values, we convert them to 12738 // int first. 12739 if (SrcEltVT.isFloatingPoint()) { 12740 // Convert the input float vector to a int vector where the elements are the 12741 // same sizes. 12742 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits()); 12743 BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode(); 12744 SrcEltVT = IntVT; 12745 } 12746 12747 // Now we know the input is an integer vector. If the output is a FP type, 12748 // convert to integer first, then to FP of the right size. 12749 if (DstEltVT.isFloatingPoint()) { 12750 EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits()); 12751 SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode(); 12752 12753 // Next, convert to FP elements of the same size. 12754 return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT); 12755 } 12756 12757 SDLoc DL(BV); 12758 12759 // Okay, we know the src/dst types are both integers of differing types. 12760 // Handling growing first. 12761 assert(SrcEltVT.isInteger() && DstEltVT.isInteger()); 12762 if (SrcBitSize < DstBitSize) { 12763 unsigned NumInputsPerOutput = DstBitSize/SrcBitSize; 12764 12765 SmallVector<SDValue, 8> Ops; 12766 for (unsigned i = 0, e = BV->getNumOperands(); i != e; 12767 i += NumInputsPerOutput) { 12768 bool isLE = DAG.getDataLayout().isLittleEndian(); 12769 APInt NewBits = APInt(DstBitSize, 0); 12770 bool EltIsUndef = true; 12771 for (unsigned j = 0; j != NumInputsPerOutput; ++j) { 12772 // Shift the previously computed bits over. 12773 NewBits <<= SrcBitSize; 12774 SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j)); 12775 if (Op.isUndef()) continue; 12776 EltIsUndef = false; 12777 12778 NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue(). 12779 zextOrTrunc(SrcBitSize).zext(DstBitSize); 12780 } 12781 12782 if (EltIsUndef) 12783 Ops.push_back(DAG.getUNDEF(DstEltVT)); 12784 else 12785 Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT)); 12786 } 12787 12788 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size()); 12789 return DAG.getBuildVector(VT, DL, Ops); 12790 } 12791 12792 // Finally, this must be the case where we are shrinking elements: each input 12793 // turns into multiple outputs. 12794 unsigned NumOutputsPerInput = SrcBitSize/DstBitSize; 12795 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, 12796 NumOutputsPerInput*BV->getNumOperands()); 12797 SmallVector<SDValue, 8> Ops; 12798 12799 for (const SDValue &Op : BV->op_values()) { 12800 if (Op.isUndef()) { 12801 Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT)); 12802 continue; 12803 } 12804 12805 APInt OpVal = cast<ConstantSDNode>(Op)-> 12806 getAPIntValue().zextOrTrunc(SrcBitSize); 12807 12808 for (unsigned j = 0; j != NumOutputsPerInput; ++j) { 12809 APInt ThisVal = OpVal.trunc(DstBitSize); 12810 Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT)); 12811 OpVal.lshrInPlace(DstBitSize); 12812 } 12813 12814 // For big endian targets, swap the order of the pieces of each element. 
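    // For example, splitting the i32 constant 0xAABBCCDD into i8 pieces pushes
    // DD, CC, BB, AA (low to high); big-endian element order needs AA, BB, CC,
    // DD, so reverse the pieces just appended.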
12815 if (DAG.getDataLayout().isBigEndian()) 12816 std::reverse(Ops.end()-NumOutputsPerInput, Ops.end()); 12817 } 12818 12819 return DAG.getBuildVector(VT, DL, Ops); 12820 } 12821 12822 static bool isContractable(SDNode *N) { 12823 SDNodeFlags F = N->getFlags(); 12824 return F.hasAllowContract() || F.hasAllowReassociation(); 12825 } 12826 12827 /// Try to perform FMA combining on a given FADD node. 12828 SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { 12829 SDValue N0 = N->getOperand(0); 12830 SDValue N1 = N->getOperand(1); 12831 EVT VT = N->getValueType(0); 12832 SDLoc SL(N); 12833 12834 const TargetOptions &Options = DAG.getTarget().Options; 12835 12836 // Floating-point multiply-add with intermediate rounding. 12837 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N)); 12838 12839 // Floating-point multiply-add without intermediate rounding. 12840 bool HasFMA = 12841 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && 12842 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); 12843 12844 // No valid opcode, do not combine. 12845 if (!HasFMAD && !HasFMA) 12846 return SDValue(); 12847 12848 bool CanFuse = Options.UnsafeFPMath || isContractable(N); 12849 bool CanReassociate = 12850 Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); 12851 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || 12852 CanFuse || HasFMAD); 12853 // If the addition is not contractable, do not combine. 12854 if (!AllowFusionGlobally && !isContractable(N)) 12855 return SDValue(); 12856 12857 if (TLI.generateFMAsInMachineCombiner(VT, OptLevel)) 12858 return SDValue(); 12859 12860 // Always prefer FMAD to FMA for precision. 12861 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; 12862 bool Aggressive = TLI.enableAggressiveFMAFusion(VT); 12863 12864 // Is the node an FMUL and contractable either due to global flags or 12865 // SDNodeFlags. 12866 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { 12867 if (N.getOpcode() != ISD::FMUL) 12868 return false; 12869 return AllowFusionGlobally || isContractable(N.getNode()); 12870 }; 12871 // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), 12872 // prefer to fold the multiply with fewer uses. 12873 if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { 12874 if (N0.getNode()->use_size() > N1.getNode()->use_size()) 12875 std::swap(N0, N1); 12876 } 12877 12878 // fold (fadd (fmul x, y), z) -> (fma x, y, z) 12879 if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { 12880 return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), 12881 N0.getOperand(1), N1); 12882 } 12883 12884 // fold (fadd x, (fmul y, z)) -> (fma y, z, x) 12885 // Note: Commutes FADD operands. 12886 if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { 12887 return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), 12888 N1.getOperand(1), N0); 12889 } 12890 12891 // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E) 12892 // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E) 12893 // This requires reassociation because it changes the order of operations. 
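  // The original expression evaluates (A * B + C * D) + E, while the result
  // evaluates A * B + (C * D + E), so the reassociation flag must permit it.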
12894 SDValue FMA, E; 12895 if (CanReassociate && N0.getOpcode() == PreferredFusedOpcode && 12896 N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() && 12897 N0.getOperand(2).hasOneUse()) { 12898 FMA = N0; 12899 E = N1; 12900 } else if (CanReassociate && N1.getOpcode() == PreferredFusedOpcode && 12901 N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() && 12902 N1.getOperand(2).hasOneUse()) { 12903 FMA = N1; 12904 E = N0; 12905 } 12906 if (FMA && E) { 12907 SDValue A = FMA.getOperand(0); 12908 SDValue B = FMA.getOperand(1); 12909 SDValue C = FMA.getOperand(2).getOperand(0); 12910 SDValue D = FMA.getOperand(2).getOperand(1); 12911 SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E); 12912 return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE); 12913 } 12914 12915 // Look through FP_EXTEND nodes to do more combining. 12916 12917 // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) 12918 if (N0.getOpcode() == ISD::FP_EXTEND) { 12919 SDValue N00 = N0.getOperand(0); 12920 if (isContractableFMUL(N00) && 12921 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12922 N00.getValueType())) { 12923 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12924 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), 12925 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), 12926 N1); 12927 } 12928 } 12929 12930 // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) 12931 // Note: Commutes FADD operands. 12932 if (N1.getOpcode() == ISD::FP_EXTEND) { 12933 SDValue N10 = N1.getOperand(0); 12934 if (isContractableFMUL(N10) && 12935 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12936 N10.getValueType())) { 12937 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12938 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), 12939 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), 12940 N0); 12941 } 12942 } 12943 12944 // More folding opportunities when target permits. 12945 if (Aggressive) { 12946 // fold (fadd (fma x, y, (fpext (fmul u, v))), z) 12947 // -> (fma x, y, (fma (fpext u), (fpext v), z)) 12948 auto FoldFAddFMAFPExtFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V, 12949 SDValue Z) { 12950 return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, 12951 DAG.getNode(PreferredFusedOpcode, SL, VT, 12952 DAG.getNode(ISD::FP_EXTEND, SL, VT, U), 12953 DAG.getNode(ISD::FP_EXTEND, SL, VT, V), 12954 Z)); 12955 }; 12956 if (N0.getOpcode() == PreferredFusedOpcode) { 12957 SDValue N02 = N0.getOperand(2); 12958 if (N02.getOpcode() == ISD::FP_EXTEND) { 12959 SDValue N020 = N02.getOperand(0); 12960 if (isContractableFMUL(N020) && 12961 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12962 N020.getValueType())) { 12963 return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), 12964 N020.getOperand(0), N020.getOperand(1), 12965 N1); 12966 } 12967 } 12968 } 12969 12970 // fold (fadd (fpext (fma x, y, (fmul u, v))), z) 12971 // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) 12972 // FIXME: This turns two single-precision and one double-precision 12973 // operation into two double-precision operations, which might not be 12974 // interesting for all targets, especially GPUs. 
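    // For example, with an f32 fma/fmul pair under an fpext to f64, both FMAs
    // produced here execute in f64.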
12975 auto FoldFAddFPExtFMAFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V, 12976 SDValue Z) { 12977 return DAG.getNode( 12978 PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X), 12979 DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), 12980 DAG.getNode(PreferredFusedOpcode, SL, VT, 12981 DAG.getNode(ISD::FP_EXTEND, SL, VT, U), 12982 DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); 12983 }; 12984 if (N0.getOpcode() == ISD::FP_EXTEND) { 12985 SDValue N00 = N0.getOperand(0); 12986 if (N00.getOpcode() == PreferredFusedOpcode) { 12987 SDValue N002 = N00.getOperand(2); 12988 if (isContractableFMUL(N002) && 12989 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12990 N00.getValueType())) { 12991 return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), 12992 N002.getOperand(0), N002.getOperand(1), 12993 N1); 12994 } 12995 } 12996 } 12997 12998 // fold (fadd x, (fma y, z, (fpext (fmul u, v))) 12999 // -> (fma y, z, (fma (fpext u), (fpext v), x)) 13000 if (N1.getOpcode() == PreferredFusedOpcode) { 13001 SDValue N12 = N1.getOperand(2); 13002 if (N12.getOpcode() == ISD::FP_EXTEND) { 13003 SDValue N120 = N12.getOperand(0); 13004 if (isContractableFMUL(N120) && 13005 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 13006 N120.getValueType())) { 13007 return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), 13008 N120.getOperand(0), N120.getOperand(1), 13009 N0); 13010 } 13011 } 13012 } 13013 13014 // fold (fadd x, (fpext (fma y, z, (fmul u, v))) 13015 // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) 13016 // FIXME: This turns two single-precision and one double-precision 13017 // operation into two double-precision operations, which might not be 13018 // interesting for all targets, especially GPUs. 13019 if (N1.getOpcode() == ISD::FP_EXTEND) { 13020 SDValue N10 = N1.getOperand(0); 13021 if (N10.getOpcode() == PreferredFusedOpcode) { 13022 SDValue N102 = N10.getOperand(2); 13023 if (isContractableFMUL(N102) && 13024 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 13025 N10.getValueType())) { 13026 return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), 13027 N102.getOperand(0), N102.getOperand(1), 13028 N0); 13029 } 13030 } 13031 } 13032 } 13033 13034 return SDValue(); 13035 } 13036 13037 /// Try to perform FMA combining on a given FSUB node. 13038 SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { 13039 SDValue N0 = N->getOperand(0); 13040 SDValue N1 = N->getOperand(1); 13041 EVT VT = N->getValueType(0); 13042 SDLoc SL(N); 13043 13044 const TargetOptions &Options = DAG.getTarget().Options; 13045 // Floating-point multiply-add with intermediate rounding. 13046 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N)); 13047 13048 // Floating-point multiply-add without intermediate rounding. 13049 bool HasFMA = 13050 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && 13051 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); 13052 13053 // No valid opcode, do not combine. 13054 if (!HasFMAD && !HasFMA) 13055 return SDValue(); 13056 13057 const SDNodeFlags Flags = N->getFlags(); 13058 bool CanFuse = Options.UnsafeFPMath || isContractable(N); 13059 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || 13060 CanFuse || HasFMAD); 13061 13062 // If the subtraction is not contractable, do not combine. 
13063 if (!AllowFusionGlobally && !isContractable(N)) 13064 return SDValue(); 13065 13066 if (TLI.generateFMAsInMachineCombiner(VT, OptLevel)) 13067 return SDValue(); 13068 13069 // Always prefer FMAD to FMA for precision. 13070 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; 13071 bool Aggressive = TLI.enableAggressiveFMAFusion(VT); 13072 bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros(); 13073 13074 // Is the node an FMUL and contractable either due to global flags or 13075 // SDNodeFlags. 13076 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { 13077 if (N.getOpcode() != ISD::FMUL) 13078 return false; 13079 return AllowFusionGlobally || isContractable(N.getNode()); 13080 }; 13081 13082 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) 13083 auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) { 13084 if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) { 13085 return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), 13086 XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z)); 13087 } 13088 return SDValue(); 13089 }; 13090 13091 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) 13092 // Note: Commutes FSUB operands. 13093 auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) { 13094 if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) { 13095 return DAG.getNode(PreferredFusedOpcode, SL, VT, 13096 DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), 13097 YZ.getOperand(1), X); 13098 } 13099 return SDValue(); 13100 }; 13101 13102 // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)), 13103 // prefer to fold the multiply with fewer uses. 13104 if (isContractableFMUL(N0) && isContractableFMUL(N1) && 13105 (N0.getNode()->use_size() > N1.getNode()->use_size())) { 13106 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b)) 13107 if (SDValue V = tryToFoldXSubYZ(N0, N1)) 13108 return V; 13109 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d))) 13110 if (SDValue V = tryToFoldXYSubZ(N0, N1)) 13111 return V; 13112 } else { 13113 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) 13114 if (SDValue V = tryToFoldXYSubZ(N0, N1)) 13115 return V; 13116 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) 13117 if (SDValue V = tryToFoldXSubYZ(N0, N1)) 13118 return V; 13119 } 13120 13121 // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) 13122 if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && 13123 (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { 13124 SDValue N00 = N0.getOperand(0).getOperand(0); 13125 SDValue N01 = N0.getOperand(0).getOperand(1); 13126 return DAG.getNode(PreferredFusedOpcode, SL, VT, 13127 DAG.getNode(ISD::FNEG, SL, VT, N00), N01, 13128 DAG.getNode(ISD::FNEG, SL, VT, N1)); 13129 } 13130 13131 // Look through FP_EXTEND nodes to do more combining. 

  // fold (fsub (fpext (fmul x, y)), z)
  //   -> (fma (fpext x), (fpext y), (fneg z))
  if (N0.getOpcode() == ISD::FP_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    if (isContractableFMUL(N00) &&
        TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                            N00.getValueType())) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
                         DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
                         DAG.getNode(ISD::FNEG, SL, VT, N1));
    }
  }

  // fold (fsub x, (fpext (fmul y, z)))
  //   -> (fma (fneg (fpext y)), (fpext z), x)
  // Note: Commutes FSUB operands.
  if (N1.getOpcode() == ISD::FP_EXTEND) {
    SDValue N10 = N1.getOperand(0);
    if (isContractableFMUL(N10) &&
        TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                            N10.getValueType())) {
      return DAG.getNode(
          PreferredFusedOpcode, SL, VT,
          DAG.getNode(ISD::FNEG, SL, VT,
                      DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))),
          DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0);
    }
  }

  // fold (fsub (fpext (fneg (fmul x, y))), z)
  //   -> (fneg (fma (fpext x), (fpext y), z))
  // Note: This could be removed with appropriate canonicalization of the
  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
  // us from implementing the canonicalization in visitFSUB.
  if (N0.getOpcode() == ISD::FP_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == ISD::FNEG) {
      SDValue N000 = N00.getOperand(0);
      if (isContractableFMUL(N000) &&
          TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                              N00.getValueType())) {
        return DAG.getNode(
            ISD::FNEG, SL, VT,
            DAG.getNode(PreferredFusedOpcode, SL, VT,
                        DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
                        DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
                        N1));
      }
    }
  }

  // fold (fsub (fneg (fpext (fmul x, y))), z)
  //   -> (fneg (fma (fpext x), (fpext y), z))
  // Note: This could be removed with appropriate canonicalization of the
  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
  // us from implementing the canonicalization in visitFSUB.
  if (N0.getOpcode() == ISD::FNEG) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == ISD::FP_EXTEND) {
      SDValue N000 = N00.getOperand(0);
      if (isContractableFMUL(N000) &&
          TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                              N000.getValueType())) {
        return DAG.getNode(
            ISD::FNEG, SL, VT,
            DAG.getNode(PreferredFusedOpcode, SL, VT,
                        DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
                        DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
                        N1));
      }
    }
  }

  // More folding opportunities when target permits.
13210 if (Aggressive) { 13211 // fold (fsub (fma x, y, (fmul u, v)), z) 13212 // -> (fma x, y (fma u, v, (fneg z))) 13213 if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && 13214 isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && 13215 N0.getOperand(2)->hasOneUse()) { 13216 return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), 13217 N0.getOperand(1), 13218 DAG.getNode(PreferredFusedOpcode, SL, VT, 13219 N0.getOperand(2).getOperand(0), 13220 N0.getOperand(2).getOperand(1), 13221 DAG.getNode(ISD::FNEG, SL, VT, N1))); 13222 } 13223 13224 // fold (fsub x, (fma y, z, (fmul u, v))) 13225 // -> (fma (fneg y), z, (fma (fneg u), v, x)) 13226 if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && 13227 isContractableFMUL(N1.getOperand(2)) && 13228 N1->hasOneUse() && NoSignedZero) { 13229 SDValue N20 = N1.getOperand(2).getOperand(0); 13230 SDValue N21 = N1.getOperand(2).getOperand(1); 13231 return DAG.getNode( 13232 PreferredFusedOpcode, SL, VT, 13233 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), 13234 DAG.getNode(PreferredFusedOpcode, SL, VT, 13235 DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0)); 13236 } 13237 13238 13239 // fold (fsub (fma x, y, (fpext (fmul u, v))), z) 13240 // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) 13241 if (N0.getOpcode() == PreferredFusedOpcode && 13242 N0->hasOneUse()) { 13243 SDValue N02 = N0.getOperand(2); 13244 if (N02.getOpcode() == ISD::FP_EXTEND) { 13245 SDValue N020 = N02.getOperand(0); 13246 if (isContractableFMUL(N020) && 13247 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 13248 N020.getValueType())) { 13249 return DAG.getNode( 13250 PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), 13251 DAG.getNode( 13252 PreferredFusedOpcode, SL, VT, 13253 DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)), 13254 DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)), 13255 DAG.getNode(ISD::FNEG, SL, VT, N1))); 13256 } 13257 } 13258 } 13259 13260 // fold (fsub (fpext (fma x, y, (fmul u, v))), z) 13261 // -> (fma (fpext x), (fpext y), 13262 // (fma (fpext u), (fpext v), (fneg z))) 13263 // FIXME: This turns two single-precision and one double-precision 13264 // operation into two double-precision operations, which might not be 13265 // interesting for all targets, especially GPUs. 
13266 if (N0.getOpcode() == ISD::FP_EXTEND) { 13267 SDValue N00 = N0.getOperand(0); 13268 if (N00.getOpcode() == PreferredFusedOpcode) { 13269 SDValue N002 = N00.getOperand(2); 13270 if (isContractableFMUL(N002) && 13271 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 13272 N00.getValueType())) { 13273 return DAG.getNode( 13274 PreferredFusedOpcode, SL, VT, 13275 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), 13276 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), 13277 DAG.getNode( 13278 PreferredFusedOpcode, SL, VT, 13279 DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)), 13280 DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)), 13281 DAG.getNode(ISD::FNEG, SL, VT, N1))); 13282 } 13283 } 13284 } 13285 13286 // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) 13287 // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) 13288 if (N1.getOpcode() == PreferredFusedOpcode && 13289 N1.getOperand(2).getOpcode() == ISD::FP_EXTEND && 13290 N1->hasOneUse()) { 13291 SDValue N120 = N1.getOperand(2).getOperand(0); 13292 if (isContractableFMUL(N120) && 13293 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 13294 N120.getValueType())) { 13295 SDValue N1200 = N120.getOperand(0); 13296 SDValue N1201 = N120.getOperand(1); 13297 return DAG.getNode( 13298 PreferredFusedOpcode, SL, VT, 13299 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), 13300 DAG.getNode(PreferredFusedOpcode, SL, VT, 13301 DAG.getNode(ISD::FNEG, SL, VT, 13302 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1200)), 13303 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0)); 13304 } 13305 } 13306 13307 // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) 13308 // -> (fma (fneg (fpext y)), (fpext z), 13309 // (fma (fneg (fpext u)), (fpext v), x)) 13310 // FIXME: This turns two single-precision and one double-precision 13311 // operation into two double-precision operations, which might not be 13312 // interesting for all targets, especially GPUs. 13313 if (N1.getOpcode() == ISD::FP_EXTEND && 13314 N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { 13315 SDValue CvtSrc = N1.getOperand(0); 13316 SDValue N100 = CvtSrc.getOperand(0); 13317 SDValue N101 = CvtSrc.getOperand(1); 13318 SDValue N102 = CvtSrc.getOperand(2); 13319 if (isContractableFMUL(N102) && 13320 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 13321 CvtSrc.getValueType())) { 13322 SDValue N1020 = N102.getOperand(0); 13323 SDValue N1021 = N102.getOperand(1); 13324 return DAG.getNode( 13325 PreferredFusedOpcode, SL, VT, 13326 DAG.getNode(ISD::FNEG, SL, VT, 13327 DAG.getNode(ISD::FP_EXTEND, SL, VT, N100)), 13328 DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), 13329 DAG.getNode(PreferredFusedOpcode, SL, VT, 13330 DAG.getNode(ISD::FNEG, SL, VT, 13331 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1020)), 13332 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0)); 13333 } 13334 } 13335 } 13336 13337 return SDValue(); 13338 } 13339 13340 /// Try to perform FMA combining on a given FMUL node based on the distributive 13341 /// law x * (y + 1) = x * y + x and variants thereof (commuted versions, 13342 /// subtraction instead of addition). 
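/// For example, (fmul (fadd x, +1.0), y) becomes (fma x, y, y) and
/// (fmul (fsub x, +1.0), y) becomes (fma x, y, (fneg y)).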
13343 SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { 13344 SDValue N0 = N->getOperand(0); 13345 SDValue N1 = N->getOperand(1); 13346 EVT VT = N->getValueType(0); 13347 SDLoc SL(N); 13348 13349 assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); 13350 13351 const TargetOptions &Options = DAG.getTarget().Options; 13352 13353 // The transforms below are incorrect when x == 0 and y == inf, because the 13354 // intermediate multiplication produces a nan. 13355 if (!Options.NoInfsFPMath) 13356 return SDValue(); 13357 13358 // Floating-point multiply-add without intermediate rounding. 13359 bool HasFMA = 13360 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && 13361 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && 13362 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); 13363 13364 // Floating-point multiply-add with intermediate rounding. This can result 13365 // in a less precise result due to the changed rounding order. 13366 bool HasFMAD = Options.UnsafeFPMath && 13367 (LegalOperations && TLI.isFMADLegal(DAG, N)); 13368 13369 // No valid opcode, do not combine. 13370 if (!HasFMAD && !HasFMA) 13371 return SDValue(); 13372 13373 // Always prefer FMAD to FMA for precision. 13374 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; 13375 bool Aggressive = TLI.enableAggressiveFMAFusion(VT); 13376 13377 // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y) 13378 // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y)) 13379 auto FuseFADD = [&](SDValue X, SDValue Y) { 13380 if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { 13381 if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) { 13382 if (C->isExactlyValue(+1.0)) 13383 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, 13384 Y); 13385 if (C->isExactlyValue(-1.0)) 13386 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, 13387 DAG.getNode(ISD::FNEG, SL, VT, Y)); 13388 } 13389 } 13390 return SDValue(); 13391 }; 13392 13393 if (SDValue FMA = FuseFADD(N0, N1)) 13394 return FMA; 13395 if (SDValue FMA = FuseFADD(N1, N0)) 13396 return FMA; 13397 13398 // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y) 13399 // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y)) 13400 // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y)) 13401 // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y) 13402 auto FuseFSUB = [&](SDValue X, SDValue Y) { 13403 if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { 13404 if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) { 13405 if (C0->isExactlyValue(+1.0)) 13406 return DAG.getNode(PreferredFusedOpcode, SL, VT, 13407 DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, 13408 Y); 13409 if (C0->isExactlyValue(-1.0)) 13410 return DAG.getNode(PreferredFusedOpcode, SL, VT, 13411 DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, 13412 DAG.getNode(ISD::FNEG, SL, VT, Y)); 13413 } 13414 if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) { 13415 if (C1->isExactlyValue(+1.0)) 13416 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, 13417 DAG.getNode(ISD::FNEG, SL, VT, Y)); 13418 if (C1->isExactlyValue(-1.0)) 13419 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, 13420 Y); 13421 } 13422 } 13423 return SDValue(); 13424 }; 13425 13426 if (SDValue FMA = FuseFSUB(N0, N1)) 13427 return FMA; 13428 if (SDValue FMA = FuseFSUB(N1, N0)) 13429 return FMA; 13430 13431 return SDValue(); 13432 } 13433 
13434 SDValue DAGCombiner::visitFADD(SDNode *N) { 13435 SDValue N0 = N->getOperand(0); 13436 SDValue N1 = N->getOperand(1); 13437 bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); 13438 bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); 13439 EVT VT = N->getValueType(0); 13440 SDLoc DL(N); 13441 const TargetOptions &Options = DAG.getTarget().Options; 13442 SDNodeFlags Flags = N->getFlags(); 13443 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 13444 13445 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) 13446 return R; 13447 13448 // fold vector ops 13449 if (VT.isVector()) 13450 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 13451 return FoldedVOp; 13452 13453 // fold (fadd c1, c2) -> c1 + c2 13454 if (N0CFP && N1CFP) 13455 return DAG.getNode(ISD::FADD, DL, VT, N0, N1); 13456 13457 // canonicalize constant to RHS 13458 if (N0CFP && !N1CFP) 13459 return DAG.getNode(ISD::FADD, DL, VT, N1, N0); 13460 13461 // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math) 13462 ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true); 13463 if (N1C && N1C->isZero()) 13464 if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) 13465 return N0; 13466 13467 if (SDValue NewSel = foldBinOpIntoSelect(N)) 13468 return NewSel; 13469 13470 // fold (fadd A, (fneg B)) -> (fsub A, B) 13471 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) 13472 if (SDValue NegN1 = TLI.getCheaperNegatedExpression( 13473 N1, DAG, LegalOperations, ForCodeSize)) 13474 return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1); 13475 13476 // fold (fadd (fneg A), B) -> (fsub B, A) 13477 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) 13478 if (SDValue NegN0 = TLI.getCheaperNegatedExpression( 13479 N0, DAG, LegalOperations, ForCodeSize)) 13480 return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0); 13481 13482 auto isFMulNegTwo = [](SDValue FMul) { 13483 if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL) 13484 return false; 13485 auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true); 13486 return C && C->isExactlyValue(-2.0); 13487 }; 13488 13489 // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B) 13490 if (isFMulNegTwo(N0)) { 13491 SDValue B = N0.getOperand(0); 13492 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B); 13493 return DAG.getNode(ISD::FSUB, DL, VT, N1, Add); 13494 } 13495 // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B) 13496 if (isFMulNegTwo(N1)) { 13497 SDValue B = N1.getOperand(0); 13498 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B); 13499 return DAG.getNode(ISD::FSUB, DL, VT, N0, Add); 13500 } 13501 13502 // No FP constant should be created after legalization as Instruction 13503 // Selection pass has a hard time dealing with FP constants. 13504 bool AllowNewConst = (Level < AfterLegalizeDAG); 13505 13506 // If nnan is enabled, fold lots of things. 13507 if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) { 13508 // If allowed, fold (fadd (fneg x), x) -> 0.0 13509 if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) 13510 return DAG.getConstantFP(0.0, DL, VT); 13511 13512 // If allowed, fold (fadd x, (fneg x)) -> 0.0 13513 if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) 13514 return DAG.getConstantFP(0.0, DL, VT); 13515 } 13516 13517 // If 'unsafe math' or reassoc and nsz, fold lots of things. 
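  // For example, with reassoc and nsz, (fadd (fadd x, c1), c2) can be
  // reassociated to (fadd x, (fadd c1, c2)) so that the two constants fold
  // together.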
13518 // TODO: break out portions of the transformations below for which Unsafe is 13519 // considered and which do not require both nsz and reassoc 13520 if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) || 13521 (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && 13522 AllowNewConst) { 13523 // fadd (fadd x, c1), c2 -> fadd x, c1 + c2 13524 if (N1CFP && N0.getOpcode() == ISD::FADD && 13525 DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { 13526 SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1); 13527 return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC); 13528 } 13529 13530 // We can fold chains of FADD's of the same value into multiplications. 13531 // This transform is not safe in general because we are reducing the number 13532 // of rounding steps. 13533 if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { 13534 if (N0.getOpcode() == ISD::FMUL) { 13535 bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); 13536 bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); 13537 13538 // (fadd (fmul x, c), x) -> (fmul x, c+1) 13539 if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { 13540 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), 13541 DAG.getConstantFP(1.0, DL, VT)); 13542 return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP); 13543 } 13544 13545 // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) 13546 if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && 13547 N1.getOperand(0) == N1.getOperand(1) && 13548 N0.getOperand(0) == N1.getOperand(0)) { 13549 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), 13550 DAG.getConstantFP(2.0, DL, VT)); 13551 return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP); 13552 } 13553 } 13554 13555 if (N1.getOpcode() == ISD::FMUL) { 13556 bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); 13557 bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); 13558 13559 // (fadd x, (fmul x, c)) -> (fmul x, c+1) 13560 if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { 13561 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), 13562 DAG.getConstantFP(1.0, DL, VT)); 13563 return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP); 13564 } 13565 13566 // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) 13567 if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && 13568 N0.getOperand(0) == N0.getOperand(1) && 13569 N1.getOperand(0) == N0.getOperand(0)) { 13570 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), 13571 DAG.getConstantFP(2.0, DL, VT)); 13572 return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP); 13573 } 13574 } 13575 13576 if (N0.getOpcode() == ISD::FADD) { 13577 bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); 13578 // (fadd (fadd x, x), x) -> (fmul x, 3.0) 13579 if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && 13580 (N0.getOperand(0) == N1)) { 13581 return DAG.getNode(ISD::FMUL, DL, VT, N1, 13582 DAG.getConstantFP(3.0, DL, VT)); 13583 } 13584 } 13585 13586 if (N1.getOpcode() == ISD::FADD) { 13587 bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); 13588 // (fadd x, (fadd x, x)) -> (fmul x, 3.0) 13589 if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && 13590 N1.getOperand(0) == N0) { 13591 return DAG.getNode(ISD::FMUL, DL, VT, N0, 13592 DAG.getConstantFP(3.0, DL, VT)); 13593 } 13594 } 13595 13596 // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0) 13597 if (N0.getOpcode() == ISD::FADD && 
N1.getOpcode() == ISD::FADD && 13598 N0.getOperand(0) == N0.getOperand(1) && 13599 N1.getOperand(0) == N1.getOperand(1) && 13600 N0.getOperand(0) == N1.getOperand(0)) { 13601 return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), 13602 DAG.getConstantFP(4.0, DL, VT)); 13603 } 13604 } 13605 } // enable-unsafe-fp-math 13606 13607 // FADD -> FMA combines: 13608 if (SDValue Fused = visitFADDForFMACombine(N)) { 13609 AddToWorklist(Fused.getNode()); 13610 return Fused; 13611 } 13612 return SDValue(); 13613 } 13614 13615 SDValue DAGCombiner::visitSTRICT_FADD(SDNode *N) { 13616 SDValue Chain = N->getOperand(0); 13617 SDValue N0 = N->getOperand(1); 13618 SDValue N1 = N->getOperand(2); 13619 EVT VT = N->getValueType(0); 13620 EVT ChainVT = N->getValueType(1); 13621 SDLoc DL(N); 13622 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 13623 13624 // fold (strict_fadd A, (fneg B)) -> (strict_fsub A, B) 13625 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT)) 13626 if (SDValue NegN1 = TLI.getCheaperNegatedExpression( 13627 N1, DAG, LegalOperations, ForCodeSize)) { 13628 return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT), 13629 {Chain, N0, NegN1}); 13630 } 13631 13632 // fold (strict_fadd (fneg A), B) -> (strict_fsub B, A) 13633 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT)) 13634 if (SDValue NegN0 = TLI.getCheaperNegatedExpression( 13635 N0, DAG, LegalOperations, ForCodeSize)) { 13636 return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT), 13637 {Chain, N1, NegN0}); 13638 } 13639 return SDValue(); 13640 } 13641 13642 SDValue DAGCombiner::visitFSUB(SDNode *N) { 13643 SDValue N0 = N->getOperand(0); 13644 SDValue N1 = N->getOperand(1); 13645 ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true); 13646 ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true); 13647 EVT VT = N->getValueType(0); 13648 SDLoc DL(N); 13649 const TargetOptions &Options = DAG.getTarget().Options; 13650 const SDNodeFlags Flags = N->getFlags(); 13651 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 13652 13653 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) 13654 return R; 13655 13656 // fold vector ops 13657 if (VT.isVector()) 13658 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 13659 return FoldedVOp; 13660 13661 // fold (fsub c1, c2) -> c1-c2 13662 if (N0CFP && N1CFP) 13663 return DAG.getNode(ISD::FSUB, DL, VT, N0, N1); 13664 13665 if (SDValue NewSel = foldBinOpIntoSelect(N)) 13666 return NewSel; 13667 13668 // (fsub A, 0) -> A 13669 if (N1CFP && N1CFP->isZero()) { 13670 if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath || 13671 Flags.hasNoSignedZeros()) { 13672 return N0; 13673 } 13674 } 13675 13676 if (N0 == N1) { 13677 // (fsub x, x) -> 0.0 13678 if (Options.NoNaNsFPMath || Flags.hasNoNaNs()) 13679 return DAG.getConstantFP(0.0f, DL, VT); 13680 } 13681 13682 // (fsub -0.0, N1) -> -N1 13683 if (N0CFP && N0CFP->isZero()) { 13684 if (N0CFP->isNegative() || 13685 (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) { 13686 // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are 13687 // flushed to zero, unless all users treat denorms as zero (DAZ). 13688 // FIXME: This transform will change the sign of a NaN and the behavior 13689 // of a signaling NaN. It is only valid when a NoNaN flag is present. 
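      // (An fneg is typically implemented as a bare sign-bit flip that is not
      // subject to output flushing, so it could preserve a denormal value that
      // the fsub would have flushed to zero.)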
13690 DenormalMode DenormMode = DAG.getDenormalMode(VT); 13691 if (DenormMode == DenormalMode::getIEEE()) { 13692 if (SDValue NegN1 = 13693 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize)) 13694 return NegN1; 13695 if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) 13696 return DAG.getNode(ISD::FNEG, DL, VT, N1); 13697 } 13698 } 13699 } 13700 13701 if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) || 13702 (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && 13703 N1.getOpcode() == ISD::FADD) { 13704 // X - (X + Y) -> -Y 13705 if (N0 == N1->getOperand(0)) 13706 return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1)); 13707 // X - (Y + X) -> -Y 13708 if (N0 == N1->getOperand(1)) 13709 return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0)); 13710 } 13711 13712 // fold (fsub A, (fneg B)) -> (fadd A, B) 13713 if (SDValue NegN1 = 13714 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize)) 13715 return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1); 13716 13717 // FSUB -> FMA combines: 13718 if (SDValue Fused = visitFSUBForFMACombine(N)) { 13719 AddToWorklist(Fused.getNode()); 13720 return Fused; 13721 } 13722 13723 return SDValue(); 13724 } 13725 13726 SDValue DAGCombiner::visitFMUL(SDNode *N) { 13727 SDValue N0 = N->getOperand(0); 13728 SDValue N1 = N->getOperand(1); 13729 ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true); 13730 ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true); 13731 EVT VT = N->getValueType(0); 13732 SDLoc DL(N); 13733 const TargetOptions &Options = DAG.getTarget().Options; 13734 const SDNodeFlags Flags = N->getFlags(); 13735 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 13736 13737 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) 13738 return R; 13739 13740 // fold vector ops 13741 if (VT.isVector()) { 13742 // This just handles C1 * C2 for vectors. Other vector folds are below. 13743 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 13744 return FoldedVOp; 13745 } 13746 13747 // fold (fmul c1, c2) -> c1*c2 13748 if (N0CFP && N1CFP) 13749 return DAG.getNode(ISD::FMUL, DL, VT, N0, N1); 13750 13751 // canonicalize constant to RHS 13752 if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && 13753 !DAG.isConstantFPBuildVectorOrConstantFP(N1)) 13754 return DAG.getNode(ISD::FMUL, DL, VT, N1, N0); 13755 13756 if (SDValue NewSel = foldBinOpIntoSelect(N)) 13757 return NewSel; 13758 13759 if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) { 13760 // fmul (fmul X, C1), C2 -> fmul X, C1 * C2 13761 if (DAG.isConstantFPBuildVectorOrConstantFP(N1) && 13762 N0.getOpcode() == ISD::FMUL) { 13763 SDValue N00 = N0.getOperand(0); 13764 SDValue N01 = N0.getOperand(1); 13765 // Avoid an infinite loop by making sure that N00 is not a constant 13766 // (the inner multiply has not been constant folded yet). 13767 if (DAG.isConstantFPBuildVectorOrConstantFP(N01) && 13768 !DAG.isConstantFPBuildVectorOrConstantFP(N00)) { 13769 SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1); 13770 return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts); 13771 } 13772 } 13773 13774 // Match a special-case: we convert X * 2.0 into fadd. 
13775 // fmul (fadd X, X), C -> fmul X, 2.0 * C 13776 if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() && 13777 N0.getOperand(0) == N0.getOperand(1)) { 13778 const SDValue Two = DAG.getConstantFP(2.0, DL, VT); 13779 SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1); 13780 return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts); 13781 } 13782 } 13783 13784 // fold (fmul X, 2.0) -> (fadd X, X) 13785 if (N1CFP && N1CFP->isExactlyValue(+2.0)) 13786 return DAG.getNode(ISD::FADD, DL, VT, N0, N0); 13787 13788 // fold (fmul X, -1.0) -> (fneg X) 13789 if (N1CFP && N1CFP->isExactlyValue(-1.0)) 13790 if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) 13791 return DAG.getNode(ISD::FNEG, DL, VT, N0); 13792 13793 // -N0 * -N1 --> N0 * N1 13794 TargetLowering::NegatibleCost CostN0 = 13795 TargetLowering::NegatibleCost::Expensive; 13796 TargetLowering::NegatibleCost CostN1 = 13797 TargetLowering::NegatibleCost::Expensive; 13798 SDValue NegN0 = 13799 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0); 13800 SDValue NegN1 = 13801 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1); 13802 if (NegN0 && NegN1 && 13803 (CostN0 == TargetLowering::NegatibleCost::Cheaper || 13804 CostN1 == TargetLowering::NegatibleCost::Cheaper)) 13805 return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1); 13806 13807 // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) 13808 // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) 13809 if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() && 13810 (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) && 13811 TLI.isOperationLegal(ISD::FABS, VT)) { 13812 SDValue Select = N0, X = N1; 13813 if (Select.getOpcode() != ISD::SELECT) 13814 std::swap(Select, X); 13815 13816 SDValue Cond = Select.getOperand(0); 13817 auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1)); 13818 auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2)); 13819 13820 if (TrueOpnd && FalseOpnd && 13821 Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X && 13822 isa<ConstantFPSDNode>(Cond.getOperand(1)) && 13823 cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) { 13824 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 13825 switch (CC) { 13826 default: break; 13827 case ISD::SETOLT: 13828 case ISD::SETULT: 13829 case ISD::SETOLE: 13830 case ISD::SETULE: 13831 case ISD::SETLT: 13832 case ISD::SETLE: 13833 std::swap(TrueOpnd, FalseOpnd); 13834 LLVM_FALLTHROUGH; 13835 case ISD::SETOGT: 13836 case ISD::SETUGT: 13837 case ISD::SETOGE: 13838 case ISD::SETUGE: 13839 case ISD::SETGT: 13840 case ISD::SETGE: 13841 if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) && 13842 TLI.isOperationLegal(ISD::FNEG, VT)) 13843 return DAG.getNode(ISD::FNEG, DL, VT, 13844 DAG.getNode(ISD::FABS, DL, VT, X)); 13845 if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0)) 13846 return DAG.getNode(ISD::FABS, DL, VT, X); 13847 13848 break; 13849 } 13850 } 13851 } 13852 13853 // FMUL -> FMA combines: 13854 if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) { 13855 AddToWorklist(Fused.getNode()); 13856 return Fused; 13857 } 13858 13859 return SDValue(); 13860 } 13861 13862 SDValue DAGCombiner::visitFMA(SDNode *N) { 13863 SDValue N0 = N->getOperand(0); 13864 SDValue N1 = N->getOperand(1); 13865 SDValue N2 = N->getOperand(2); 13866 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 13867 ConstantFPSDNode *N1CFP = 
dyn_cast<ConstantFPSDNode>(N1); 13868 EVT VT = N->getValueType(0); 13869 SDLoc DL(N); 13870 const TargetOptions &Options = DAG.getTarget().Options; 13871 // FMA nodes have flags that propagate to the created nodes. 13872 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 13873 13874 bool UnsafeFPMath = 13875 Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); 13876 13877 // Constant fold FMA. 13878 if (isa<ConstantFPSDNode>(N0) && 13879 isa<ConstantFPSDNode>(N1) && 13880 isa<ConstantFPSDNode>(N2)) { 13881 return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); 13882 } 13883 13884 // (-N0 * -N1) + N2 --> (N0 * N1) + N2 13885 TargetLowering::NegatibleCost CostN0 = 13886 TargetLowering::NegatibleCost::Expensive; 13887 TargetLowering::NegatibleCost CostN1 = 13888 TargetLowering::NegatibleCost::Expensive; 13889 SDValue NegN0 = 13890 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0); 13891 SDValue NegN1 = 13892 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1); 13893 if (NegN0 && NegN1 && 13894 (CostN0 == TargetLowering::NegatibleCost::Cheaper || 13895 CostN1 == TargetLowering::NegatibleCost::Cheaper)) 13896 return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2); 13897 13898 if (UnsafeFPMath) { 13899 if (N0CFP && N0CFP->isZero()) 13900 return N2; 13901 if (N1CFP && N1CFP->isZero()) 13902 return N2; 13903 } 13904 13905 if (N0CFP && N0CFP->isExactlyValue(1.0)) 13906 return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); 13907 if (N1CFP && N1CFP->isExactlyValue(1.0)) 13908 return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); 13909 13910 // Canonicalize (fma c, x, y) -> (fma x, c, y) 13911 if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && 13912 !DAG.isConstantFPBuildVectorOrConstantFP(N1)) 13913 return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); 13914 13915 if (UnsafeFPMath) { 13916 // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) 13917 if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && 13918 DAG.isConstantFPBuildVectorOrConstantFP(N1) && 13919 DAG.isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { 13920 return DAG.getNode(ISD::FMUL, DL, VT, N0, 13921 DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1))); 13922 } 13923 13924 // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) 13925 if (N0.getOpcode() == ISD::FMUL && 13926 DAG.isConstantFPBuildVectorOrConstantFP(N1) && 13927 DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { 13928 return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), 13929 DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1)), 13930 N2); 13931 } 13932 } 13933 13934 // (fma x, -1, y) -> (fadd (fneg x), y) 13935 if (N1CFP) { 13936 if (N1CFP->isExactlyValue(1.0)) 13937 return DAG.getNode(ISD::FADD, DL, VT, N0, N2); 13938 13939 if (N1CFP->isExactlyValue(-1.0) && 13940 (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { 13941 SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0); 13942 AddToWorklist(RHSNeg.getNode()); 13943 return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); 13944 } 13945 13946 // fma (fneg x), K, y -> fma x -K, y 13947 if (N0.getOpcode() == ISD::FNEG && 13948 (TLI.isOperationLegal(ISD::ConstantFP, VT) || 13949 (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT, 13950 ForCodeSize)))) { 13951 return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), 13952 DAG.getNode(ISD::FNEG, DL, VT, N1), N2); 13953 } 13954 } 13955 13956 if (UnsafeFPMath) { 13957 // (fma x, c, x) -> (fmul x, (c+1)) 13958 if (N1CFP && N0 == N2) { 13959 return DAG.getNode( 13960 ISD::FMUL, DL, VT, N0, 13961 
DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(1.0, DL, VT))); 13962 } 13963 13964 // (fma x, c, (fneg x)) -> (fmul x, (c-1)) 13965 if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { 13966 return DAG.getNode( 13967 ISD::FMUL, DL, VT, N0, 13968 DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(-1.0, DL, VT))); 13969 } 13970 } 13971 13972 // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z)) 13973 // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z)) 13974 if (!TLI.isFNegFree(VT)) 13975 if (SDValue Neg = TLI.getCheaperNegatedExpression( 13976 SDValue(N, 0), DAG, LegalOperations, ForCodeSize)) 13977 return DAG.getNode(ISD::FNEG, DL, VT, Neg); 13978 return SDValue(); 13979 } 13980 13981 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 13982 // reciprocal. 13983 // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) 13984 // Notice that this is not always beneficial. One reason is different targets 13985 // may have different costs for FDIV and FMUL, so sometimes the cost of two 13986 // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason 13987 // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". 13988 SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { 13989 // TODO: Limit this transform based on optsize/minsize - it always creates at 13990 // least 1 extra instruction. But the perf win may be substantial enough 13991 // that only minsize should restrict this. 13992 bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; 13993 const SDNodeFlags Flags = N->getFlags(); 13994 if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal())) 13995 return SDValue(); 13996 13997 // Skip if current node is a reciprocal/fneg-reciprocal. 13998 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 13999 ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true); 14000 if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0))) 14001 return SDValue(); 14002 14003 // Exit early if the target does not want this transform or if there can't 14004 // possibly be enough uses of the divisor to make the transform worthwhile. 14005 unsigned MinUses = TLI.combineRepeatedFPDivisors(); 14006 14007 // For splat vectors, scale the number of uses by the splat factor. If we can 14008 // convert the division into a scalar op, that will likely be much faster. 14009 unsigned NumElts = 1; 14010 EVT VT = N->getValueType(0); 14011 if (VT.isVector() && DAG.isSplatValue(N1)) 14012 NumElts = VT.getVectorNumElements(); 14013 14014 if (!MinUses || (N1->use_size() * NumElts) < MinUses) 14015 return SDValue(); 14016 14017 // Find all FDIV users of the same divisor. 14018 // Use a set because duplicates may be present in the user list. 14019 SetVector<SDNode *> Users; 14020 for (auto *U : N1->uses()) { 14021 if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { 14022 // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet. 14023 if (U->getOperand(1).getOpcode() == ISD::FSQRT && 14024 U->getOperand(0) == U->getOperand(1).getOperand(0) && 14025 U->getFlags().hasAllowReassociation() && 14026 U->getFlags().hasNoSignedZeros()) 14027 continue; 14028 14029 // This division is eligible for optimization only if global unsafe math 14030 // is enabled or if this division allows reciprocal formation. 
14031 if (UnsafeMath || U->getFlags().hasAllowReciprocal()) 14032 Users.insert(U); 14033 } 14034 } 14035 14036 // Now that we have the actual number of divisor uses, make sure it meets 14037 // the minimum threshold specified by the target. 14038 if ((Users.size() * NumElts) < MinUses) 14039 return SDValue(); 14040 14041 SDLoc DL(N); 14042 SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); 14043 SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags); 14044 14045 // Dividend / Divisor -> Dividend * Reciprocal 14046 for (auto *U : Users) { 14047 SDValue Dividend = U->getOperand(0); 14048 if (Dividend != FPOne) { 14049 SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, 14050 Reciprocal, Flags); 14051 CombineTo(U, NewNode); 14052 } else if (U != Reciprocal.getNode()) { 14053 // In the absence of fast-math-flags, this user node is always the 14054 // same node as Reciprocal, but with FMF they may be different nodes. 14055 CombineTo(U, Reciprocal); 14056 } 14057 } 14058 return SDValue(N, 0); // N was replaced. 14059 } 14060 14061 SDValue DAGCombiner::visitFDIV(SDNode *N) { 14062 SDValue N0 = N->getOperand(0); 14063 SDValue N1 = N->getOperand(1); 14064 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 14065 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); 14066 EVT VT = N->getValueType(0); 14067 SDLoc DL(N); 14068 const TargetOptions &Options = DAG.getTarget().Options; 14069 SDNodeFlags Flags = N->getFlags(); 14070 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 14071 14072 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) 14073 return R; 14074 14075 // fold vector ops 14076 if (VT.isVector()) 14077 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 14078 return FoldedVOp; 14079 14080 // fold (fdiv c1, c2) -> c1/c2 14081 if (N0CFP && N1CFP) 14082 return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); 14083 14084 if (SDValue NewSel = foldBinOpIntoSelect(N)) 14085 return NewSel; 14086 14087 if (SDValue V = combineRepeatedFPDivisors(N)) 14088 return V; 14089 14090 if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) { 14091 // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. 14092 if (N1CFP) { 14093 // Compute the reciprocal 1.0 / c2. 14094 const APFloat &N1APF = N1CFP->getValueAPF(); 14095 APFloat Recip(N1APF.getSemantics(), 1); // 1.0 14096 APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); 14097 // Only do the transform if the reciprocal is a legal fp immediate that 14098 // isn't too nasty (eg NaN, denormal, ...). 14099 if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty 14100 (!LegalOperations || 14101 // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM 14102 // backend)... we should handle this gracefully after Legalize. 14103 // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) || 14104 TLI.isOperationLegal(ISD::ConstantFP, VT) || 14105 TLI.isFPImmLegal(Recip, VT, ForCodeSize))) 14106 return DAG.getNode(ISD::FMUL, DL, VT, N0, 14107 DAG.getConstantFP(Recip, DL, VT)); 14108 } 14109 14110 // If this FDIV is part of a reciprocal square root, it may be folded 14111 // into a target-specific square root estimate instruction. 
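    // For example, (fdiv X, (fsqrt Y)) can become (fmul X, (rsqrt estimate of Y)).
    // The checks below also look through an fp_extend, fp_round, or fmul to
    // find the fsqrt.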
14112 if (N1.getOpcode() == ISD::FSQRT) { 14113 if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) 14114 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); 14115 } else if (N1.getOpcode() == ISD::FP_EXTEND && 14116 N1.getOperand(0).getOpcode() == ISD::FSQRT) { 14117 if (SDValue RV = 14118 buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { 14119 RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); 14120 AddToWorklist(RV.getNode()); 14121 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); 14122 } 14123 } else if (N1.getOpcode() == ISD::FP_ROUND && 14124 N1.getOperand(0).getOpcode() == ISD::FSQRT) { 14125 if (SDValue RV = 14126 buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { 14127 RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); 14128 AddToWorklist(RV.getNode()); 14129 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); 14130 } 14131 } else if (N1.getOpcode() == ISD::FMUL) { 14132 // Look through an FMUL. Even though this won't remove the FDIV directly, 14133 // it's still worthwhile to get rid of the FSQRT if possible. 14134 SDValue Sqrt, Y; 14135 if (N1.getOperand(0).getOpcode() == ISD::FSQRT) { 14136 Sqrt = N1.getOperand(0); 14137 Y = N1.getOperand(1); 14138 } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) { 14139 Sqrt = N1.getOperand(1); 14140 Y = N1.getOperand(0); 14141 } 14142 if (Sqrt.getNode()) { 14143 // If the other multiply operand is known positive, pull it into the 14144 // sqrt. That will eliminate the division if we convert to an estimate. 14145 if (Flags.hasAllowReassociation() && N1.hasOneUse() && 14146 N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse()) { 14147 SDValue A; 14148 if (Y.getOpcode() == ISD::FABS && Y.hasOneUse()) 14149 A = Y.getOperand(0); 14150 else if (Y == Sqrt.getOperand(0)) 14151 A = Y; 14152 if (A) { 14153 // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z) 14154 // X / (A * sqrt(A)) --> X / sqrt(A*A*A) --> X * rsqrt(A*A*A) 14155 SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, A, A); 14156 SDValue AAZ = 14157 DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0)); 14158 if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags)) 14159 return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt); 14160 14161 // Estimate creation failed. Clean up speculatively created nodes. 14162 recursivelyDeleteUnusedNodes(AAZ.getNode()); 14163 } 14164 } 14165 14166 // We found a FSQRT, so try to make this fold: 14167 // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y) 14168 if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) { 14169 SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y); 14170 AddToWorklist(Div.getNode()); 14171 return DAG.getNode(ISD::FMUL, DL, VT, N0, Div); 14172 } 14173 } 14174 } 14175 14176 // Fold into a reciprocal estimate and multiply instead of a real divide. 
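    // That is, (fdiv X, Y) becomes (fmul X, (reciprocal estimate of Y)) when
    // the target provides a usable estimate.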
14177 if (Options.NoInfsFPMath || Flags.hasNoInfs()) 14178 if (SDValue RV = BuildDivEstimate(N0, N1, Flags)) 14179 return RV; 14180 } 14181 14182 // Fold X/Sqrt(X) -> Sqrt(X) 14183 if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && 14184 (Options.UnsafeFPMath || Flags.hasAllowReassociation())) 14185 if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0)) 14186 return N1; 14187 14188 // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) 14189 TargetLowering::NegatibleCost CostN0 = 14190 TargetLowering::NegatibleCost::Expensive; 14191 TargetLowering::NegatibleCost CostN1 = 14192 TargetLowering::NegatibleCost::Expensive; 14193 SDValue NegN0 = 14194 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0); 14195 SDValue NegN1 = 14196 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1); 14197 if (NegN0 && NegN1 && 14198 (CostN0 == TargetLowering::NegatibleCost::Cheaper || 14199 CostN1 == TargetLowering::NegatibleCost::Cheaper)) 14200 return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1); 14201 14202 return SDValue(); 14203 } 14204 14205 SDValue DAGCombiner::visitFREM(SDNode *N) { 14206 SDValue N0 = N->getOperand(0); 14207 SDValue N1 = N->getOperand(1); 14208 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 14209 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); 14210 EVT VT = N->getValueType(0); 14211 SDNodeFlags Flags = N->getFlags(); 14212 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 14213 14214 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) 14215 return R; 14216 14217 // fold (frem c1, c2) -> fmod(c1,c2) 14218 if (N0CFP && N1CFP) 14219 return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1); 14220 14221 if (SDValue NewSel = foldBinOpIntoSelect(N)) 14222 return NewSel; 14223 14224 return SDValue(); 14225 } 14226 14227 SDValue DAGCombiner::visitFSQRT(SDNode *N) { 14228 SDNodeFlags Flags = N->getFlags(); 14229 const TargetOptions &Options = DAG.getTarget().Options; 14230 14231 // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as: 14232 // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN 14233 if (!Flags.hasApproximateFuncs() || 14234 (!Options.NoInfsFPMath && !Flags.hasNoInfs())) 14235 return SDValue(); 14236 14237 SDValue N0 = N->getOperand(0); 14238 if (TLI.isFsqrtCheap(N0, DAG)) 14239 return SDValue(); 14240 14241 // FSQRT nodes have flags that propagate to the created nodes. 14242 // TODO: If this is N0/sqrt(N0), and we reach this node before trying to 14243 // transform the fdiv, we may produce a sub-optimal estimate sequence 14244 // because the reciprocal calculation may not have to filter out a 14245 // 0.0 input. 14246 return buildSqrtEstimate(N0, Flags); 14247 } 14248 14249 /// copysign(x, fp_extend(y)) -> copysign(x, y) 14250 /// copysign(x, fp_round(y)) -> copysign(x, y) 14251 static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { 14252 SDValue N1 = N->getOperand(1); 14253 if ((N1.getOpcode() == ISD::FP_EXTEND || 14254 N1.getOpcode() == ISD::FP_ROUND)) { 14255 EVT N1VT = N1->getValueType(0); 14256 EVT N1Op0VT = N1->getOperand(0).getValueType(); 14257 14258 // Always fold no-op FP casts. 14259 if (N1VT == N1Op0VT) 14260 return true; 14261 14262 // Do not optimize out type conversion of f128 type yet. 14263 // For some targets like x86_64, configuration is changed to keep one f128 14264 // value in one SSE register, but instruction selection cannot handle 14265 // FCOPYSIGN on SSE registers yet. 
14266 if (N1Op0VT == MVT::f128) 14267 return false; 14268 14269 // Avoid mismatched vector operand types, for better instruction selection. 14270 if (N1Op0VT.isVector()) 14271 return false; 14272 14273 return true; 14274 } 14275 return false; 14276 } 14277 14278 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { 14279 SDValue N0 = N->getOperand(0); 14280 SDValue N1 = N->getOperand(1); 14281 bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); 14282 bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); 14283 EVT VT = N->getValueType(0); 14284 14285 if (N0CFP && N1CFP) // Constant fold 14286 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1); 14287 14288 if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) { 14289 const APFloat &V = N1C->getValueAPF(); 14290 // copysign(x, c1) -> fabs(x) iff ispos(c1) 14291 // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1) 14292 if (!V.isNegative()) { 14293 if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)) 14294 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); 14295 } else { 14296 if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) 14297 return DAG.getNode(ISD::FNEG, SDLoc(N), VT, 14298 DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0)); 14299 } 14300 } 14301 14302 // copysign(fabs(x), y) -> copysign(x, y) 14303 // copysign(fneg(x), y) -> copysign(x, y) 14304 // copysign(copysign(x,z), y) -> copysign(x, y) 14305 if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG || 14306 N0.getOpcode() == ISD::FCOPYSIGN) 14307 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1); 14308 14309 // copysign(x, abs(y)) -> abs(x) 14310 if (N1.getOpcode() == ISD::FABS) 14311 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); 14312 14313 // copysign(x, copysign(y,z)) -> copysign(x, z) 14314 if (N1.getOpcode() == ISD::FCOPYSIGN) 14315 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1)); 14316 14317 // copysign(x, fp_extend(y)) -> copysign(x, y) 14318 // copysign(x, fp_round(y)) -> copysign(x, y) 14319 if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) 14320 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); 14321 14322 return SDValue(); 14323 } 14324 14325 SDValue DAGCombiner::visitFPOW(SDNode *N) { 14326 ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1)); 14327 if (!ExponentC) 14328 return SDValue(); 14329 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 14330 14331 // Try to convert x ** (1/3) into cube root. 14332 // TODO: Handle the various flavors of long double. 14333 // TODO: Since we're approximating, we don't need an exact 1/3 exponent. 14334 // Some range near 1/3 should be fine. 14335 EVT VT = N->getValueType(0); 14336 if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) || 14337 (VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) { 14338 // pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0. 14339 // pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf. 14340 // pow(-val, 1/3) = nan; cbrt(-val) = -num. 14341 // For regular numbers, rounding may cause the results to differ. 14342 // Therefore, we require { nsz ninf nnan afn } for this transform. 14343 // TODO: We could select out the special cases if we don't have nsz/ninf. 
14344 SDNodeFlags Flags = N->getFlags(); 14345 if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() || 14346 !Flags.hasApproximateFuncs()) 14347 return SDValue(); 14348 14349 // Do not create a cbrt() libcall if the target does not have it, and do not 14350 // turn a pow that has lowering support into a cbrt() libcall. 14351 if (!DAG.getLibInfo().has(LibFunc_cbrt) || 14352 (!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) && 14353 DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT))) 14354 return SDValue(); 14355 14356 return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0)); 14357 } 14358 14359 // Try to convert x ** (1/4) and x ** (3/4) into square roots. 14360 // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case. 14361 // TODO: This could be extended (using a target hook) to handle smaller 14362 // power-of-2 fractional exponents. 14363 bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25); 14364 bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75); 14365 if (ExponentIs025 || ExponentIs075) { 14366 // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0. 14367 // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN. 14368 // pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0. 14369 // pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) = NaN. 14370 // For regular numbers, rounding may cause the results to differ. 14371 // Therefore, we require { nsz ninf afn } for this transform. 14372 // TODO: We could select out the special cases if we don't have nsz/ninf. 14373 SDNodeFlags Flags = N->getFlags(); 14374 14375 // We only need no signed zeros for the 0.25 case. 14376 if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() || 14377 !Flags.hasApproximateFuncs()) 14378 return SDValue(); 14379 14380 // Don't double the number of libcalls. We are trying to inline fast code. 14381 if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT)) 14382 return SDValue(); 14383 14384 // Assume that libcalls are the smallest code. 14385 // TODO: This restriction should probably be lifted for vectors. 14386 if (ForCodeSize) 14387 return SDValue(); 14388 14389 // pow(X, 0.25) --> sqrt(sqrt(X)) 14390 SDLoc DL(N); 14391 SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0)); 14392 SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt); 14393 if (ExponentIs025) 14394 return SqrtSqrt; 14395 // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X)) 14396 return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt); 14397 } 14398 14399 return SDValue(); 14400 } 14401 14402 static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, 14403 const TargetLowering &TLI) { 14404 // This optimization is guarded by a function attribute because it may produce 14405 // unexpected results. Ie, programs may be relying on the platform-specific 14406 // undefined behavior when the float-to-int conversion overflows. 14407 const Function &F = DAG.getMachineFunction().getFunction(); 14408 Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow"); 14409 if (StrictOverflow.getValueAsString().equals("false")) 14410 return SDValue(); 14411 14412 // We only do this if the target has legal ftrunc. Otherwise, we'd likely be 14413 // replacing casts with a libcall. We also must be allowed to ignore -0.0 14414 // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer 14415 // conversions would return +0.0. 14416 // FIXME: We should be able to use node-level FMF here. 
14417 // TODO: If strict math, should we use FABS (+ range check for signed cast)? 14418 EVT VT = N->getValueType(0); 14419 if (!TLI.isOperationLegal(ISD::FTRUNC, VT) || 14420 !DAG.getTarget().Options.NoSignedZerosFPMath) 14421 return SDValue(); 14422 14423 // fptosi/fptoui round towards zero, so converting from FP to integer and 14424 // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X 14425 SDValue N0 = N->getOperand(0); 14426 if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT && 14427 N0.getOperand(0).getValueType() == VT) 14428 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); 14429 14430 if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && 14431 N0.getOperand(0).getValueType() == VT) 14432 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); 14433 14434 return SDValue(); 14435 } 14436 14437 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { 14438 SDValue N0 = N->getOperand(0); 14439 EVT VT = N->getValueType(0); 14440 EVT OpVT = N0.getValueType(); 14441 14442 // [us]itofp(undef) = 0, because the result value is bounded. 14443 if (N0.isUndef()) 14444 return DAG.getConstantFP(0.0, SDLoc(N), VT); 14445 14446 // fold (sint_to_fp c1) -> c1fp 14447 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 14448 // ...but only if the target supports immediate floating-point values 14449 (!LegalOperations || 14450 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) 14451 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); 14452 14453 // If the input is a legal type, and SINT_TO_FP is not legal on this target, 14454 // but UINT_TO_FP is legal on this target, try to convert. 14455 if (!hasOperation(ISD::SINT_TO_FP, OpVT) && 14456 hasOperation(ISD::UINT_TO_FP, OpVT)) { 14457 // If the sign bit is known to be zero, we can change this to UINT_TO_FP. 14458 if (DAG.SignBitIsZero(N0)) 14459 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); 14460 } 14461 14462 // The next optimizations are desirable only if SELECT_CC can be lowered. 14463 // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0) 14464 if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && 14465 !VT.isVector() && 14466 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { 14467 SDLoc DL(N); 14468 return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT), 14469 DAG.getConstantFP(0.0, DL, VT)); 14470 } 14471 14472 // fold (sint_to_fp (zext (setcc x, y, cc))) -> 14473 // (select (setcc x, y, cc), 1.0, 0.0) 14474 if (N0.getOpcode() == ISD::ZERO_EXTEND && 14475 N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() && 14476 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { 14477 SDLoc DL(N); 14478 return DAG.getSelect(DL, VT, N0.getOperand(0), 14479 DAG.getConstantFP(1.0, DL, VT), 14480 DAG.getConstantFP(0.0, DL, VT)); 14481 } 14482 14483 if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) 14484 return FTrunc; 14485 14486 return SDValue(); 14487 } 14488 14489 SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { 14490 SDValue N0 = N->getOperand(0); 14491 EVT VT = N->getValueType(0); 14492 EVT OpVT = N0.getValueType(); 14493 14494 // [us]itofp(undef) = 0, because the result value is bounded. 
14495 if (N0.isUndef()) 14496 return DAG.getConstantFP(0.0, SDLoc(N), VT); 14497 14498 // fold (uint_to_fp c1) -> c1fp 14499 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 14500 // ...but only if the target supports immediate floating-point values 14501 (!LegalOperations || 14502 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) 14503 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); 14504 14505 // If the input is a legal type, and UINT_TO_FP is not legal on this target, 14506 // but SINT_TO_FP is legal on this target, try to convert. 14507 if (!hasOperation(ISD::UINT_TO_FP, OpVT) && 14508 hasOperation(ISD::SINT_TO_FP, OpVT)) { 14509 // If the sign bit is known to be zero, we can change this to SINT_TO_FP. 14510 if (DAG.SignBitIsZero(N0)) 14511 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); 14512 } 14513 14514 // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0) 14515 if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && 14516 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { 14517 SDLoc DL(N); 14518 return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT), 14519 DAG.getConstantFP(0.0, DL, VT)); 14520 } 14521 14522 if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) 14523 return FTrunc; 14524 14525 return SDValue(); 14526 } 14527 14528 // Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x 14529 static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { 14530 SDValue N0 = N->getOperand(0); 14531 EVT VT = N->getValueType(0); 14532 14533 if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP) 14534 return SDValue(); 14535 14536 SDValue Src = N0.getOperand(0); 14537 EVT SrcVT = Src.getValueType(); 14538 bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP; 14539 bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT; 14540 14541 // We can safely assume the conversion won't overflow the output range, 14542 // because (for example) (uint8_t)18293.f is undefined behavior. 14543 14544 // Since we can assume the conversion won't overflow, our decision as to 14545 // whether the input will fit in the float should depend on the minimum 14546 // of the input range and output range. 14547 14548 // This means this is also safe for a signed input and unsigned output, since 14549 // a negative input would lead to undefined behavior. 14550 unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; 14551 unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; 14552 unsigned ActualSize = std::min(InputSize, OutputSize); 14553 const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); 14554 14555 // We can only fold away the float conversion if the input range can be 14556 // represented exactly in the float range. 14557 if (APFloat::semanticsPrecision(sem) >= ActualSize) { 14558 if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) { 14559 unsigned ExtOp = IsInputSigned && IsOutputSigned ? 
ISD::SIGN_EXTEND 14560 : ISD::ZERO_EXTEND; 14561 return DAG.getNode(ExtOp, SDLoc(N), VT, Src); 14562 } 14563 if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits()) 14564 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src); 14565 return DAG.getBitcast(VT, Src); 14566 } 14567 return SDValue(); 14568 } 14569 14570 SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { 14571 SDValue N0 = N->getOperand(0); 14572 EVT VT = N->getValueType(0); 14573 14574 // fold (fp_to_sint undef) -> undef 14575 if (N0.isUndef()) 14576 return DAG.getUNDEF(VT); 14577 14578 // fold (fp_to_sint c1fp) -> c1 14579 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14580 return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); 14581 14582 return FoldIntToFPToInt(N, DAG); 14583 } 14584 14585 SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { 14586 SDValue N0 = N->getOperand(0); 14587 EVT VT = N->getValueType(0); 14588 14589 // fold (fp_to_uint undef) -> undef 14590 if (N0.isUndef()) 14591 return DAG.getUNDEF(VT); 14592 14593 // fold (fp_to_uint c1fp) -> c1 14594 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14595 return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); 14596 14597 return FoldIntToFPToInt(N, DAG); 14598 } 14599 14600 SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { 14601 SDValue N0 = N->getOperand(0); 14602 SDValue N1 = N->getOperand(1); 14603 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 14604 EVT VT = N->getValueType(0); 14605 14606 // fold (fp_round c1fp) -> c1fp 14607 if (N0CFP) 14608 return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1); 14609 14610 // fold (fp_round (fp_extend x)) -> x 14611 if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType()) 14612 return N0.getOperand(0); 14613 14614 // fold (fp_round (fp_round x)) -> (fp_round x) 14615 if (N0.getOpcode() == ISD::FP_ROUND) { 14616 const bool NIsTrunc = N->getConstantOperandVal(1) == 1; 14617 const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1; 14618 14619 // Skip this folding if it results in an fp_round from f80 to f16. 14620 // 14621 // f80 to f16 always generates an expensive (and as yet, unimplemented) 14622 // libcall to __truncxfhf2 instead of selecting native f16 conversion 14623 // instructions from f32 or f64. Moreover, the first (value-preserving) 14624 // fp_round from f80 to either f32 or f64 may become a NOP in platforms like 14625 // x86. 14626 if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16) 14627 return SDValue(); 14628 14629 // If the first fp_round isn't a value preserving truncation, it might 14630 // introduce a tie in the second fp_round, that wouldn't occur in the 14631 // single-step fp_round we want to fold to. 14632 // In other words, double rounding isn't the same as rounding. 14633 // Also, this is a value preserving truncation iff both fp_round's are. 
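    // For example, rounding f80 -> f64 -> f32 can give a different result than
    // rounding f80 -> f32 directly when the intermediate f64 value lies exactly
    // halfway between two adjacent f32 values.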
14634 if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) { 14635 SDLoc DL(N); 14636 return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0), 14637 DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL)); 14638 } 14639 } 14640 14641 // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) 14642 if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { 14643 SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, 14644 N0.getOperand(0), N1); 14645 AddToWorklist(Tmp.getNode()); 14646 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, 14647 Tmp, N0.getOperand(1)); 14648 } 14649 14650 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) 14651 return NewVSel; 14652 14653 return SDValue(); 14654 } 14655 14656 SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { 14657 SDValue N0 = N->getOperand(0); 14658 EVT VT = N->getValueType(0); 14659 14660 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. 14661 if (N->hasOneUse() && 14662 N->use_begin()->getOpcode() == ISD::FP_ROUND) 14663 return SDValue(); 14664 14665 // fold (fp_extend c1fp) -> c1fp 14666 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14667 return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0); 14668 14669 // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op) 14670 if (N0.getOpcode() == ISD::FP16_TO_FP && 14671 TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal) 14672 return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0)); 14673 14674 // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the 14675 // value of X. 14676 if (N0.getOpcode() == ISD::FP_ROUND 14677 && N0.getConstantOperandVal(1) == 1) { 14678 SDValue In = N0.getOperand(0); 14679 if (In.getValueType() == VT) return In; 14680 if (VT.bitsLT(In.getValueType())) 14681 return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, 14682 In, N0.getOperand(1)); 14683 return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In); 14684 } 14685 14686 // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) 14687 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && 14688 TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { 14689 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 14690 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, 14691 LN0->getChain(), 14692 LN0->getBasePtr(), N0.getValueType(), 14693 LN0->getMemOperand()); 14694 CombineTo(N, ExtLoad); 14695 CombineTo(N0.getNode(), 14696 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), 14697 N0.getValueType(), ExtLoad, 14698 DAG.getIntPtrConstant(1, SDLoc(N0))), 14699 ExtLoad.getValue(1)); 14700 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
14701 } 14702 14703 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) 14704 return NewVSel; 14705 14706 return SDValue(); 14707 } 14708 14709 SDValue DAGCombiner::visitFCEIL(SDNode *N) { 14710 SDValue N0 = N->getOperand(0); 14711 EVT VT = N->getValueType(0); 14712 14713 // fold (fceil c1) -> fceil(c1) 14714 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14715 return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); 14716 14717 return SDValue(); 14718 } 14719 14720 SDValue DAGCombiner::visitFTRUNC(SDNode *N) { 14721 SDValue N0 = N->getOperand(0); 14722 EVT VT = N->getValueType(0); 14723 14724 // fold (ftrunc c1) -> ftrunc(c1) 14725 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14726 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); 14727 14728 // fold ftrunc (known rounded int x) -> x 14729 // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is 14730 // likely to be generated to extract integer from a rounded floating value. 14731 switch (N0.getOpcode()) { 14732 default: break; 14733 case ISD::FRINT: 14734 case ISD::FTRUNC: 14735 case ISD::FNEARBYINT: 14736 case ISD::FFLOOR: 14737 case ISD::FCEIL: 14738 return N0; 14739 } 14740 14741 return SDValue(); 14742 } 14743 14744 SDValue DAGCombiner::visitFFLOOR(SDNode *N) { 14745 SDValue N0 = N->getOperand(0); 14746 EVT VT = N->getValueType(0); 14747 14748 // fold (ffloor c1) -> ffloor(c1) 14749 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14750 return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); 14751 14752 return SDValue(); 14753 } 14754 14755 SDValue DAGCombiner::visitFNEG(SDNode *N) { 14756 SDValue N0 = N->getOperand(0); 14757 EVT VT = N->getValueType(0); 14758 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 14759 14760 // Constant fold FNEG. 14761 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14762 return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); 14763 14764 if (SDValue NegN0 = 14765 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize)) 14766 return NegN0; 14767 14768 // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 14769 // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't 14770 // know it was called from a context with a nsz flag if the input fsub does 14771 // not. 14772 if (N0.getOpcode() == ISD::FSUB && 14773 (DAG.getTarget().Options.NoSignedZerosFPMath || 14774 N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) { 14775 return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1), 14776 N0.getOperand(0)); 14777 } 14778 14779 if (SDValue Cast = foldSignChangeInBitcast(N)) 14780 return Cast; 14781 14782 return SDValue(); 14783 } 14784 14785 static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, 14786 APFloat (*Op)(const APFloat &, const APFloat &)) { 14787 SDValue N0 = N->getOperand(0); 14788 SDValue N1 = N->getOperand(1); 14789 EVT VT = N->getValueType(0); 14790 const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); 14791 const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); 14792 const SDNodeFlags Flags = N->getFlags(); 14793 unsigned Opc = N->getOpcode(); 14794 bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; 14795 bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; 14796 SelectionDAG::FlagInserter FlagsInserter(DAG, N); 14797 14798 if (N0CFP && N1CFP) { 14799 const APFloat &C0 = N0CFP->getValueAPF(); 14800 const APFloat &C1 = N1CFP->getValueAPF(); 14801 return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT); 14802 } 14803 14804 // Canonicalize to constant on RHS. 
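  // e.g. (fminnum C, X) -> (fminnum X, C), so the folds below only need to
  // look for a constant in operand 1.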
14805 if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && 14806 !DAG.isConstantFPBuildVectorOrConstantFP(N1)) 14807 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); 14808 14809 if (N1CFP) { 14810 const APFloat &AF = N1CFP->getValueAPF(); 14811 14812 // minnum(X, nan) -> X 14813 // maxnum(X, nan) -> X 14814 // minimum(X, nan) -> nan 14815 // maximum(X, nan) -> nan 14816 if (AF.isNaN()) 14817 return PropagatesNaN ? N->getOperand(1) : N->getOperand(0); 14818 14819 // In the following folds, inf can be replaced with the largest finite 14820 // float, if the ninf flag is set. 14821 if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) { 14822 // minnum(X, -inf) -> -inf 14823 // maxnum(X, +inf) -> +inf 14824 // minimum(X, -inf) -> -inf if nnan 14825 // maximum(X, +inf) -> +inf if nnan 14826 if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs())) 14827 return N->getOperand(1); 14828 14829 // minnum(X, +inf) -> X if nnan 14830 // maxnum(X, -inf) -> X if nnan 14831 // minimum(X, +inf) -> X 14832 // maximum(X, -inf) -> X 14833 if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs())) 14834 return N->getOperand(0); 14835 } 14836 } 14837 14838 return SDValue(); 14839 } 14840 14841 SDValue DAGCombiner::visitFMINNUM(SDNode *N) { 14842 return visitFMinMax(DAG, N, minnum); 14843 } 14844 14845 SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { 14846 return visitFMinMax(DAG, N, maxnum); 14847 } 14848 14849 SDValue DAGCombiner::visitFMINIMUM(SDNode *N) { 14850 return visitFMinMax(DAG, N, minimum); 14851 } 14852 14853 SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) { 14854 return visitFMinMax(DAG, N, maximum); 14855 } 14856 14857 SDValue DAGCombiner::visitFABS(SDNode *N) { 14858 SDValue N0 = N->getOperand(0); 14859 EVT VT = N->getValueType(0); 14860 14861 // fold (fabs c1) -> fabs(c1) 14862 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) 14863 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); 14864 14865 // fold (fabs (fabs x)) -> (fabs x) 14866 if (N0.getOpcode() == ISD::FABS) 14867 return N->getOperand(0); 14868 14869 // fold (fabs (fneg x)) -> (fabs x) 14870 // fold (fabs (fcopysign x, y)) -> (fabs x) 14871 if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) 14872 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0)); 14873 14874 if (SDValue Cast = foldSignChangeInBitcast(N)) 14875 return Cast; 14876 14877 return SDValue(); 14878 } 14879 14880 SDValue DAGCombiner::visitBRCOND(SDNode *N) { 14881 SDValue Chain = N->getOperand(0); 14882 SDValue N1 = N->getOperand(1); 14883 SDValue N2 = N->getOperand(2); 14884 14885 // BRCOND(FREEZE(cond)) is equivalent to BRCOND(cond) (both are 14886 // nondeterministic jumps). 14887 if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse()) { 14888 return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, 14889 N1->getOperand(0), N2); 14890 } 14891 14892 // If N is a constant we could fold this into a fallthrough or unconditional 14893 // branch. However that doesn't happen very often in normal code, because 14894 // Instcombine/SimplifyCFG should have handled the available opportunities. 14895 // If we did this folding here, it would be necessary to update the 14896 // MachineBasicBlock CFG, which is awkward. 14897 14898 // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal 14899 // on the target. 
14900 if (N1.getOpcode() == ISD::SETCC &&
14901 TLI.isOperationLegalOrCustom(ISD::BR_CC,
14902 N1.getOperand(0).getValueType())) {
14903 return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
14904 Chain, N1.getOperand(2),
14905 N1.getOperand(0), N1.getOperand(1), N2);
14906 }
14907 
14908 if (N1.hasOneUse()) {
14909 // rebuildSetCC calls visitXor which may change the Chain when there is a
14910 // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
14911 HandleSDNode ChainHandle(Chain);
14912 if (SDValue NewN1 = rebuildSetCC(N1))
14913 return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
14914 ChainHandle.getValue(), NewN1, N2);
14915 }
14916 
14917 return SDValue();
14918 }
14919 
14920 SDValue DAGCombiner::rebuildSetCC(SDValue N) {
14921 if (N.getOpcode() == ISD::SRL ||
14922 (N.getOpcode() == ISD::TRUNCATE &&
14923 (N.getOperand(0).hasOneUse() &&
14924 N.getOperand(0).getOpcode() == ISD::SRL))) {
14925 // Look past the truncate.
14926 if (N.getOpcode() == ISD::TRUNCATE)
14927 N = N.getOperand(0);
14928 
14929 // Match this pattern so that we can generate simpler code:
14930 //
14931 // %a = ...
14932 // %b = and i32 %a, 2
14933 // %c = srl i32 %b, 1
14934 // brcond i32 %c ...
14935 //
14936 // into
14937 //
14938 // %a = ...
14939 // %b = and i32 %a, 2
14940 // %c = setcc ne %b, 0
14941 // brcond %c ...
14942 //
14943 // This applies only when the AND constant value has one bit set and the
14944 // SRL constant is equal to the log2 of the AND constant. The back-end is
14945 // smart enough to convert the result into a TEST/JMP sequence.
14946 SDValue Op0 = N.getOperand(0);
14947 SDValue Op1 = N.getOperand(1);
14948 
14949 if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
14950 SDValue AndOp1 = Op0.getOperand(1);
14951 
14952 if (AndOp1.getOpcode() == ISD::Constant) {
14953 const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
14954 
14955 if (AndConst.isPowerOf2() &&
14956 cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
14957 SDLoc DL(N);
14958 return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
14959 Op0, DAG.getConstant(0, DL, Op0.getValueType()),
14960 ISD::SETNE);
14961 }
14962 }
14963 }
14964 }
14965 
14966 // Transform (brcond (xor x, y)) -> (brcond (setcc x, y, ne))
14967 // Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
14968 if (N.getOpcode() == ISD::XOR) {
14969 // Because we may call this on a speculatively constructed
14970 // SimplifiedSetCC Node, we need to simplify this node first.
14971 // Ideally this should be folded into SimplifySetCC and not
14972 // here. For now, grab a handle to N so we don't lose it from
14973 // replacements internal to the visit.
14974 HandleSDNode XORHandle(N);
14975 while (N.getOpcode() == ISD::XOR) {
14976 SDValue Tmp = visitXOR(N.getNode());
14977 // No simplification done.
14978 if (!Tmp.getNode())
14979 break;
14980 // Returning N is a form of in-visit replacement that may invalidate
14981 // N. Grab the value from the handle.
14982 if (Tmp.getNode() == N.getNode())
14983 N = XORHandle.getValue();
14984 else // Node simplified. Try simplifying again.
14985 N = Tmp; 14986 } 14987 14988 if (N.getOpcode() != ISD::XOR) 14989 return N; 14990 14991 SDValue Op0 = N->getOperand(0); 14992 SDValue Op1 = N->getOperand(1); 14993 14994 if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) { 14995 bool Equal = false; 14996 // (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq)) 14997 if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR && 14998 Op0.getValueType() == MVT::i1) { 14999 N = Op0; 15000 Op0 = N->getOperand(0); 15001 Op1 = N->getOperand(1); 15002 Equal = true; 15003 } 15004 15005 EVT SetCCVT = N.getValueType(); 15006 if (LegalTypes) 15007 SetCCVT = getSetCCResultType(SetCCVT); 15008 // Replace the uses of XOR with SETCC 15009 return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1, 15010 Equal ? ISD::SETEQ : ISD::SETNE); 15011 } 15012 } 15013 15014 return SDValue(); 15015 } 15016 15017 // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB. 15018 // 15019 SDValue DAGCombiner::visitBR_CC(SDNode *N) { 15020 CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1)); 15021 SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3); 15022 15023 // If N is a constant we could fold this into a fallthrough or unconditional 15024 // branch. However that doesn't happen very often in normal code, because 15025 // Instcombine/SimplifyCFG should have handled the available opportunities. 15026 // If we did this folding here, it would be necessary to update the 15027 // MachineBasicBlock CFG, which is awkward. 15028 15029 // Use SimplifySetCC to simplify SETCC's. 15030 SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()), 15031 CondLHS, CondRHS, CC->get(), SDLoc(N), 15032 false); 15033 if (Simp.getNode()) AddToWorklist(Simp.getNode()); 15034 15035 // fold to a simpler setcc 15036 if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC) 15037 return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, 15038 N->getOperand(0), Simp.getOperand(2), 15039 Simp.getOperand(0), Simp.getOperand(1), 15040 N->getOperand(4)); 15041 15042 return SDValue(); 15043 } 15044 15045 static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec, 15046 bool &IsLoad, bool &IsMasked, SDValue &Ptr, 15047 const TargetLowering &TLI) { 15048 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 15049 if (LD->isIndexed()) 15050 return false; 15051 EVT VT = LD->getMemoryVT(); 15052 if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT)) 15053 return false; 15054 Ptr = LD->getBasePtr(); 15055 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 15056 if (ST->isIndexed()) 15057 return false; 15058 EVT VT = ST->getMemoryVT(); 15059 if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT)) 15060 return false; 15061 Ptr = ST->getBasePtr(); 15062 IsLoad = false; 15063 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 15064 if (LD->isIndexed()) 15065 return false; 15066 EVT VT = LD->getMemoryVT(); 15067 if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) && 15068 !TLI.isIndexedMaskedLoadLegal(Dec, VT)) 15069 return false; 15070 Ptr = LD->getBasePtr(); 15071 IsMasked = true; 15072 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 15073 if (ST->isIndexed()) 15074 return false; 15075 EVT VT = ST->getMemoryVT(); 15076 if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) && 15077 !TLI.isIndexedMaskedStoreLegal(Dec, VT)) 15078 return false; 15079 Ptr = ST->getBasePtr(); 15080 IsLoad = false; 15081 IsMasked = true; 15082 } else { 15083 return false; 15084 } 15085 return true; 
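  // For example, an unindexed i32 store, on a target that reports the
  // requested indexed store form as legal for i32, is accepted here with
  // IsLoad = false, IsMasked = false and Ptr set to the store's base pointer;
  // masked loads and stores additionally set IsMasked = true.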
15086 } 15087 15088 /// Try turning a load/store into a pre-indexed load/store when the base 15089 /// pointer is an add or subtract and it has other uses besides the load/store. 15090 /// After the transformation, the new indexed load/store has effectively folded 15091 /// the add/subtract in and all of its other uses are redirected to the 15092 /// new load/store. 15093 bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { 15094 if (Level < AfterLegalizeDAG) 15095 return false; 15096 15097 bool IsLoad = true; 15098 bool IsMasked = false; 15099 SDValue Ptr; 15100 if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked, 15101 Ptr, TLI)) 15102 return false; 15103 15104 // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail 15105 // out. There is no reason to make this a preinc/predec. 15106 if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) || 15107 Ptr.getNode()->hasOneUse()) 15108 return false; 15109 15110 // Ask the target to do addressing mode selection. 15111 SDValue BasePtr; 15112 SDValue Offset; 15113 ISD::MemIndexedMode AM = ISD::UNINDEXED; 15114 if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG)) 15115 return false; 15116 15117 // Backends without true r+i pre-indexed forms may need to pass a 15118 // constant base with a variable offset so that constant coercion 15119 // will work with the patterns in canonical form. 15120 bool Swapped = false; 15121 if (isa<ConstantSDNode>(BasePtr)) { 15122 std::swap(BasePtr, Offset); 15123 Swapped = true; 15124 } 15125 15126 // Don't create a indexed load / store with zero offset. 15127 if (isNullConstant(Offset)) 15128 return false; 15129 15130 // Try turning it into a pre-indexed load / store except when: 15131 // 1) The new base ptr is a frame index. 15132 // 2) If N is a store and the new base ptr is either the same as or is a 15133 // predecessor of the value being stored. 15134 // 3) Another use of old base ptr is a predecessor of N. If ptr is folded 15135 // that would create a cycle. 15136 // 4) All uses are load / store ops that use it as old base ptr. 15137 15138 // Check #1. Preinc'ing a frame index would require copying the stack pointer 15139 // (plus the implicit offset) to a register to preinc anyway. 15140 if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr)) 15141 return false; 15142 15143 // Check #2. 15144 if (!IsLoad) { 15145 SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue() 15146 : cast<StoreSDNode>(N)->getValue(); 15147 15148 // Would require a copy. 15149 if (Val == BasePtr) 15150 return false; 15151 15152 // Would create a cycle. 15153 if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode())) 15154 return false; 15155 } 15156 15157 // Caches for hasPredecessorHelper. 15158 SmallPtrSet<const SDNode *, 32> Visited; 15159 SmallVector<const SDNode *, 16> Worklist; 15160 Worklist.push_back(N); 15161 15162 // If the offset is a constant, there may be other adds of constants that 15163 // can be folded with this one. We should do this to avoid having to keep 15164 // a copy of the original base pointer. 15165 SmallVector<SDNode *, 16> OtherUses; 15166 if (isa<ConstantSDNode>(Offset)) 15167 for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(), 15168 UE = BasePtr.getNode()->use_end(); 15169 UI != UE; ++UI) { 15170 SDUse &Use = UI.getUse(); 15171 // Skip the use that is Ptr and uses of other results from BasePtr's 15172 // node (important for nodes that return multiple results). 
15173 if (Use.getUser() == Ptr.getNode() || Use != BasePtr) 15174 continue; 15175 15176 if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist)) 15177 continue; 15178 15179 if (Use.getUser()->getOpcode() != ISD::ADD && 15180 Use.getUser()->getOpcode() != ISD::SUB) { 15181 OtherUses.clear(); 15182 break; 15183 } 15184 15185 SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1); 15186 if (!isa<ConstantSDNode>(Op1)) { 15187 OtherUses.clear(); 15188 break; 15189 } 15190 15191 // FIXME: In some cases, we can be smarter about this. 15192 if (Op1.getValueType() != Offset.getValueType()) { 15193 OtherUses.clear(); 15194 break; 15195 } 15196 15197 OtherUses.push_back(Use.getUser()); 15198 } 15199 15200 if (Swapped) 15201 std::swap(BasePtr, Offset); 15202 15203 // Now check for #3 and #4. 15204 bool RealUse = false; 15205 15206 for (SDNode *Use : Ptr.getNode()->uses()) { 15207 if (Use == N) 15208 continue; 15209 if (SDNode::hasPredecessorHelper(Use, Visited, Worklist)) 15210 return false; 15211 15212 // If Ptr may be folded in addressing mode of other use, then it's 15213 // not profitable to do this transformation. 15214 if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI)) 15215 RealUse = true; 15216 } 15217 15218 if (!RealUse) 15219 return false; 15220 15221 SDValue Result; 15222 if (!IsMasked) { 15223 if (IsLoad) 15224 Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); 15225 else 15226 Result = 15227 DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); 15228 } else { 15229 if (IsLoad) 15230 Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr, 15231 Offset, AM); 15232 else 15233 Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr, 15234 Offset, AM); 15235 } 15236 ++PreIndexedNodes; 15237 ++NodesCombined; 15238 LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; 15239 Result.getNode()->dump(&DAG); dbgs() << '\n'); 15240 WorklistRemover DeadNodes(*this); 15241 if (IsLoad) { 15242 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); 15243 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); 15244 } else { 15245 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); 15246 } 15247 15248 // Finally, since the node is now dead, remove it from the graph. 15249 deleteAndRecombine(N); 15250 15251 if (Swapped) 15252 std::swap(BasePtr, Offset); 15253 15254 // Replace other uses of BasePtr that can be updated to use Ptr 15255 for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) { 15256 unsigned OffsetIdx = 1; 15257 if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode()) 15258 OffsetIdx = 0; 15259 assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() == 15260 BasePtr.getNode() && "Expected BasePtr operand"); 15261 15262 // We need to replace ptr0 in the following expression: 15263 // x0 * offset0 + y0 * ptr0 = t0 15264 // knowing that 15265 // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store) 15266 // 15267 // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the 15268 // indexed load/store and the expression that needs to be re-written. 
15269 // 15270 // Therefore, we have: 15271 // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1 15272 15273 auto *CN = cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx)); 15274 const APInt &Offset0 = CN->getAPIntValue(); 15275 const APInt &Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue(); 15276 int X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1; 15277 int Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1; 15278 int X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1; 15279 int Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1; 15280 15281 unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD; 15282 15283 APInt CNV = Offset0; 15284 if (X0 < 0) CNV = -CNV; 15285 if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1; 15286 else CNV = CNV - Offset1; 15287 15288 SDLoc DL(OtherUses[i]); 15289 15290 // We can now generate the new expression. 15291 SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0)); 15292 SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0); 15293 15294 SDValue NewUse = DAG.getNode(Opcode, 15295 DL, 15296 OtherUses[i]->getValueType(0), NewOp1, NewOp2); 15297 DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); 15298 deleteAndRecombine(OtherUses[i]); 15299 } 15300 15301 // Replace the uses of Ptr with uses of the updated base value. 15302 DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0)); 15303 deleteAndRecombine(Ptr.getNode()); 15304 AddToWorklist(Result.getNode()); 15305 15306 return true; 15307 } 15308 15309 static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, 15310 SDValue &BasePtr, SDValue &Offset, 15311 ISD::MemIndexedMode &AM, 15312 SelectionDAG &DAG, 15313 const TargetLowering &TLI) { 15314 if (PtrUse == N || 15315 (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB)) 15316 return false; 15317 15318 if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG)) 15319 return false; 15320 15321 // Don't create a indexed load / store with zero offset. 15322 if (isNullConstant(Offset)) 15323 return false; 15324 15325 if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr)) 15326 return false; 15327 15328 SmallPtrSet<const SDNode *, 32> Visited; 15329 for (SDNode *Use : BasePtr.getNode()->uses()) { 15330 if (Use == Ptr.getNode()) 15331 continue; 15332 15333 // No if there's a later user which could perform the index instead. 15334 if (isa<MemSDNode>(Use)) { 15335 bool IsLoad = true; 15336 bool IsMasked = false; 15337 SDValue OtherPtr; 15338 if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad, 15339 IsMasked, OtherPtr, TLI)) { 15340 SmallVector<const SDNode *, 2> Worklist; 15341 Worklist.push_back(Use); 15342 if (SDNode::hasPredecessorHelper(N, Visited, Worklist)) 15343 return false; 15344 } 15345 } 15346 15347 // If all the uses are load / store addresses, then don't do the 15348 // transformation. 
15349 if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { 15350 for (SDNode *UseUse : Use->uses()) 15351 if (canFoldInAddressingMode(Use, UseUse, DAG, TLI)) 15352 return false; 15353 } 15354 } 15355 return true; 15356 } 15357 15358 static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, 15359 bool &IsMasked, SDValue &Ptr, 15360 SDValue &BasePtr, SDValue &Offset, 15361 ISD::MemIndexedMode &AM, 15362 SelectionDAG &DAG, 15363 const TargetLowering &TLI) { 15364 if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, 15365 IsMasked, Ptr, TLI) || 15366 Ptr.getNode()->hasOneUse()) 15367 return nullptr; 15368 15369 // Try turning it into a post-indexed load / store except when 15370 // 1) All uses are load / store ops that use it as base ptr (and 15371 // it may be folded as addressing mmode). 15372 // 2) Op must be independent of N, i.e. Op is neither a predecessor 15373 // nor a successor of N. Otherwise, if Op is folded that would 15374 // create a cycle. 15375 for (SDNode *Op : Ptr->uses()) { 15376 // Check for #1. 15377 if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI)) 15378 continue; 15379 15380 // Check for #2. 15381 SmallPtrSet<const SDNode *, 32> Visited; 15382 SmallVector<const SDNode *, 8> Worklist; 15383 // Ptr is predecessor to both N and Op. 15384 Visited.insert(Ptr.getNode()); 15385 Worklist.push_back(N); 15386 Worklist.push_back(Op); 15387 if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && 15388 !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) 15389 return Op; 15390 } 15391 return nullptr; 15392 } 15393 15394 /// Try to combine a load/store with a add/sub of the base pointer node into a 15395 /// post-indexed load/store. The transformation folded the add/subtract into the 15396 /// new indexed load/store effectively and all of its uses are redirected to the 15397 /// new load/store. 15398 bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { 15399 if (Level < AfterLegalizeDAG) 15400 return false; 15401 15402 bool IsLoad = true; 15403 bool IsMasked = false; 15404 SDValue Ptr; 15405 SDValue BasePtr; 15406 SDValue Offset; 15407 ISD::MemIndexedMode AM = ISD::UNINDEXED; 15408 SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr, 15409 Offset, AM, DAG, TLI); 15410 if (!Op) 15411 return false; 15412 15413 SDValue Result; 15414 if (!IsMasked) 15415 Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, 15416 Offset, AM) 15417 : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), 15418 BasePtr, Offset, AM); 15419 else 15420 Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), 15421 BasePtr, Offset, AM) 15422 : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), 15423 BasePtr, Offset, AM); 15424 ++PostIndexedNodes; 15425 ++NodesCombined; 15426 LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); 15427 dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); 15428 dbgs() << '\n'); 15429 WorklistRemover DeadNodes(*this); 15430 if (IsLoad) { 15431 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); 15432 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); 15433 } else { 15434 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); 15435 } 15436 15437 // Finally, since the node is now dead, remove it from the graph. 15438 deleteAndRecombine(N); 15439 15440 // Replace the uses of Use with uses of the updated base value. 15441 DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), 15442 Result.getValue(IsLoad ? 
1 : 0)); 15443 deleteAndRecombine(Op); 15444 return true; 15445 } 15446 15447 /// Return the base-pointer arithmetic from an indexed \p LD. 15448 SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { 15449 ISD::MemIndexedMode AM = LD->getAddressingMode(); 15450 assert(AM != ISD::UNINDEXED); 15451 SDValue BP = LD->getOperand(1); 15452 SDValue Inc = LD->getOperand(2); 15453 15454 // Some backends use TargetConstants for load offsets, but don't expect 15455 // TargetConstants in general ADD nodes. We can convert these constants into 15456 // regular Constants (if the constant is not opaque). 15457 assert((Inc.getOpcode() != ISD::TargetConstant || 15458 !cast<ConstantSDNode>(Inc)->isOpaque()) && 15459 "Cannot split out indexing using opaque target constants"); 15460 if (Inc.getOpcode() == ISD::TargetConstant) { 15461 ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc); 15462 Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc), 15463 ConstInc->getValueType(0)); 15464 } 15465 15466 unsigned Opc = 15467 (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB); 15468 return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); 15469 } 15470 15471 static inline ElementCount numVectorEltsOrZero(EVT T) { 15472 return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0); 15473 } 15474 15475 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) { 15476 Val = ST->getValue(); 15477 EVT STType = Val.getValueType(); 15478 EVT STMemType = ST->getMemoryVT(); 15479 if (STType == STMemType) 15480 return true; 15481 if (isTypeLegal(STMemType)) 15482 return false; // fail. 15483 if (STType.isFloatingPoint() && STMemType.isFloatingPoint() && 15484 TLI.isOperationLegal(ISD::FTRUNC, STMemType)) { 15485 Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val); 15486 return true; 15487 } 15488 if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) && 15489 STType.isInteger() && STMemType.isInteger()) { 15490 Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val); 15491 return true; 15492 } 15493 if (STType.getSizeInBits() == STMemType.getSizeInBits()) { 15494 Val = DAG.getBitcast(STMemType, Val); 15495 return true; 15496 } 15497 return false; // fail. 
15498 } 15499 15500 bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) { 15501 EVT LDMemType = LD->getMemoryVT(); 15502 EVT LDType = LD->getValueType(0); 15503 assert(Val.getValueType() == LDMemType && 15504 "Attempting to extend value of non-matching type"); 15505 if (LDType == LDMemType) 15506 return true; 15507 if (LDMemType.isInteger() && LDType.isInteger()) { 15508 switch (LD->getExtensionType()) { 15509 case ISD::NON_EXTLOAD: 15510 Val = DAG.getBitcast(LDType, Val); 15511 return true; 15512 case ISD::EXTLOAD: 15513 Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val); 15514 return true; 15515 case ISD::SEXTLOAD: 15516 Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val); 15517 return true; 15518 case ISD::ZEXTLOAD: 15519 Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val); 15520 return true; 15521 } 15522 } 15523 return false; 15524 } 15525 15526 SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { 15527 if (OptLevel == CodeGenOpt::None || !LD->isSimple()) 15528 return SDValue(); 15529 SDValue Chain = LD->getOperand(0); 15530 StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode()); 15531 // TODO: Relax this restriction for unordered atomics (see D66309) 15532 if (!ST || !ST->isSimple()) 15533 return SDValue(); 15534 15535 EVT LDType = LD->getValueType(0); 15536 EVT LDMemType = LD->getMemoryVT(); 15537 EVT STMemType = ST->getMemoryVT(); 15538 EVT STType = ST->getValue().getValueType(); 15539 15540 // There are two cases to consider here: 15541 // 1. The store is fixed width and the load is scalable. In this case we 15542 // don't know at compile time if the store completely envelops the load 15543 // so we abandon the optimisation. 15544 // 2. The store is scalable and the load is fixed width. We could 15545 // potentially support a limited number of cases here, but there has been 15546 // no cost-benefit analysis to prove it's worth it. 15547 bool LdStScalable = LDMemType.isScalableVector(); 15548 if (LdStScalable != STMemType.isScalableVector()) 15549 return SDValue(); 15550 15551 // If we are dealing with scalable vectors on a big endian platform the 15552 // calculation of offsets below becomes trickier, since we do not know at 15553 // compile time the absolute size of the vector. Until we've done more 15554 // analysis on big-endian platforms it seems better to bail out for now. 15555 if (LdStScalable && DAG.getDataLayout().isBigEndian()) 15556 return SDValue(); 15557 15558 BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); 15559 BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG); 15560 int64_t Offset; 15561 if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) 15562 return SDValue(); 15563 15564 // Normalize for Endianness. After this Offset=0 will denote that the least 15565 // significant bit in the loaded value maps to the least significant bit in 15566 // the stored value). With Offset=n (for n > 0) the loaded value starts at the 15567 // n:th least significant byte of the stored value. 15568 if (DAG.getDataLayout().isBigEndian()) 15569 Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() - 15570 (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) / 15571 8 - 15572 Offset; 15573 15574 // Check that the stored value cover all bits that are loaded. 
15575 bool STCoversLD; 15576 15577 TypeSize LdMemSize = LDMemType.getSizeInBits(); 15578 TypeSize StMemSize = STMemType.getSizeInBits(); 15579 if (LdStScalable) 15580 STCoversLD = (Offset == 0) && LdMemSize == StMemSize; 15581 else 15582 STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <= 15583 StMemSize.getFixedSize()); 15584 15585 auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue { 15586 if (LD->isIndexed()) { 15587 // Cannot handle opaque target constants and we must respect the user's 15588 // request not to split indexes from loads. 15589 if (!canSplitIdx(LD)) 15590 return SDValue(); 15591 SDValue Idx = SplitIndexingFromLoad(LD); 15592 SDValue Ops[] = {Val, Idx, Chain}; 15593 return CombineTo(LD, Ops, 3); 15594 } 15595 return CombineTo(LD, Val, Chain); 15596 }; 15597 15598 if (!STCoversLD) 15599 return SDValue(); 15600 15601 // Memory as copy space (potentially masked). 15602 if (Offset == 0 && LDType == STType && STMemType == LDMemType) { 15603 // Simple case: Direct non-truncating forwarding 15604 if (LDType.getSizeInBits() == LdMemSize) 15605 return ReplaceLd(LD, ST->getValue(), Chain); 15606 // Can we model the truncate and extension with an and mask? 15607 if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() && 15608 !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) { 15609 // Mask to size of LDMemType 15610 auto Mask = 15611 DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(), 15612 StMemSize.getFixedSize()), 15613 SDLoc(ST), STType); 15614 auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask); 15615 return ReplaceLd(LD, Val, Chain); 15616 } 15617 } 15618 15619 // TODO: Deal with nonzero offset. 15620 if (LD->getBasePtr().isUndef() || Offset != 0) 15621 return SDValue(); 15622 // Model necessary truncations / extenstions. 15623 SDValue Val; 15624 // Truncate Value To Stored Memory Size. 15625 do { 15626 if (!getTruncatedStoreValue(ST, Val)) 15627 continue; 15628 if (!isTypeLegal(LDMemType)) 15629 continue; 15630 if (STMemType != LDMemType) { 15631 // TODO: Support vectors? This requires extract_subvector/bitcast. 15632 if (!STMemType.isVector() && !LDMemType.isVector() && 15633 STMemType.isInteger() && LDMemType.isInteger()) 15634 Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val); 15635 else 15636 continue; 15637 } 15638 if (!extendLoadedValueToExtension(LD, Val)) 15639 continue; 15640 return ReplaceLd(LD, Val, Chain); 15641 } while (false); 15642 15643 // On failure, cleanup dead nodes we may have created. 15644 if (Val->use_empty()) 15645 deleteAndRecombine(Val.getNode()); 15646 return SDValue(); 15647 } 15648 15649 SDValue DAGCombiner::visitLOAD(SDNode *N) { 15650 LoadSDNode *LD = cast<LoadSDNode>(N); 15651 SDValue Chain = LD->getChain(); 15652 SDValue Ptr = LD->getBasePtr(); 15653 15654 // If load is not volatile and there are no uses of the loaded value (and 15655 // the updated indexed value in case of indexed loads), change uses of the 15656 // chain value into uses of the chain input (i.e. delete the dead load). 15657 // TODO: Allow this for unordered atomics (see D66309) 15658 if (LD->isSimple()) { 15659 if (N->getValueType(1) == MVT::Other) { 15660 // Unindexed loads. 15661 if (!N->hasAnyUseOfValue(0)) { 15662 // It's not safe to use the two value CombineTo variant here. e.g. 15663 // v1, chain2 = load chain1, loc 15664 // v2, chain3 = load chain2, loc 15665 // v3 = add v2, c 15666 // Now we replace use of chain2 with chain1. 
This makes the second load 15667 // isomorphic to the one we are deleting, and thus makes this load live. 15668 LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG); 15669 dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG); 15670 dbgs() << "\n"); 15671 WorklistRemover DeadNodes(*this); 15672 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); 15673 AddUsersToWorklist(Chain.getNode()); 15674 if (N->use_empty()) 15675 deleteAndRecombine(N); 15676 15677 return SDValue(N, 0); // Return N so it doesn't get rechecked! 15678 } 15679 } else { 15680 // Indexed loads. 15681 assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?"); 15682 15683 // If this load has an opaque TargetConstant offset, then we cannot split 15684 // the indexing into an add/sub directly (that TargetConstant may not be 15685 // valid for a different type of node, and we cannot convert an opaque 15686 // target constant into a regular constant). 15687 bool CanSplitIdx = canSplitIdx(LD); 15688 15689 if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) { 15690 SDValue Undef = DAG.getUNDEF(N->getValueType(0)); 15691 SDValue Index; 15692 if (N->hasAnyUseOfValue(1) && CanSplitIdx) { 15693 Index = SplitIndexingFromLoad(LD); 15694 // Try to fold the base pointer arithmetic into subsequent loads and 15695 // stores. 15696 AddUsersToWorklist(N); 15697 } else 15698 Index = DAG.getUNDEF(N->getValueType(1)); 15699 LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); 15700 dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG); 15701 dbgs() << " and 2 other values\n"); 15702 WorklistRemover DeadNodes(*this); 15703 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); 15704 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index); 15705 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); 15706 deleteAndRecombine(N); 15707 return SDValue(N, 0); // Return N so it doesn't get rechecked! 15708 } 15709 } 15710 } 15711 15712 // If this load is directly stored, replace the load value with the stored 15713 // value. 15714 if (auto V = ForwardStoreValueToDirectLoad(LD)) 15715 return V; 15716 15717 // Try to infer better alignment information than the load already has. 15718 if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) { 15719 if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) { 15720 if (*Alignment > LD->getAlign() && 15721 isAligned(*Alignment, LD->getSrcValueOffset())) { 15722 SDValue NewLoad = DAG.getExtLoad( 15723 LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr, 15724 LD->getPointerInfo(), LD->getMemoryVT(), *Alignment, 15725 LD->getMemOperand()->getFlags(), LD->getAAInfo()); 15726 // NewLoad will always be N as we are only refining the alignment 15727 assert(NewLoad.getNode() == N); 15728 (void)NewLoad; 15729 } 15730 } 15731 } 15732 15733 if (LD->isUnindexed()) { 15734 // Walk up chain skipping non-aliasing memory nodes. 15735 SDValue BetterChain = FindBetterChain(LD, Chain); 15736 15737 // If there is a better chain. 15738 if (Chain != BetterChain) { 15739 SDValue ReplLoad; 15740 15741 // Replace the chain to void dependency. 15742 if (LD->getExtensionType() == ISD::NON_EXTLOAD) { 15743 ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD), 15744 BetterChain, Ptr, LD->getMemOperand()); 15745 } else { 15746 ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), 15747 LD->getValueType(0), 15748 BetterChain, Ptr, LD->getMemoryVT(), 15749 LD->getMemOperand()); 15750 } 15751 15752 // Create token factor to keep old chain connected. 
15753 SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), 15754 MVT::Other, Chain, ReplLoad.getValue(1)); 15755 15756 // Replace uses with load result and token factor 15757 return CombineTo(N, ReplLoad.getValue(0), Token); 15758 } 15759 } 15760 15761 // Try transforming N to an indexed load. 15762 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) 15763 return SDValue(N, 0); 15764 15765 // Try to slice up N to more direct loads if the slices are mapped to 15766 // different register banks or pairing can take place. 15767 if (SliceUpLoad(N)) 15768 return SDValue(N, 0); 15769 15770 return SDValue(); 15771 } 15772 15773 namespace { 15774 15775 /// Helper structure used to slice a load in smaller loads. 15776 /// Basically a slice is obtained from the following sequence: 15777 /// Origin = load Ty1, Base 15778 /// Shift = srl Ty1 Origin, CstTy Amount 15779 /// Inst = trunc Shift to Ty2 15780 /// 15781 /// Then, it will be rewritten into: 15782 /// Slice = load SliceTy, Base + SliceOffset 15783 /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2 15784 /// 15785 /// SliceTy is deduced from the number of bits that are actually used to 15786 /// build Inst. 15787 struct LoadedSlice { 15788 /// Helper structure used to compute the cost of a slice. 15789 struct Cost { 15790 /// Are we optimizing for code size. 15791 bool ForCodeSize = false; 15792 15793 /// Various cost. 15794 unsigned Loads = 0; 15795 unsigned Truncates = 0; 15796 unsigned CrossRegisterBanksCopies = 0; 15797 unsigned ZExts = 0; 15798 unsigned Shift = 0; 15799 15800 explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {} 15801 15802 /// Get the cost of one isolated slice. 15803 Cost(const LoadedSlice &LS, bool ForCodeSize) 15804 : ForCodeSize(ForCodeSize), Loads(1) { 15805 EVT TruncType = LS.Inst->getValueType(0); 15806 EVT LoadedType = LS.getLoadedType(); 15807 if (TruncType != LoadedType && 15808 !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType)) 15809 ZExts = 1; 15810 } 15811 15812 /// Account for slicing gain in the current cost. 15813 /// Slicing provide a few gains like removing a shift or a 15814 /// truncate. This method allows to grow the cost of the original 15815 /// load with the gain from this slice. 15816 void addSliceGain(const LoadedSlice &LS) { 15817 // Each slice saves a truncate. 15818 const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); 15819 if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(), 15820 LS.Inst->getValueType(0))) 15821 ++Truncates; 15822 // If there is a shift amount, this slice gets rid of it. 15823 if (LS.Shift) 15824 ++Shift; 15825 // If this slice can merge a cross register bank copy, account for it. 15826 if (LS.canMergeExpensiveCrossRegisterBankCopy()) 15827 ++CrossRegisterBanksCopies; 15828 } 15829 15830 Cost &operator+=(const Cost &RHS) { 15831 Loads += RHS.Loads; 15832 Truncates += RHS.Truncates; 15833 CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies; 15834 ZExts += RHS.ZExts; 15835 Shift += RHS.Shift; 15836 return *this; 15837 } 15838 15839 bool operator==(const Cost &RHS) const { 15840 return Loads == RHS.Loads && Truncates == RHS.Truncates && 15841 CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies && 15842 ZExts == RHS.ZExts && Shift == RHS.Shift; 15843 } 15844 15845 bool operator!=(const Cost &RHS) const { return !(*this == RHS); } 15846 15847 bool operator<(const Cost &RHS) const { 15848 // Assume cross register banks copies are as expensive as loads. 15849 // FIXME: Do we want some more target hooks? 
15850 unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies; 15851 unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies; 15852 // Unless we are optimizing for code size, consider the 15853 // expensive operation first. 15854 if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS) 15855 return ExpensiveOpsLHS < ExpensiveOpsRHS; 15856 return (Truncates + ZExts + Shift + ExpensiveOpsLHS) < 15857 (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS); 15858 } 15859 15860 bool operator>(const Cost &RHS) const { return RHS < *this; } 15861 15862 bool operator<=(const Cost &RHS) const { return !(RHS < *this); } 15863 15864 bool operator>=(const Cost &RHS) const { return !(*this < RHS); } 15865 }; 15866 15867 // The last instruction that represent the slice. This should be a 15868 // truncate instruction. 15869 SDNode *Inst; 15870 15871 // The original load instruction. 15872 LoadSDNode *Origin; 15873 15874 // The right shift amount in bits from the original load. 15875 unsigned Shift; 15876 15877 // The DAG from which Origin came from. 15878 // This is used to get some contextual information about legal types, etc. 15879 SelectionDAG *DAG; 15880 15881 LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr, 15882 unsigned Shift = 0, SelectionDAG *DAG = nullptr) 15883 : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {} 15884 15885 /// Get the bits used in a chunk of bits \p BitWidth large. 15886 /// \return Result is \p BitWidth and has used bits set to 1 and 15887 /// not used bits set to 0. 15888 APInt getUsedBits() const { 15889 // Reproduce the trunc(lshr) sequence: 15890 // - Start from the truncated value. 15891 // - Zero extend to the desired bit width. 15892 // - Shift left. 15893 assert(Origin && "No original load to compare against."); 15894 unsigned BitWidth = Origin->getValueSizeInBits(0); 15895 assert(Inst && "This slice is not bound to an instruction"); 15896 assert(Inst->getValueSizeInBits(0) <= BitWidth && 15897 "Extracted slice is bigger than the whole type!"); 15898 APInt UsedBits(Inst->getValueSizeInBits(0), 0); 15899 UsedBits.setAllBits(); 15900 UsedBits = UsedBits.zext(BitWidth); 15901 UsedBits <<= Shift; 15902 return UsedBits; 15903 } 15904 15905 /// Get the size of the slice to be loaded in bytes. 15906 unsigned getLoadedSize() const { 15907 unsigned SliceSize = getUsedBits().countPopulation(); 15908 assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); 15909 return SliceSize / 8; 15910 } 15911 15912 /// Get the type that will be loaded for this slice. 15913 /// Note: This may not be the final type for the slice. 15914 EVT getLoadedType() const { 15915 assert(DAG && "Missing context"); 15916 LLVMContext &Ctxt = *DAG->getContext(); 15917 return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8); 15918 } 15919 15920 /// Get the alignment of the load used for this slice. 15921 Align getAlign() const { 15922 Align Alignment = Origin->getAlign(); 15923 uint64_t Offset = getOffsetFromBase(); 15924 if (Offset != 0) 15925 Alignment = commonAlignment(Alignment, Alignment.value() + Offset); 15926 return Alignment; 15927 } 15928 15929 /// Check if this slice can be rewritten with legal operations. 15930 bool isLegal() const { 15931 // An invalid slice is not legal. 15932 if (!Origin || !Inst || !DAG) 15933 return false; 15934 15935 // Offsets are for indexed load only, we do not handle that. 
15936 if (!Origin->getOffset().isUndef()) 15937 return false; 15938 15939 const TargetLowering &TLI = DAG->getTargetLoweringInfo(); 15940 15941 // Check that the type is legal. 15942 EVT SliceType = getLoadedType(); 15943 if (!TLI.isTypeLegal(SliceType)) 15944 return false; 15945 15946 // Check that the load is legal for this type. 15947 if (!TLI.isOperationLegal(ISD::LOAD, SliceType)) 15948 return false; 15949 15950 // Check that the offset can be computed. 15951 // 1. Check its type. 15952 EVT PtrType = Origin->getBasePtr().getValueType(); 15953 if (PtrType == MVT::Untyped || PtrType.isExtended()) 15954 return false; 15955 15956 // 2. Check that it fits in the immediate. 15957 if (!TLI.isLegalAddImmediate(getOffsetFromBase())) 15958 return false; 15959 15960 // 3. Check that the computation is legal. 15961 if (!TLI.isOperationLegal(ISD::ADD, PtrType)) 15962 return false; 15963 15964 // Check that the zext is legal if it needs one. 15965 EVT TruncateType = Inst->getValueType(0); 15966 if (TruncateType != SliceType && 15967 !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType)) 15968 return false; 15969 15970 return true; 15971 } 15972 15973 /// Get the offset in bytes of this slice in the original chunk of 15974 /// bits. 15975 /// \pre DAG != nullptr. 15976 uint64_t getOffsetFromBase() const { 15977 assert(DAG && "Missing context."); 15978 bool IsBigEndian = DAG->getDataLayout().isBigEndian(); 15979 assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported."); 15980 uint64_t Offset = Shift / 8; 15981 unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8; 15982 assert(!(Origin->getValueSizeInBits(0) & 0x7) && 15983 "The size of the original loaded type is not a multiple of a" 15984 " byte."); 15985 // If Offset is bigger than TySizeInBytes, it means we are loading all 15986 // zeros. This should have been optimized before in the process. 15987 assert(TySizeInBytes > Offset && 15988 "Invalid shift amount for given loaded size"); 15989 if (IsBigEndian) 15990 Offset = TySizeInBytes - Offset - getLoadedSize(); 15991 return Offset; 15992 } 15993 15994 /// Generate the sequence of instructions to load the slice 15995 /// represented by this object and redirect the uses of this slice to 15996 /// this new sequence of instructions. 15997 /// \pre this->Inst && this->Origin are valid Instructions and this 15998 /// object passed the legal check: LoadedSlice::isLegal returned true. 15999 /// \return The last instruction of the sequence used to load the slice. 16000 SDValue loadSlice() const { 16001 assert(Inst && Origin && "Unable to replace a non-existing slice."); 16002 const SDValue &OldBaseAddr = Origin->getBasePtr(); 16003 SDValue BaseAddr = OldBaseAddr; 16004 // Get the offset in that chunk of bytes w.r.t. the endianness. 16005 int64_t Offset = static_cast<int64_t>(getOffsetFromBase()); 16006 assert(Offset >= 0 && "Offset too big to fit in int64_t!"); 16007 if (Offset) { 16008 // BaseAddr = BaseAddr + Offset. 16009 EVT ArithType = BaseAddr.getValueType(); 16010 SDLoc DL(Origin); 16011 BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr, 16012 DAG->getConstant(Offset, DL, ArithType)); 16013 } 16014 16015 // Create the type of the loaded slice according to its size. 16016 EVT SliceType = getLoadedType(); 16017 16018 // Create the load for the slice. 
16019 SDValue LastInst = 16020 DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, 16021 Origin->getPointerInfo().getWithOffset(Offset), getAlign(), 16022 Origin->getMemOperand()->getFlags()); 16023 // If the final type is not the same as the loaded type, this means that 16024 // we have to pad with zero. Create a zero extend for that. 16025 EVT FinalType = Inst->getValueType(0); 16026 if (SliceType != FinalType) 16027 LastInst = 16028 DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst); 16029 return LastInst; 16030 } 16031 16032 /// Check if this slice can be merged with an expensive cross register 16033 /// bank copy. E.g., 16034 /// i = load i32 16035 /// f = bitcast i32 i to float 16036 bool canMergeExpensiveCrossRegisterBankCopy() const { 16037 if (!Inst || !Inst->hasOneUse()) 16038 return false; 16039 SDNode *Use = *Inst->use_begin(); 16040 if (Use->getOpcode() != ISD::BITCAST) 16041 return false; 16042 assert(DAG && "Missing context"); 16043 const TargetLowering &TLI = DAG->getTargetLoweringInfo(); 16044 EVT ResVT = Use->getValueType(0); 16045 const TargetRegisterClass *ResRC = 16046 TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent()); 16047 const TargetRegisterClass *ArgRC = 16048 TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(), 16049 Use->getOperand(0)->isDivergent()); 16050 if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT)) 16051 return false; 16052 16053 // At this point, we know that we perform a cross-register-bank copy. 16054 // Check if it is expensive. 16055 const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); 16056 // Assume bitcasts are cheap, unless both register classes do not 16057 // explicitly share a common sub class. 16058 if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) 16059 return false; 16060 16061 // Check if it will be merged with the load. 16062 // 1. Check the alignment constraint. 16063 Align RequiredAlignment = DAG->getDataLayout().getABITypeAlign( 16064 ResVT.getTypeForEVT(*DAG->getContext())); 16065 16066 if (RequiredAlignment > getAlign()) 16067 return false; 16068 16069 // 2. Check that the load is a legal operation for that type. 16070 if (!TLI.isOperationLegal(ISD::LOAD, ResVT)) 16071 return false; 16072 16073 // 3. Check that we do not have a zext in the way. 16074 if (Inst->getValueType(0) != getLoadedType()) 16075 return false; 16076 16077 return true; 16078 } 16079 }; 16080 16081 } // end anonymous namespace 16082 16083 /// Check that all bits set in \p UsedBits form a dense region, i.e., 16084 /// \p UsedBits looks like 0..0 1..1 0..0. 16085 static bool areUsedBitsDense(const APInt &UsedBits) { 16086 // If all the bits are one, this is dense! 16087 if (UsedBits.isAllOnesValue()) 16088 return true; 16089 16090 // Get rid of the unused bits on the right. 16091 APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros()); 16092 // Get rid of the unused bits on the left. 16093 if (NarrowedUsedBits.countLeadingZeros()) 16094 NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits()); 16095 // Check that the chunk of bits is completely used. 16096 return NarrowedUsedBits.isAllOnesValue(); 16097 } 16098 16099 /// Check whether or not \p First and \p Second are next to each other 16100 /// in memory. This means that there is no hole between the bits loaded 16101 /// by \p First and the bits loaded by \p Second. 
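/// For example, two slices of an i32 load covering bits [0, 16) and [16, 32)
/// are next to each other, while slices covering bits [0, 8) and [16, 24)
/// leave a hole between them and are not.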
16102 static bool areSlicesNextToEachOther(const LoadedSlice &First,
16103 const LoadedSlice &Second) {
16104 assert(First.Origin == Second.Origin && First.Origin &&
16105 "Unable to match different memory origins.");
16106 APInt UsedBits = First.getUsedBits();
16107 assert((UsedBits & Second.getUsedBits()) == 0 &&
16108 "Slices are not supposed to overlap.");
16109 UsedBits |= Second.getUsedBits();
16110 return areUsedBitsDense(UsedBits);
16111 }
16112 
16113 /// Adjust the \p GlobalLSCost according to the target
16114 /// pairing capabilities and the layout of the slices.
16115 /// \pre \p GlobalLSCost should account for at least as many loads as
16116 /// there are in the slices in \p LoadedSlices.
16117 static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
16118 LoadedSlice::Cost &GlobalLSCost) {
16119 unsigned NumberOfSlices = LoadedSlices.size();
16120 // If there are fewer than 2 elements, no pairing is possible.
16121 if (NumberOfSlices < 2)
16122 return;
16123 
16124 // Sort the slices so that elements that are likely to be next to each
16125 // other in memory are next to each other in the list.
16126 llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
16127 assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
16128 return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
16129 });
16130 const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
16131 // First (resp. Second) is the first (resp. second) potential candidate
16132 // to be placed in a paired load.
16133 const LoadedSlice *First = nullptr;
16134 const LoadedSlice *Second = nullptr;
16135 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
16136 // Set the beginning of the pair.
16137 First = Second) {
16138 Second = &LoadedSlices[CurrSlice];
16139 
16140 // If First is NULL, it means we start a new pair.
16141 // Get to the next slice.
16142 if (!First)
16143 continue;
16144 
16145 EVT LoadedType = First->getLoadedType();
16146 
16147 // If the types of the slices are different, we cannot pair them.
16148 if (LoadedType != Second->getLoadedType())
16149 continue;
16150 
16151 // Check if the target supplies paired loads for this type.
16152 Align RequiredAlignment;
16153 if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
16154 // Move to the next pair; this type is hopeless.
16155 Second = nullptr;
16156 continue;
16157 }
16158 // Check if we meet the alignment requirement.
16159 if (First->getAlign() < RequiredAlignment)
16160 continue;
16161 
16162 // Check that both loads are next to each other in memory.
16163 if (!areSlicesNextToEachOther(*First, *Second))
16164 continue;
16165 
16166 assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
16167 --GlobalLSCost.Loads;
16168 // Move to the next pair.
16169 Second = nullptr;
16170 }
16171 }
16172 
16173 /// Check the profitability of all the involved LoadedSlices.
16174 /// Currently, it is considered profitable if there are exactly two
16175 /// involved slices (1) which are (2) next to each other in memory, and
16176 /// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
16177 ///
16178 /// Note: The order of the elements in \p LoadedSlices may be modified, but not
16179 /// the elements themselves.
16180 ///
16181 /// FIXME: When the cost model is mature enough, we can relax
16182 /// constraints (1) and (2).
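/// For example, an i32 load whose only uses are (trunc i16 x) and
/// (trunc i16 (srl x, 16)) produces two adjacent i16 slices, which satisfies
/// constraints (1) and (2); whether the load is actually split then depends
/// on the cost comparison in (3).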
16183 static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices, 16184 const APInt &UsedBits, bool ForCodeSize) { 16185 unsigned NumberOfSlices = LoadedSlices.size(); 16186 if (StressLoadSlicing) 16187 return NumberOfSlices > 1; 16188 16189 // Check (1). 16190 if (NumberOfSlices != 2) 16191 return false; 16192 16193 // Check (2). 16194 if (!areUsedBitsDense(UsedBits)) 16195 return false; 16196 16197 // Check (3). 16198 LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize); 16199 // The original code has one big load. 16200 OrigCost.Loads = 1; 16201 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) { 16202 const LoadedSlice &LS = LoadedSlices[CurrSlice]; 16203 // Accumulate the cost of all the slices. 16204 LoadedSlice::Cost SliceCost(LS, ForCodeSize); 16205 GlobalSlicingCost += SliceCost; 16206 16207 // Account as cost in the original configuration the gain obtained 16208 // with the current slices. 16209 OrigCost.addSliceGain(LS); 16210 } 16211 16212 // If the target supports paired load, adjust the cost accordingly. 16213 adjustCostForPairing(LoadedSlices, GlobalSlicingCost); 16214 return OrigCost > GlobalSlicingCost; 16215 } 16216 16217 /// If the given load, \p LI, is used only by trunc or trunc(lshr) 16218 /// operations, split it in the various pieces being extracted. 16219 /// 16220 /// This sort of thing is introduced by SROA. 16221 /// This slicing takes care not to insert overlapping loads. 16222 /// \pre LI is a simple load (i.e., not an atomic or volatile load). 16223 bool DAGCombiner::SliceUpLoad(SDNode *N) { 16224 if (Level < AfterLegalizeDAG) 16225 return false; 16226 16227 LoadSDNode *LD = cast<LoadSDNode>(N); 16228 if (!LD->isSimple() || !ISD::isNormalLoad(LD) || 16229 !LD->getValueType(0).isInteger()) 16230 return false; 16231 16232 // The algorithm to split up a load of a scalable vector into individual 16233 // elements currently requires knowing the length of the loaded type, 16234 // so will need adjusting to work on scalable vectors. 16235 if (LD->getValueType(0).isScalableVector()) 16236 return false; 16237 16238 // Keep track of already used bits to detect overlapping values. 16239 // In that case, we will just abort the transformation. 16240 APInt UsedBits(LD->getValueSizeInBits(0), 0); 16241 16242 SmallVector<LoadedSlice, 4> LoadedSlices; 16243 16244 // Check if this load is used as several smaller chunks of bits. 16245 // Basically, look for uses in trunc or trunc(lshr) and record a new chain 16246 // of computation for each trunc. 16247 for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end(); 16248 UI != UIEnd; ++UI) { 16249 // Skip the uses of the chain. 16250 if (UI.getUse().getResNo() != 0) 16251 continue; 16252 16253 SDNode *User = *UI; 16254 unsigned Shift = 0; 16255 16256 // Check if this is a trunc(lshr). 16257 if (User->getOpcode() == ISD::SRL && User->hasOneUse() && 16258 isa<ConstantSDNode>(User->getOperand(1))) { 16259 Shift = User->getConstantOperandVal(1); 16260 User = *User->use_begin(); 16261 } 16262 16263 // At this point, User is a Truncate, iff we encountered, trunc or 16264 // trunc(lshr). 16265 if (User->getOpcode() != ISD::TRUNCATE) 16266 return false; 16267 16268 // The width of the type must be a power of 2 and greater than 8-bits. 16269 // Otherwise the load cannot be represented in LLVM IR. 16270 // Moreover, if we shifted with a non-8-bits multiple, the slice 16271 // will be across several bytes. We do not support that. 
16272 unsigned Width = User->getValueSizeInBits(0); 16273 if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7)) 16274 return false; 16275 16276 // Build the slice for this chain of computations. 16277 LoadedSlice LS(User, LD, Shift, &DAG); 16278 APInt CurrentUsedBits = LS.getUsedBits(); 16279 16280 // Check if this slice overlaps with another. 16281 if ((CurrentUsedBits & UsedBits) != 0) 16282 return false; 16283 // Update the bits used globally. 16284 UsedBits |= CurrentUsedBits; 16285 16286 // Check if the new slice would be legal. 16287 if (!LS.isLegal()) 16288 return false; 16289 16290 // Record the slice. 16291 LoadedSlices.push_back(LS); 16292 } 16293 16294 // Abort slicing if it does not seem to be profitable. 16295 if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize)) 16296 return false; 16297 16298 ++SlicedLoads; 16299 16300 // Rewrite each chain to use an independent load. 16301 // By construction, each chain can be represented by a unique load. 16302 16303 // Prepare the argument for the new token factor for all the slices. 16304 SmallVector<SDValue, 8> ArgChains; 16305 for (const LoadedSlice &LS : LoadedSlices) { 16306 SDValue SliceInst = LS.loadSlice(); 16307 CombineTo(LS.Inst, SliceInst, true); 16308 if (SliceInst.getOpcode() != ISD::LOAD) 16309 SliceInst = SliceInst.getOperand(0); 16310 assert(SliceInst->getOpcode() == ISD::LOAD && 16311 "It takes more than a zext to get to the loaded slice!!"); 16312 ArgChains.push_back(SliceInst.getValue(1)); 16313 } 16314 16315 SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, 16316 ArgChains); 16317 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); 16318 AddToWorklist(Chain.getNode()); 16319 return true; 16320 } 16321 16322 /// Check to see if V is (and load (ptr), imm), where the load is having 16323 /// specific bytes cleared out. If so, return the byte size being masked out 16324 /// and the shift amount. 16325 static std::pair<unsigned, unsigned> 16326 CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { 16327 std::pair<unsigned, unsigned> Result(0, 0); 16328 16329 // Check for the structure we're looking for. 16330 if (V->getOpcode() != ISD::AND || 16331 !isa<ConstantSDNode>(V->getOperand(1)) || 16332 !ISD::isNormalLoad(V->getOperand(0).getNode())) 16333 return Result; 16334 16335 // Check the chain and pointer. 16336 LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0)); 16337 if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer. 16338 16339 // This only handles simple types. 16340 if (V.getValueType() != MVT::i16 && 16341 V.getValueType() != MVT::i32 && 16342 V.getValueType() != MVT::i64) 16343 return Result; 16344 16345 // Check the constant mask. Invert it so that the bits being masked out are 16346 // 0 and the bits being kept are 1. Use getSExtValue so that leading bits 16347 // follow the sign bit for uniformity. 16348 uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue(); 16349 unsigned NotMaskLZ = countLeadingZeros(NotMask); 16350 if (NotMaskLZ & 7) return Result; // Must be multiple of a byte. 16351 unsigned NotMaskTZ = countTrailingZeros(NotMask); 16352 if (NotMaskTZ & 7) return Result; // Must be multiple of a byte. 16353 if (NotMaskLZ == 64) return Result; // All zero mask. 16354 16355 // See if we have a continuous run of bits. If so, we have 0*1+0* 16356 if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64) 16357 return Result; 16358 16359 // Adjust NotMaskLZ down to be from the actual size of the int instead of i64. 
  if (V.getValueType() != MVT::i64 && NotMaskLZ)
    NotMaskLZ -= 64 - V.getValueSizeInBits();

  unsigned MaskedBytes = (V.getValueSizeInBits() - NotMaskLZ - NotMaskTZ) / 8;
  switch (MaskedBytes) {
  case 1:
  case 2:
  case 4: break;
  default: return Result; // All one mask, or 5-byte mask.
  }

  // Verify that the masked region starts at a byte offset that is a multiple
  // of its size, so that the narrowed access is aligned the same as the
  // access width.
  if (NotMaskTZ && NotMaskTZ / 8 % MaskedBytes) return Result;

  // For narrowing to be valid, the load must be the memory operation that
  // immediately precedes the store.
  if (LD == Chain.getNode())
    ; // ok.
  else if (Chain->getOpcode() == ISD::TokenFactor &&
           SDValue(LD, 1).hasOneUse()) {
    // LD has only 1 chain use, so there are no indirect dependencies.
    if (!LD->isOperandOf(Chain.getNode()))
      return Result;
  } else
    return Result; // Fail.

  Result.first = MaskedBytes;
  Result.second = NotMaskTZ / 8;
  return Result;
}

/// Check to see if IVal is something that provides a value as specified by
/// MaskInfo. If so, replace the specified store with a narrower store of
/// truncated IVal.
static SDValue
ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
                                SDValue IVal, StoreSDNode *St,
                                DAGCombiner *DC) {
  unsigned NumBytes = MaskInfo.first;
  unsigned ByteShift = MaskInfo.second;
  SelectionDAG &DAG = DC->getDAG();

  // Check to see if IVal is all zeros in the part being masked in by the 'or'
  // that uses this. If not, this is not a replacement.
  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
                                  ByteShift * 8, (ByteShift + NumBytes) * 8);
  if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();

  // Check that it is legal on the target to do this. It is legal if the new
  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
  // legalization (and the target doesn't explicitly think this is a bad idea).
  MVT VT = MVT::getIntegerVT(NumBytes * 8);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!DC->isTypeLegal(VT))
    return SDValue();
  if (St->getMemOperand() &&
      !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                              *St->getMemOperand()))
    return SDValue();

  // Okay, we can do this! Replace the 'St' store with a store of IVal that is
  // shifted by ByteShift and truncated down to NumBytes.
  if (ByteShift) {
    SDLoc DL(IVal);
    IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
                       DAG.getConstant(ByteShift * 8, DL,
                                 DC->getShiftAmountTy(IVal.getValueType())));
  }

  // Figure out the offset for the store and the alignment of the access.
  unsigned StOffset;
  if (DAG.getDataLayout().isLittleEndian())
    StOffset = ByteShift;
  else
    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;

  SDValue Ptr = St->getBasePtr();
  if (StOffset) {
    SDLoc DL(IVal);
    Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(StOffset), DL);
  }

  // Truncate down to the new size.
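  // For example, with NumBytes == 2 and ByteShift == 1 on a little-endian
  // target, IVal is shifted right by 8, truncated to i16, and stored at
  // Ptr + 1, so only the two bytes being replaced are written.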
  IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);

  ++OpsNarrowed;
  return DAG
      .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
                St->getPointerInfo().getWithOffset(StOffset),
                St->getOriginalAlign());
}

/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
/// narrowing the load and store if it would end up being a win for performance
/// or code size.
SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  if (!ST->isSimple())
    return SDValue();

  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();
  EVT VT = Value.getValueType();

  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
    return SDValue();

  unsigned Opc = Value.getOpcode();

  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
  // is a byte mask indicating a consecutive number of bytes, check to see if
  // Y is known to provide just those bytes. If so, we try to replace the
  // load + replace + store sequence with a single (narrower) store, which
  // makes the load dead.
  if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
    std::pair<unsigned, unsigned> MaskedLoad;
    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
    if (MaskedLoad.first)
      if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(
              MaskedLoad, Value.getOperand(1), ST, this))
        return NewST;

    // Or is commutative, so try swapping X and Y.
    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
    if (MaskedLoad.first)
      if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(
              MaskedLoad, Value.getOperand(0), ST, this))
        return NewST;
  }

  if (!EnableReduceLoadOpStoreWidth)
    return SDValue();

  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
      Value.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N0 = Value.getOperand(0);
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      Chain == SDValue(N0.getNode(), 1)) {
    LoadSDNode *LD = cast<LoadSDNode>(N0);
    if (LD->getBasePtr() != Ptr ||
        LD->getPointerInfo().getAddrSpace() !=
            ST->getPointerInfo().getAddrSpace())
      return SDValue();

    // Find the type to narrow the load / op / store to.
    SDValue N1 = Value.getOperand(1);
    unsigned BitWidth = N1.getValueSizeInBits();
    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
    if (Opc == ISD::AND)
      Imm ^= APInt::getAllOnesValue(BitWidth);
    if (Imm == 0 || Imm.isAllOnesValue())
      return SDValue();
    unsigned ShAmt = Imm.countTrailingZeros();
    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
    // The narrowing should be profitable, the load/store operation should be
    // legal (or custom) and the store size should be equal to the NewVT width.
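    // For example, for "store (or (load p), 0x00FF0000), p" on an i32 value,
    // only byte 2 changes: ShAmt is 16, MSB is 23, and the first candidate
    // NewVT is i8.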
16523 while (NewBW < BitWidth && 16524 (NewVT.getStoreSizeInBits() != NewBW || 16525 !TLI.isOperationLegalOrCustom(Opc, NewVT) || 16526 !TLI.isNarrowingProfitable(VT, NewVT))) { 16527 NewBW = NextPowerOf2(NewBW); 16528 NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); 16529 } 16530 if (NewBW >= BitWidth) 16531 return SDValue(); 16532 16533 // If the lsb changed does not start at the type bitwidth boundary, 16534 // start at the previous one. 16535 if (ShAmt % NewBW) 16536 ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW; 16537 APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, 16538 std::min(BitWidth, ShAmt + NewBW)); 16539 if ((Imm & Mask) == Imm) { 16540 APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW); 16541 if (Opc == ISD::AND) 16542 NewImm ^= APInt::getAllOnesValue(NewBW); 16543 uint64_t PtrOff = ShAmt / 8; 16544 // For big endian targets, we need to adjust the offset to the pointer to 16545 // load the correct bytes. 16546 if (DAG.getDataLayout().isBigEndian()) 16547 PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff; 16548 16549 Align NewAlign = commonAlignment(LD->getAlign(), PtrOff); 16550 Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext()); 16551 if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy)) 16552 return SDValue(); 16553 16554 SDValue NewPtr = 16555 DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(PtrOff), SDLoc(LD)); 16556 SDValue NewLD = 16557 DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, 16558 LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, 16559 LD->getMemOperand()->getFlags(), LD->getAAInfo()); 16560 SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, 16561 DAG.getConstant(NewImm, SDLoc(Value), 16562 NewVT)); 16563 SDValue NewST = 16564 DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, 16565 ST->getPointerInfo().getWithOffset(PtrOff), NewAlign); 16566 16567 AddToWorklist(NewPtr.getNode()); 16568 AddToWorklist(NewLD.getNode()); 16569 AddToWorklist(NewVal.getNode()); 16570 WorklistRemover DeadNodes(*this); 16571 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); 16572 ++OpsNarrowed; 16573 return NewST; 16574 } 16575 } 16576 16577 return SDValue(); 16578 } 16579 16580 /// For a given floating point load / store pair, if the load value isn't used 16581 /// by any other operations, then consider transforming the pair to integer 16582 /// load / store operations if the target deems the transformation profitable. 16583 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { 16584 StoreSDNode *ST = cast<StoreSDNode>(N); 16585 SDValue Value = ST->getValue(); 16586 if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) && 16587 Value.hasOneUse()) { 16588 LoadSDNode *LD = cast<LoadSDNode>(Value); 16589 EVT VT = LD->getMemoryVT(); 16590 if (!VT.isFloatingPoint() || 16591 VT != ST->getMemoryVT() || 16592 LD->isNonTemporal() || 16593 ST->isNonTemporal() || 16594 LD->getPointerInfo().getAddrSpace() != 0 || 16595 ST->getPointerInfo().getAddrSpace() != 0) 16596 return SDValue(); 16597 16598 TypeSize VTSize = VT.getSizeInBits(); 16599 16600 // We don't know the size of scalable types at compile time so we cannot 16601 // create an integer of the equivalent size. 
16602 if (VTSize.isScalable()) 16603 return SDValue(); 16604 16605 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize()); 16606 if (!TLI.isOperationLegal(ISD::LOAD, IntVT) || 16607 !TLI.isOperationLegal(ISD::STORE, IntVT) || 16608 !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) || 16609 !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT)) 16610 return SDValue(); 16611 16612 Align LDAlign = LD->getAlign(); 16613 Align STAlign = ST->getAlign(); 16614 Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext()); 16615 Align ABIAlign = DAG.getDataLayout().getABITypeAlign(IntVTTy); 16616 if (LDAlign < ABIAlign || STAlign < ABIAlign) 16617 return SDValue(); 16618 16619 SDValue NewLD = 16620 DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(), 16621 LD->getPointerInfo(), LDAlign); 16622 16623 SDValue NewST = 16624 DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(), 16625 ST->getPointerInfo(), STAlign); 16626 16627 AddToWorklist(NewLD.getNode()); 16628 AddToWorklist(NewST.getNode()); 16629 WorklistRemover DeadNodes(*this); 16630 DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1)); 16631 ++LdStFP2Int; 16632 return NewST; 16633 } 16634 16635 return SDValue(); 16636 } 16637 16638 // This is a helper function for visitMUL to check the profitability 16639 // of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). 16640 // MulNode is the original multiply, AddNode is (add x, c1), 16641 // and ConstNode is c2. 16642 // 16643 // If the (add x, c1) has multiple uses, we could increase 16644 // the number of adds if we make this transformation. 16645 // It would only be worth doing this if we can remove a 16646 // multiply in the process. Check for that here. 16647 // To illustrate: 16648 // (A + c1) * c3 16649 // (A + c2) * c3 16650 // We're checking for cases where we have common "c3 * A" expressions. 16651 bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, 16652 SDValue &AddNode, 16653 SDValue &ConstNode) { 16654 APInt Val; 16655 16656 // If the add only has one use, this would be OK to do. 16657 if (AddNode.getNode()->hasOneUse()) 16658 return true; 16659 16660 // Walk all the users of the constant with which we're multiplying. 16661 for (SDNode *Use : ConstNode->uses()) { 16662 if (Use == MulNode) // This use is the one we're on right now. Skip it. 16663 continue; 16664 16665 if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. 16666 SDNode *OtherOp; 16667 SDNode *MulVar = AddNode.getOperand(0).getNode(); 16668 16669 // OtherOp is what we're multiplying against the constant. 16670 if (Use->getOperand(0) == ConstNode) 16671 OtherOp = Use->getOperand(1).getNode(); 16672 else 16673 OtherOp = Use->getOperand(0).getNode(); 16674 16675 // Check to see if multiply is with the same operand of our "add". 16676 // 16677 // ConstNode = CONST 16678 // Use = ConstNode * A <-- visiting Use. OtherOp is A. 16679 // ... 16680 // AddNode = (A + c1) <-- MulVar is A. 16681 // = AddNode * ConstNode <-- current visiting instruction. 16682 // 16683 // If we make this transformation, we will have a common 16684 // multiply (ConstNode * A) that we can save. 16685 if (OtherOp == MulVar) 16686 return true; 16687 16688 // Now check to see if a future expansion will give us a common 16689 // multiply. 16690 // 16691 // ConstNode = CONST 16692 // AddNode = (A + c1) 16693 // ... = AddNode * ConstNode <-- current visiting instruction. 16694 // ... 16695 // OtherOp = (A + c2) 16696 // Use = OtherOp * ConstNode <-- visiting Use. 
16697 // 16698 // If we make this transformation, we will have a common 16699 // multiply (CONST * A) after we also do the same transformation 16700 // to the "t2" instruction. 16701 if (OtherOp->getOpcode() == ISD::ADD && 16702 DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) && 16703 OtherOp->getOperand(0).getNode() == MulVar) 16704 return true; 16705 } 16706 } 16707 16708 // Didn't find a case where this would be profitable. 16709 return false; 16710 } 16711 16712 SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, 16713 unsigned NumStores) { 16714 SmallVector<SDValue, 8> Chains; 16715 SmallPtrSet<const SDNode *, 8> Visited; 16716 SDLoc StoreDL(StoreNodes[0].MemNode); 16717 16718 for (unsigned i = 0; i < NumStores; ++i) { 16719 Visited.insert(StoreNodes[i].MemNode); 16720 } 16721 16722 // don't include nodes that are children or repeated nodes. 16723 for (unsigned i = 0; i < NumStores; ++i) { 16724 if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second) 16725 Chains.push_back(StoreNodes[i].MemNode->getChain()); 16726 } 16727 16728 assert(Chains.size() > 0 && "Chain should have generated a chain"); 16729 return DAG.getTokenFactor(StoreDL, Chains); 16730 } 16731 16732 bool DAGCombiner::mergeStoresOfConstantsOrVecElts( 16733 SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, 16734 bool IsConstantSrc, bool UseVector, bool UseTrunc) { 16735 // Make sure we have something to merge. 16736 if (NumStores < 2) 16737 return false; 16738 16739 // The latest Node in the DAG. 16740 SDLoc DL(StoreNodes[0].MemNode); 16741 16742 TypeSize ElementSizeBits = MemVT.getStoreSizeInBits(); 16743 unsigned SizeInBits = NumStores * ElementSizeBits; 16744 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; 16745 16746 EVT StoreTy; 16747 if (UseVector) { 16748 unsigned Elts = NumStores * NumMemElts; 16749 // Get the type for the merged vector store. 16750 StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); 16751 } else 16752 StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); 16753 16754 SDValue StoredVal; 16755 if (UseVector) { 16756 if (IsConstantSrc) { 16757 SmallVector<SDValue, 8> BuildVector; 16758 for (unsigned I = 0; I != NumStores; ++I) { 16759 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode); 16760 SDValue Val = St->getValue(); 16761 // If constant is of the wrong type, convert it now. 16762 if (MemVT != Val.getValueType()) { 16763 Val = peekThroughBitcasts(Val); 16764 // Deal with constants of wrong size. 16765 if (ElementSizeBits != Val.getValueSizeInBits()) { 16766 EVT IntMemVT = 16767 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); 16768 if (isa<ConstantFPSDNode>(Val)) { 16769 // Not clear how to truncate FP values. 16770 return false; 16771 } else if (auto *C = dyn_cast<ConstantSDNode>(Val)) 16772 Val = DAG.getConstant(C->getAPIntValue() 16773 .zextOrTrunc(Val.getValueSizeInBits()) 16774 .zextOrTrunc(ElementSizeBits), 16775 SDLoc(C), IntMemVT); 16776 } 16777 // Make sure correctly size type is the correct type. 16778 Val = DAG.getBitcast(MemVT, Val); 16779 } 16780 BuildVector.push_back(Val); 16781 } 16782 StoredVal = DAG.getNode(MemVT.isVector() ? 
ISD::CONCAT_VECTORS 16783 : ISD::BUILD_VECTOR, 16784 DL, StoreTy, BuildVector); 16785 } else { 16786 SmallVector<SDValue, 8> Ops; 16787 for (unsigned i = 0; i < NumStores; ++i) { 16788 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); 16789 SDValue Val = peekThroughBitcasts(St->getValue()); 16790 // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of 16791 // type MemVT. If the underlying value is not the correct 16792 // type, but it is an extraction of an appropriate vector we 16793 // can recast Val to be of the correct type. This may require 16794 // converting between EXTRACT_VECTOR_ELT and 16795 // EXTRACT_SUBVECTOR. 16796 if ((MemVT != Val.getValueType()) && 16797 (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || 16798 Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) { 16799 EVT MemVTScalarTy = MemVT.getScalarType(); 16800 // We may need to add a bitcast here to get types to line up. 16801 if (MemVTScalarTy != Val.getValueType().getScalarType()) { 16802 Val = DAG.getBitcast(MemVT, Val); 16803 } else { 16804 unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR 16805 : ISD::EXTRACT_VECTOR_ELT; 16806 SDValue Vec = Val.getOperand(0); 16807 SDValue Idx = Val.getOperand(1); 16808 Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx); 16809 } 16810 } 16811 Ops.push_back(Val); 16812 } 16813 16814 // Build the extracted vector elements back into a vector. 16815 StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS 16816 : ISD::BUILD_VECTOR, 16817 DL, StoreTy, Ops); 16818 } 16819 } else { 16820 // We should always use a vector store when merging extracted vector 16821 // elements, so this path implies a store of constants. 16822 assert(IsConstantSrc && "Merged vector elements should use vector store"); 16823 16824 APInt StoreInt(SizeInBits, 0); 16825 16826 // Construct a single integer constant which is made of the smaller 16827 // constant inputs. 16828 bool IsLE = DAG.getDataLayout().isLittleEndian(); 16829 for (unsigned i = 0; i < NumStores; ++i) { 16830 unsigned Idx = IsLE ? (NumStores - 1 - i) : i; 16831 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); 16832 16833 SDValue Val = St->getValue(); 16834 Val = peekThroughBitcasts(Val); 16835 StoreInt <<= ElementSizeBits; 16836 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { 16837 StoreInt |= C->getAPIntValue() 16838 .zextOrTrunc(ElementSizeBits) 16839 .zextOrTrunc(SizeInBits); 16840 } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { 16841 StoreInt |= C->getValueAPF() 16842 .bitcastToAPInt() 16843 .zextOrTrunc(ElementSizeBits) 16844 .zextOrTrunc(SizeInBits); 16845 // If fp truncation is necessary give up for now. 16846 if (MemVT.getSizeInBits() != ElementSizeBits) 16847 return false; 16848 } else { 16849 llvm_unreachable("Invalid constant element type"); 16850 } 16851 } 16852 16853 // Create the new Load and Store operations. 16854 StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); 16855 } 16856 16857 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; 16858 SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); 16859 16860 // make sure we use trunc store if it's necessary to be legal. 
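  // For example (assuming a target where i48 promotes to i64), a merged i48
  // constant built from three i16 stores is emitted as an i64 constant and
  // written with a truncating i48 store.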
16861 SDValue NewStore; 16862 if (!UseTrunc) { 16863 NewStore = 16864 DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), 16865 FirstInChain->getPointerInfo(), FirstInChain->getAlign()); 16866 } else { // Must be realized as a trunc store 16867 EVT LegalizedStoredValTy = 16868 TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); 16869 unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits(); 16870 ConstantSDNode *C = cast<ConstantSDNode>(StoredVal); 16871 SDValue ExtendedStoreVal = 16872 DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL, 16873 LegalizedStoredValTy); 16874 NewStore = DAG.getTruncStore( 16875 NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), 16876 FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, 16877 FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags()); 16878 } 16879 16880 // Replace all merged stores with the new store. 16881 for (unsigned i = 0; i < NumStores; ++i) 16882 CombineTo(StoreNodes[i].MemNode, NewStore); 16883 16884 AddToWorklist(NewChain.getNode()); 16885 return true; 16886 } 16887 16888 void DAGCombiner::getStoreMergeCandidates( 16889 StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes, 16890 SDNode *&RootNode) { 16891 // This holds the base pointer, index, and the offset in bytes from the base 16892 // pointer. We must have a base and an offset. Do not handle stores to undef 16893 // base pointers. 16894 BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); 16895 if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef()) 16896 return; 16897 16898 SDValue Val = peekThroughBitcasts(St->getValue()); 16899 StoreSource StoreSrc = getStoreSource(Val); 16900 assert(StoreSrc != StoreSource::Unknown && "Expected known source for store"); 16901 16902 // Match on loadbaseptr if relevant. 16903 EVT MemVT = St->getMemoryVT(); 16904 BaseIndexOffset LBasePtr; 16905 EVT LoadVT; 16906 if (StoreSrc == StoreSource::Load) { 16907 auto *Ld = cast<LoadSDNode>(Val); 16908 LBasePtr = BaseIndexOffset::match(Ld, DAG); 16909 LoadVT = Ld->getMemoryVT(); 16910 // Load and store should be the same type. 16911 if (MemVT != LoadVT) 16912 return; 16913 // Loads must only have one use. 16914 if (!Ld->hasNUsesOfValue(1, 0)) 16915 return; 16916 // The memory operands must not be volatile/indexed/atomic. 16917 // TODO: May be able to relax for unordered atomics (see D66309) 16918 if (!Ld->isSimple() || Ld->isIndexed()) 16919 return; 16920 } 16921 auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, 16922 int64_t &Offset) -> bool { 16923 // The memory operands must not be volatile/indexed/atomic. 16924 // TODO: May be able to relax for unordered atomics (see D66309) 16925 if (!Other->isSimple() || Other->isIndexed()) 16926 return false; 16927 // Don't mix temporal stores with non-temporal stores. 16928 if (St->isNonTemporal() != Other->isNonTemporal()) 16929 return false; 16930 SDValue OtherBC = peekThroughBitcasts(Other->getValue()); 16931 // Allow merging constants of different types as integers. 16932 bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) 16933 : Other->getMemoryVT() != MemVT; 16934 switch (StoreSrc) { 16935 case StoreSource::Load: { 16936 if (NoTypeMatch) 16937 return false; 16938 // The Load's Base Ptr must also match. 
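      // (i.e., the other store's value must also be a load from the same base
      // pointer, possibly at a different constant offset.)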
      auto *OtherLd = dyn_cast<LoadSDNode>(OtherBC);
      if (!OtherLd)
        return false;
      BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
      if (LoadVT != OtherLd->getMemoryVT())
        return false;
      // Loads must only have one use.
      if (!OtherLd->hasNUsesOfValue(1, 0))
        return false;
      // The memory operands must not be volatile/indexed/atomic.
      // TODO: May be able to relax for unordered atomics (see D66309)
      if (!OtherLd->isSimple() || OtherLd->isIndexed())
        return false;
      // Don't mix temporal loads with non-temporal loads.
      if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
        return false;
      if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
        return false;
      break;
    }
    case StoreSource::Constant:
      if (NoTypeMatch)
        return false;
      if (!isIntOrFPConstant(OtherBC))
        return false;
      break;
    case StoreSource::Extract:
      // Do not merge truncated stores here.
      if (Other->isTruncatingStore())
        return false;
      if (!MemVT.bitsEq(OtherBC.getValueType()))
        return false;
      if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
          OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
        return false;
      break;
    default:
      llvm_unreachable("Unhandled store source for merging");
    }
    Ptr = BaseIndexOffset::match(Other, DAG);
    return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
  };

  // Check whether this pair of StoreNode and RootNode has already bailed out
  // of the dependence check more times than the limit allows.
  auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
                                        SDNode *RootNode) -> bool {
    auto RootCount = StoreRootCountMap.find(StoreNode);
    return RootCount != StoreRootCountMap.end() &&
           RootCount->second.first == RootNode &&
           RootCount->second.second > StoreMergeDependenceLimit;
  };

  auto TryToAddCandidate = [&](SDNode::use_iterator UseIter) {
    // This must be a chain use.
    if (UseIter.getOperandNo() != 0)
      return;
    if (auto *OtherStore = dyn_cast<StoreSDNode>(*UseIter)) {
      BaseIndexOffset Ptr;
      int64_t PtrDiff;
      if (CandidateMatch(OtherStore, Ptr, PtrDiff) &&
          !OverLimitInDependenceCheck(OtherStore, RootNode))
        StoreNodes.push_back(MemOpLink(OtherStore, PtrDiff));
    }
  };

  // We are looking for a root node which is an ancestor to all mergeable
  // stores. We search up through a load, to our root and then down
  // through all children. For instance we will find Store{1,2,3} if
  // St is Store1, Store2, or Store3 where the root is not a load,
  // which is always true for non-volatile ops. TODO: Expand
  // the search to find all valid candidates through multiple layers of loads.
  //
  // Root
  // |-------|-------|
  // Load    Load    Store3
  // |       |
  // Store1  Store2
  //
  // FIXME: We should be able to climb and
  // descend TokenFactors to find candidates as well.

  RootNode = St->getChain().getNode();

  unsigned NumNodesExplored = 0;
  const unsigned MaxSearchNodes = 1024;
  if (auto *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
    RootNode = Ldn->getChain().getNode();
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) {
      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) { // walk down chain
        for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
          TryToAddCandidate(I2);
      }
    }
  } else {
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored)
      TryToAddCandidate(I);
  }
}

// We need to check that merging these stores does not cause a loop in
// the DAG. Any store candidate may depend on another candidate
// indirectly through its operand (we already consider dependencies
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
    SDNode *RootNode) {
  // FIXME: We should be able to truncate a full search of
  // predecessors by doing a BFS and keeping tabs on the originating
  // stores from which worklist nodes come, in a similar way to
  // TokenFactor simplification.

  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 8> Worklist;

  // RootNode is a predecessor to all candidates so we need not search
  // past it. Add RootNode (peeking through TokenFactors). Do not count
  // these towards size check.

  Worklist.push_back(RootNode);
  while (!Worklist.empty()) {
    auto N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue; // Already present in Visited.
    if (N->getOpcode() == ISD::TokenFactor) {
      for (SDValue Op : N->ops())
        Worklist.push_back(Op.getNode());
    }
  }

  // Don't count pruning nodes towards max.
  unsigned int Max = 1024 + Visited.size();
  // Search Ops of store candidates.
  for (unsigned i = 0; i < NumStores; ++i) {
    SDNode *N = StoreNodes[i].MemNode;
    // Of the 4 Store Operands:
    // * Chain (Op 0) -> We have already considered these
    //                   in candidate selection and can be
    //                   safely ignored
    // * Value (Op 1) -> Cycles may happen (e.g. through load chains)
    // * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
    //                     but aren't necessarily from the same base node, so
    //                     cycles possible (e.g. via indexed store).
    // * (Op 3) -> Represents the pre or post-indexing offset (or undef for
    //             non-indexed stores). Not constant on all targets (e.g. ARM)
    //             and so can participate in a cycle.
    for (unsigned j = 1; j < N->getNumOperands(); ++j)
      Worklist.push_back(N->getOperand(j).getNode());
  }
  // Search through DAG. We can stop early if we find a store node.
  for (unsigned i = 0; i < NumStores; ++i)
    if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
                                     Max)) {
      // If the search bails out, record the StoreNode and RootNode in the
      // StoreRootCountMap. If we have seen this pair more times than the
      // limit allows, we won't add the StoreNode to the StoreNodes set again.
17098 if (Visited.size() >= Max) { 17099 auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode]; 17100 if (RootCount.first == RootNode) 17101 RootCount.second++; 17102 else 17103 RootCount = {RootNode, 1}; 17104 } 17105 return false; 17106 } 17107 return true; 17108 } 17109 17110 unsigned 17111 DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes, 17112 int64_t ElementSizeBytes) const { 17113 while (true) { 17114 // Find a store past the width of the first store. 17115 size_t StartIdx = 0; 17116 while ((StartIdx + 1 < StoreNodes.size()) && 17117 StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes != 17118 StoreNodes[StartIdx + 1].OffsetFromBase) 17119 ++StartIdx; 17120 17121 // Bail if we don't have enough candidates to merge. 17122 if (StartIdx + 1 >= StoreNodes.size()) 17123 return 0; 17124 17125 // Trim stores that overlapped with the first store. 17126 if (StartIdx) 17127 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx); 17128 17129 // Scan the memory operations on the chain and find the first 17130 // non-consecutive store memory address. 17131 unsigned NumConsecutiveStores = 1; 17132 int64_t StartAddress = StoreNodes[0].OffsetFromBase; 17133 // Check that the addresses are consecutive starting from the second 17134 // element in the list of stores. 17135 for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) { 17136 int64_t CurrAddress = StoreNodes[i].OffsetFromBase; 17137 if (CurrAddress - StartAddress != (ElementSizeBytes * i)) 17138 break; 17139 NumConsecutiveStores = i + 1; 17140 } 17141 if (NumConsecutiveStores > 1) 17142 return NumConsecutiveStores; 17143 17144 // There are no consecutive stores at the start of the list. 17145 // Remove the first store and try again. 17146 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); 17147 } 17148 } 17149 17150 bool DAGCombiner::tryStoreMergeOfConstants( 17151 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores, 17152 EVT MemVT, SDNode *RootNode, bool AllowVectors) { 17153 LLVMContext &Context = *DAG.getContext(); 17154 const DataLayout &DL = DAG.getDataLayout(); 17155 int64_t ElementSizeBytes = MemVT.getStoreSize(); 17156 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; 17157 bool MadeChange = false; 17158 17159 // Store the constants into memory as one consecutive store. 17160 while (NumConsecutiveStores >= 2) { 17161 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; 17162 unsigned FirstStoreAS = FirstInChain->getAddressSpace(); 17163 unsigned FirstStoreAlign = FirstInChain->getAlignment(); 17164 unsigned LastLegalType = 1; 17165 unsigned LastLegalVectorType = 1; 17166 bool LastIntegerTrunc = false; 17167 bool NonZero = false; 17168 unsigned FirstZeroAfterNonZero = NumConsecutiveStores; 17169 for (unsigned i = 0; i < NumConsecutiveStores; ++i) { 17170 StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); 17171 SDValue StoredVal = ST->getValue(); 17172 bool IsElementZero = false; 17173 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) 17174 IsElementZero = C->isNullValue(); 17175 else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) 17176 IsElementZero = C->getConstantFPValue()->isNullValue(); 17177 if (IsElementZero) { 17178 if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores) 17179 FirstZeroAfterNonZero = i; 17180 } 17181 NonZero |= !IsElementZero; 17182 17183 // Find a legal type for the constant store. 
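      // For example, two consecutive i16 constant stores are first tried as a
      // single i32 store, four of them as an i64 store, and so on.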
17184 unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; 17185 EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); 17186 bool IsFast = false; 17187 17188 // Break early when size is too large to be legal. 17189 if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) 17190 break; 17191 17192 if (TLI.isTypeLegal(StoreTy) && 17193 TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && 17194 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17195 *FirstInChain->getMemOperand(), &IsFast) && 17196 IsFast) { 17197 LastIntegerTrunc = false; 17198 LastLegalType = i + 1; 17199 // Or check whether a truncstore is legal. 17200 } else if (TLI.getTypeAction(Context, StoreTy) == 17201 TargetLowering::TypePromoteInteger) { 17202 EVT LegalizedStoredValTy = 17203 TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); 17204 if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && 17205 TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && 17206 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17207 *FirstInChain->getMemOperand(), &IsFast) && 17208 IsFast) { 17209 LastIntegerTrunc = true; 17210 LastLegalType = i + 1; 17211 } 17212 } 17213 17214 // We only use vectors if the constant is known to be zero or the 17215 // target allows it and the function is not marked with the 17216 // noimplicitfloat attribute. 17217 if ((!NonZero || 17218 TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && 17219 AllowVectors) { 17220 // Find a legal type for the vector store. 17221 unsigned Elts = (i + 1) * NumMemElts; 17222 EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); 17223 if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) && 17224 TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && 17225 TLI.allowsMemoryAccess(Context, DL, Ty, 17226 *FirstInChain->getMemOperand(), &IsFast) && 17227 IsFast) 17228 LastLegalVectorType = i + 1; 17229 } 17230 } 17231 17232 bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors; 17233 unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; 17234 17235 // Check if we found a legal integer type that creates a meaningful 17236 // merge. 17237 if (NumElem < 2) { 17238 // We know that candidate stores are in order and of correct 17239 // shape. While there is no mergeable sequence from the 17240 // beginning one may start later in the sequence. The only 17241 // reason a merge of size N could have failed where another of 17242 // the same size would not have, is if the alignment has 17243 // improved or we've dropped a non-zero value. Drop as many 17244 // candidates as we can here. 17245 unsigned NumSkip = 1; 17246 while ((NumSkip < NumConsecutiveStores) && 17247 (NumSkip < FirstZeroAfterNonZero) && 17248 (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) 17249 NumSkip++; 17250 17251 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); 17252 NumConsecutiveStores -= NumSkip; 17253 continue; 17254 } 17255 17256 // Check that we can merge these candidates without causing a cycle. 17257 if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, 17258 RootNode)) { 17259 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); 17260 NumConsecutiveStores -= NumElem; 17261 continue; 17262 } 17263 17264 MadeChange |= mergeStoresOfConstantsOrVecElts( 17265 StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); 17266 17267 // Remove merged stores for next iteration. 
17268 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); 17269 NumConsecutiveStores -= NumElem; 17270 } 17271 return MadeChange; 17272 } 17273 17274 bool DAGCombiner::tryStoreMergeOfExtracts( 17275 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores, 17276 EVT MemVT, SDNode *RootNode) { 17277 LLVMContext &Context = *DAG.getContext(); 17278 const DataLayout &DL = DAG.getDataLayout(); 17279 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; 17280 bool MadeChange = false; 17281 17282 // Loop on Consecutive Stores on success. 17283 while (NumConsecutiveStores >= 2) { 17284 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; 17285 unsigned FirstStoreAS = FirstInChain->getAddressSpace(); 17286 unsigned FirstStoreAlign = FirstInChain->getAlignment(); 17287 unsigned NumStoresToMerge = 1; 17288 for (unsigned i = 0; i < NumConsecutiveStores; ++i) { 17289 // Find a legal type for the vector store. 17290 unsigned Elts = (i + 1) * NumMemElts; 17291 EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); 17292 bool IsFast = false; 17293 17294 // Break early when size is too large to be legal. 17295 if (Ty.getSizeInBits() > MaximumLegalStoreInBits) 17296 break; 17297 17298 if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && 17299 TLI.allowsMemoryAccess(Context, DL, Ty, 17300 *FirstInChain->getMemOperand(), &IsFast) && 17301 IsFast) 17302 NumStoresToMerge = i + 1; 17303 } 17304 17305 // Check if we found a legal integer type creating a meaningful 17306 // merge. 17307 if (NumStoresToMerge < 2) { 17308 // We know that candidate stores are in order and of correct 17309 // shape. While there is no mergeable sequence from the 17310 // beginning one may start later in the sequence. The only 17311 // reason a merge of size N could have failed where another of 17312 // the same size would not have, is if the alignment has 17313 // improved. Drop as many candidates as we can here. 17314 unsigned NumSkip = 1; 17315 while ((NumSkip < NumConsecutiveStores) && 17316 (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) 17317 NumSkip++; 17318 17319 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); 17320 NumConsecutiveStores -= NumSkip; 17321 continue; 17322 } 17323 17324 // Check that we can merge these candidates without causing a cycle. 17325 if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge, 17326 RootNode)) { 17327 StoreNodes.erase(StoreNodes.begin(), 17328 StoreNodes.begin() + NumStoresToMerge); 17329 NumConsecutiveStores -= NumStoresToMerge; 17330 continue; 17331 } 17332 17333 MadeChange |= mergeStoresOfConstantsOrVecElts( 17334 StoreNodes, MemVT, NumStoresToMerge, false, true, false); 17335 17336 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge); 17337 NumConsecutiveStores -= NumStoresToMerge; 17338 } 17339 return MadeChange; 17340 } 17341 17342 bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, 17343 unsigned NumConsecutiveStores, EVT MemVT, 17344 SDNode *RootNode, bool AllowVectors, 17345 bool IsNonTemporalStore, 17346 bool IsNonTemporalLoad) { 17347 LLVMContext &Context = *DAG.getContext(); 17348 const DataLayout &DL = DAG.getDataLayout(); 17349 int64_t ElementSizeBytes = MemVT.getStoreSize(); 17350 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; 17351 bool MadeChange = false; 17352 17353 // Look for load nodes which are used by the stored values. 
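  // For example, a run of "store (load p+k), q+k" for k = 0..3 may be turned
  // into one wide load from p feeding one wide store to q.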
17354 SmallVector<MemOpLink, 8> LoadNodes; 17355 17356 // Find acceptable loads. Loads need to have the same chain (token factor), 17357 // must not be zext, volatile, indexed, and they must be consecutive. 17358 BaseIndexOffset LdBasePtr; 17359 17360 for (unsigned i = 0; i < NumConsecutiveStores; ++i) { 17361 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); 17362 SDValue Val = peekThroughBitcasts(St->getValue()); 17363 LoadSDNode *Ld = cast<LoadSDNode>(Val); 17364 17365 BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG); 17366 // If this is not the first ptr that we check. 17367 int64_t LdOffset = 0; 17368 if (LdBasePtr.getBase().getNode()) { 17369 // The base ptr must be the same. 17370 if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) 17371 break; 17372 } else { 17373 // Check that all other base pointers are the same as this one. 17374 LdBasePtr = LdPtr; 17375 } 17376 17377 // We found a potential memory operand to merge. 17378 LoadNodes.push_back(MemOpLink(Ld, LdOffset)); 17379 } 17380 17381 while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) { 17382 Align RequiredAlignment; 17383 bool NeedRotate = false; 17384 if (LoadNodes.size() == 2) { 17385 // If we have load/store pair instructions and we only have two values, 17386 // don't bother merging. 17387 if (TLI.hasPairedLoad(MemVT, RequiredAlignment) && 17388 StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) { 17389 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); 17390 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2); 17391 break; 17392 } 17393 // If the loads are reversed, see if we can rotate the halves into place. 17394 int64_t Offset0 = LoadNodes[0].OffsetFromBase; 17395 int64_t Offset1 = LoadNodes[1].OffsetFromBase; 17396 EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2); 17397 if (Offset0 - Offset1 == ElementSizeBytes && 17398 (hasOperation(ISD::ROTL, PairVT) || 17399 hasOperation(ISD::ROTR, PairVT))) { 17400 std::swap(LoadNodes[0], LoadNodes[1]); 17401 NeedRotate = true; 17402 } 17403 } 17404 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; 17405 unsigned FirstStoreAS = FirstInChain->getAddressSpace(); 17406 Align FirstStoreAlign = FirstInChain->getAlign(); 17407 LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); 17408 17409 // Scan the memory operations on the chain and find the first 17410 // non-consecutive load memory address. These variables hold the index in 17411 // the store node array. 17412 17413 unsigned LastConsecutiveLoad = 1; 17414 17415 // This variable refers to the size and not index in the array. 17416 unsigned LastLegalVectorType = 1; 17417 unsigned LastLegalIntegerType = 1; 17418 bool isDereferenceable = true; 17419 bool DoIntegerTruncate = false; 17420 int64_t StartAddress = LoadNodes[0].OffsetFromBase; 17421 SDValue LoadChain = FirstLoad->getChain(); 17422 for (unsigned i = 1; i < LoadNodes.size(); ++i) { 17423 // All loads must share the same chain. 17424 if (LoadNodes[i].MemNode->getChain() != LoadChain) 17425 break; 17426 17427 int64_t CurrAddress = LoadNodes[i].OffsetFromBase; 17428 if (CurrAddress - StartAddress != (ElementSizeBytes * i)) 17429 break; 17430 LastConsecutiveLoad = i; 17431 17432 if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) 17433 isDereferenceable = false; 17434 17435 // Find a legal type for the vector store. 
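      // For example, two consecutive v2i32 stores are tried as a single v4i32
      // store if that type is legal and fast on the target.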
17436 unsigned Elts = (i + 1) * NumMemElts; 17437 EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); 17438 17439 // Break early when size is too large to be legal. 17440 if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) 17441 break; 17442 17443 bool IsFastSt = false; 17444 bool IsFastLd = false; 17445 if (TLI.isTypeLegal(StoreTy) && 17446 TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && 17447 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17448 *FirstInChain->getMemOperand(), &IsFastSt) && 17449 IsFastSt && 17450 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17451 *FirstLoad->getMemOperand(), &IsFastLd) && 17452 IsFastLd) { 17453 LastLegalVectorType = i + 1; 17454 } 17455 17456 // Find a legal type for the integer store. 17457 unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; 17458 StoreTy = EVT::getIntegerVT(Context, SizeInBits); 17459 if (TLI.isTypeLegal(StoreTy) && 17460 TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && 17461 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17462 *FirstInChain->getMemOperand(), &IsFastSt) && 17463 IsFastSt && 17464 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17465 *FirstLoad->getMemOperand(), &IsFastLd) && 17466 IsFastLd) { 17467 LastLegalIntegerType = i + 1; 17468 DoIntegerTruncate = false; 17469 // Or check whether a truncstore and extload is legal. 17470 } else if (TLI.getTypeAction(Context, StoreTy) == 17471 TargetLowering::TypePromoteInteger) { 17472 EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy); 17473 if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && 17474 TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && 17475 TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) && 17476 TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) && 17477 TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) && 17478 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17479 *FirstInChain->getMemOperand(), &IsFastSt) && 17480 IsFastSt && 17481 TLI.allowsMemoryAccess(Context, DL, StoreTy, 17482 *FirstLoad->getMemOperand(), &IsFastLd) && 17483 IsFastLd) { 17484 LastLegalIntegerType = i + 1; 17485 DoIntegerTruncate = true; 17486 } 17487 } 17488 } 17489 17490 // Only use vector types if the vector type is larger than the integer 17491 // type. If they are the same, use integers. 17492 bool UseVectorTy = 17493 LastLegalVectorType > LastLegalIntegerType && AllowVectors; 17494 unsigned LastLegalType = 17495 std::max(LastLegalVectorType, LastLegalIntegerType); 17496 17497 // We add +1 here because the LastXXX variables refer to location while 17498 // the NumElem refers to array/index size. 17499 unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); 17500 NumElem = std::min(LastLegalType, NumElem); 17501 Align FirstLoadAlign = FirstLoad->getAlign(); 17502 17503 if (NumElem < 2) { 17504 // We know that candidate stores are in order and of correct 17505 // shape. While there is no mergeable sequence from the 17506 // beginning one may start later in the sequence. The only 17507 // reason a merge of size N could have failed where another of 17508 // the same size would not have is if the alignment or either 17509 // the load or store has improved. Drop as many candidates as we 17510 // can here. 
17511 unsigned NumSkip = 1; 17512 while ((NumSkip < LoadNodes.size()) && 17513 (LoadNodes[NumSkip].MemNode->getAlign() <= FirstLoadAlign) && 17514 (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign)) 17515 NumSkip++; 17516 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); 17517 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip); 17518 NumConsecutiveStores -= NumSkip; 17519 continue; 17520 } 17521 17522 // Check that we can merge these candidates without causing a cycle. 17523 if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, 17524 RootNode)) { 17525 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); 17526 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); 17527 NumConsecutiveStores -= NumElem; 17528 continue; 17529 } 17530 17531 // Find if it is better to use vectors or integers to load and store 17532 // to memory. 17533 EVT JointMemOpVT; 17534 if (UseVectorTy) { 17535 // Find a legal type for the vector store. 17536 unsigned Elts = NumElem * NumMemElts; 17537 JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); 17538 } else { 17539 unsigned SizeInBits = NumElem * ElementSizeBytes * 8; 17540 JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); 17541 } 17542 17543 SDLoc LoadDL(LoadNodes[0].MemNode); 17544 SDLoc StoreDL(StoreNodes[0].MemNode); 17545 17546 // The merged loads are required to have the same incoming chain, so 17547 // using the first's chain is acceptable. 17548 17549 SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); 17550 AddToWorklist(NewStoreChain.getNode()); 17551 17552 MachineMemOperand::Flags LdMMOFlags = 17553 isDereferenceable ? MachineMemOperand::MODereferenceable 17554 : MachineMemOperand::MONone; 17555 if (IsNonTemporalLoad) 17556 LdMMOFlags |= MachineMemOperand::MONonTemporal; 17557 17558 MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore 17559 ? MachineMemOperand::MONonTemporal 17560 : MachineMemOperand::MONone; 17561 17562 SDValue NewLoad, NewStore; 17563 if (UseVectorTy || !DoIntegerTruncate) { 17564 NewLoad = DAG.getLoad( 17565 JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), 17566 FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags); 17567 SDValue StoreOp = NewLoad; 17568 if (NeedRotate) { 17569 unsigned LoadWidth = ElementSizeBytes * 8 * 2; 17570 assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) && 17571 "Unexpected type for rotate-able load pair"); 17572 SDValue RotAmt = 17573 DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL); 17574 // Target can convert to the identical ROTR if it does not have ROTL. 
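        // For example, two reversed i32 halves loaded as a single i64 are
        // rotated by 32 bits so each half ends up in its original position.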
17575 StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt); 17576 } 17577 NewStore = DAG.getStore( 17578 NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(), 17579 FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); 17580 } else { // This must be the truncstore/extload case 17581 EVT ExtendedTy = 17582 TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); 17583 NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, 17584 FirstLoad->getChain(), FirstLoad->getBasePtr(), 17585 FirstLoad->getPointerInfo(), JointMemOpVT, 17586 FirstLoadAlign, LdMMOFlags); 17587 NewStore = DAG.getTruncStore( 17588 NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), 17589 FirstInChain->getPointerInfo(), JointMemOpVT, 17590 FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags()); 17591 } 17592 17593 // Transfer chain users from old loads to the new load. 17594 for (unsigned i = 0; i < NumElem; ++i) { 17595 LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); 17596 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), 17597 SDValue(NewLoad.getNode(), 1)); 17598 } 17599 17600 // Replace all stores with the new store. Recursively remove corresponding 17601 // values if they are no longer used. 17602 for (unsigned i = 0; i < NumElem; ++i) { 17603 SDValue Val = StoreNodes[i].MemNode->getOperand(1); 17604 CombineTo(StoreNodes[i].MemNode, NewStore); 17605 if (Val.getNode()->use_empty()) 17606 recursivelyDeleteUnusedNodes(Val.getNode()); 17607 } 17608 17609 MadeChange = true; 17610 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); 17611 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); 17612 NumConsecutiveStores -= NumElem; 17613 } 17614 return MadeChange; 17615 } 17616 17617 bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) { 17618 if (OptLevel == CodeGenOpt::None || !EnableStoreMerging) 17619 return false; 17620 17621 // TODO: Extend this function to merge stores of scalable vectors. 17622 // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8> 17623 // store since we know <vscale x 16 x i8> is exactly twice as large as 17624 // <vscale x 8 x i8>). Until then, bail out for scalable vectors. 17625 EVT MemVT = St->getMemoryVT(); 17626 if (MemVT.isScalableVector()) 17627 return false; 17628 if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) 17629 return false; 17630 17631 // This function cannot currently deal with non-byte-sized memory sizes. 17632 int64_t ElementSizeBytes = MemVT.getStoreSize(); 17633 if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits()) 17634 return false; 17635 17636 // Do not bother looking at stored values that are not constants, loads, or 17637 // extracted vector elements. 17638 SDValue StoredVal = peekThroughBitcasts(St->getValue()); 17639 const StoreSource StoreSrc = getStoreSource(StoredVal); 17640 if (StoreSrc == StoreSource::Unknown) 17641 return false; 17642 17643 SmallVector<MemOpLink, 8> StoreNodes; 17644 SDNode *RootNode; 17645 // Find potential store merge candidates by searching through chain sub-DAG 17646 getStoreMergeCandidates(St, StoreNodes, RootNode); 17647 17648 // Check if there is anything to merge. 17649 if (StoreNodes.size() < 2) 17650 return false; 17651 17652 // Sort the memory operands according to their distance from the 17653 // base pointer. 
17654 llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) { 17655 return LHS.OffsetFromBase < RHS.OffsetFromBase; 17656 }); 17657 17658 bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute( 17659 Attribute::NoImplicitFloat); 17660 bool IsNonTemporalStore = St->isNonTemporal(); 17661 bool IsNonTemporalLoad = StoreSrc == StoreSource::Load && 17662 cast<LoadSDNode>(StoredVal)->isNonTemporal(); 17663 17664 // Store Merge attempts to merge the lowest stores. This generally 17665 // works out as if successful, as the remaining stores are checked 17666 // after the first collection of stores is merged. However, in the 17667 // case that a non-mergeable store is found first, e.g., {p[-2], 17668 // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent 17669 // mergeable cases. To prevent this, we prune such stores from the 17670 // front of StoreNodes here. 17671 bool MadeChange = false; 17672 while (StoreNodes.size() > 1) { 17673 unsigned NumConsecutiveStores = 17674 getConsecutiveStores(StoreNodes, ElementSizeBytes); 17675 // There are no more stores in the list to examine. 17676 if (NumConsecutiveStores == 0) 17677 return MadeChange; 17678 17679 // We have at least 2 consecutive stores. Try to merge them. 17680 assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores"); 17681 switch (StoreSrc) { 17682 case StoreSource::Constant: 17683 MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores, 17684 MemVT, RootNode, AllowVectors); 17685 break; 17686 17687 case StoreSource::Extract: 17688 MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores, 17689 MemVT, RootNode); 17690 break; 17691 17692 case StoreSource::Load: 17693 MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores, 17694 MemVT, RootNode, AllowVectors, 17695 IsNonTemporalStore, IsNonTemporalLoad); 17696 break; 17697 17698 default: 17699 llvm_unreachable("Unhandled store source type"); 17700 } 17701 } 17702 return MadeChange; 17703 } 17704 17705 SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { 17706 SDLoc SL(ST); 17707 SDValue ReplStore; 17708 17709 // Replace the chain to avoid dependency. 17710 if (ST->isTruncatingStore()) { 17711 ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(), 17712 ST->getBasePtr(), ST->getMemoryVT(), 17713 ST->getMemOperand()); 17714 } else { 17715 ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), 17716 ST->getMemOperand()); 17717 } 17718 17719 // Create token to keep both nodes around. 17720 SDValue Token = DAG.getNode(ISD::TokenFactor, SL, 17721 MVT::Other, ST->getChain(), ReplStore); 17722 17723 // Make sure the new and old chains are cleaned up. 17724 AddToWorklist(Token.getNode()); 17725 17726 // Don't add users to work list. 17727 return CombineTo(ST, Token, false); 17728 } 17729 17730 SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { 17731 SDValue Value = ST->getValue(); 17732 if (Value.getOpcode() == ISD::TargetConstantFP) 17733 return SDValue(); 17734 17735 if (!ISD::isNormalStore(ST)) 17736 return SDValue(); 17737 17738 SDLoc DL(ST); 17739 17740 SDValue Chain = ST->getChain(); 17741 SDValue Ptr = ST->getBasePtr(); 17742 17743 const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value); 17744 17745 // NOTE: If the original store is volatile, this transform must not increase 17746 // the number of stores. For example, on x86-32 an f64 can be stored in one 17747 // processor operation but an i64 (which is not legal) requires two. 
So the 17748 // transform should not be done in this case. 17749 17750 SDValue Tmp; 17751 switch (CFP->getSimpleValueType(0).SimpleTy) { 17752 default: 17753 llvm_unreachable("Unknown FP type"); 17754 case MVT::f16: // We don't do this for these yet. 17755 case MVT::f80: 17756 case MVT::f128: 17757 case MVT::ppcf128: 17758 return SDValue(); 17759 case MVT::f32: 17760 if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) || 17761 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { 17762 ; 17763 Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). 17764 bitcastToAPInt().getZExtValue(), SDLoc(CFP), 17765 MVT::i32); 17766 return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); 17767 } 17768 17769 return SDValue(); 17770 case MVT::f64: 17771 if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && 17772 ST->isSimple()) || 17773 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { 17774 ; 17775 Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). 17776 getZExtValue(), SDLoc(CFP), MVT::i64); 17777 return DAG.getStore(Chain, DL, Tmp, 17778 Ptr, ST->getMemOperand()); 17779 } 17780 17781 if (ST->isSimple() && 17782 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { 17783 // Many FP stores are not made apparent until after legalize, e.g. for 17784 // argument passing. Since this is so common, custom legalize the 17785 // 64-bit integer store into two 32-bit stores. 17786 uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); 17787 SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); 17788 SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); 17789 if (DAG.getDataLayout().isBigEndian()) 17790 std::swap(Lo, Hi); 17791 17792 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); 17793 AAMDNodes AAInfo = ST->getAAInfo(); 17794 17795 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), 17796 ST->getOriginalAlign(), MMOFlags, AAInfo); 17797 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(4), DL); 17798 SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, 17799 ST->getPointerInfo().getWithOffset(4), 17800 ST->getOriginalAlign(), MMOFlags, AAInfo); 17801 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 17802 St0, St1); 17803 } 17804 17805 return SDValue(); 17806 } 17807 } 17808 17809 SDValue DAGCombiner::visitSTORE(SDNode *N) { 17810 StoreSDNode *ST = cast<StoreSDNode>(N); 17811 SDValue Chain = ST->getChain(); 17812 SDValue Value = ST->getValue(); 17813 SDValue Ptr = ST->getBasePtr(); 17814 17815 // If this is a store of a bit convert, store the input value if the 17816 // resultant store does not need a higher alignment than the original. 17817 if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() && 17818 ST->isUnindexed()) { 17819 EVT SVT = Value.getOperand(0).getValueType(); 17820 // If the store is volatile, we only want to change the store type if the 17821 // resulting store is legal. Otherwise we might increase the number of 17822 // memory accesses. We don't care if the original type was legal or not 17823 // as we assume software couldn't rely on the number of accesses of an 17824 // illegal type. 
17825 // TODO: May be able to relax for unordered atomics (see D66309) 17826 if (((!LegalOperations && ST->isSimple()) || 17827 TLI.isOperationLegal(ISD::STORE, SVT)) && 17828 TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT, 17829 DAG, *ST->getMemOperand())) { 17830 return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, 17831 ST->getMemOperand()); 17832 } 17833 } 17834 17835 // Turn 'store undef, Ptr' -> nothing. 17836 if (Value.isUndef() && ST->isUnindexed()) 17837 return Chain; 17838 17839 // Try to infer better alignment information than the store already has. 17840 if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) { 17841 if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) { 17842 if (*Alignment > ST->getAlign() && 17843 isAligned(*Alignment, ST->getSrcValueOffset())) { 17844 SDValue NewStore = 17845 DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(), 17846 ST->getMemoryVT(), *Alignment, 17847 ST->getMemOperand()->getFlags(), ST->getAAInfo()); 17848 // NewStore will always be N as we are only refining the alignment 17849 assert(NewStore.getNode() == N); 17850 (void)NewStore; 17851 } 17852 } 17853 } 17854 17855 // Try transforming a pair floating point load / store ops to integer 17856 // load / store ops. 17857 if (SDValue NewST = TransformFPLoadStorePair(N)) 17858 return NewST; 17859 17860 // Try transforming several stores into STORE (BSWAP). 17861 if (SDValue Store = mergeTruncStores(ST)) 17862 return Store; 17863 17864 if (ST->isUnindexed()) { 17865 // Walk up chain skipping non-aliasing memory nodes, on this store and any 17866 // adjacent stores. 17867 if (findBetterNeighborChains(ST)) { 17868 // replaceStoreChain uses CombineTo, which handled all of the worklist 17869 // manipulation. Return the original node to not do anything else. 17870 return SDValue(ST, 0); 17871 } 17872 Chain = ST->getChain(); 17873 } 17874 17875 // FIXME: is there such a thing as a truncating indexed store? 17876 if (ST->isTruncatingStore() && ST->isUnindexed() && 17877 Value.getValueType().isInteger() && 17878 (!isa<ConstantSDNode>(Value) || 17879 !cast<ConstantSDNode>(Value)->isOpaque())) { 17880 APInt TruncDemandedBits = 17881 APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), 17882 ST->getMemoryVT().getScalarSizeInBits()); 17883 17884 // See if we can simplify the input to this truncstore with knowledge that 17885 // only the low bits are being used. For example: 17886 // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" 17887 AddToWorklist(Value.getNode()); 17888 if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits)) 17889 return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), 17890 ST->getMemOperand()); 17891 17892 // Otherwise, see if we can simplify the operation with 17893 // SimplifyDemandedBits, which only works if the value has a single use. 17894 if (SimplifyDemandedBits(Value, TruncDemandedBits)) { 17895 // Re-visit the store if anything changed and the store hasn't been merged 17896 // with another node (N is deleted) SimplifyDemandedBits will add Value's 17897 // node back to the worklist if necessary, but we also need to re-visit 17898 // the Store node itself. 17899 if (N->getOpcode() != ISD::DELETED_NODE) 17900 AddToWorklist(N); 17901 return SDValue(N, 0); 17902 } 17903 } 17904 17905 // If this is a load followed by a store to the same location, then the store 17906 // is dead/noop. 
17907 // TODO: Can relax for unordered atomics (see D66309)
17908 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
17909 if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
17910 ST->isUnindexed() && ST->isSimple() &&
17911 // There can't be any side effects between the load and store, such as
17912 // a call or store.
17913 Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
17914 // The store is dead, remove it.
17915 return Chain;
17916 }
17917 }
17918
17919 // TODO: Can relax for unordered atomics (see D66309)
17920 if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
17921 if (ST->isUnindexed() && ST->isSimple() &&
17922 ST1->isUnindexed() && ST1->isSimple()) {
17923 if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
17924 ST->getMemoryVT() == ST1->getMemoryVT()) {
17925 // If this is a store followed by a store with the same value to the
17926 // same location, then the store is dead/noop.
17927 return Chain;
17928 }
17929
17930 if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
17931 !ST1->getBasePtr().isUndef() &&
17932 // BaseIndexOffset and the code below require knowing the size
17933 // of a vector, so bail out if MemoryVT is scalable.
17934 !ST->getMemoryVT().isScalableVector() &&
17935 !ST1->getMemoryVT().isScalableVector()) {
17936 const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
17937 const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
17938 unsigned STBitSize = ST->getMemoryVT().getFixedSizeInBits();
17939 unsigned ChainBitSize = ST1->getMemoryVT().getFixedSizeInBits();
17940 // If the preceding store writes to a subset of this store's location
17941 // and no other node is chained to it, that preceding store is
17942 // redundant and can be dropped. Do not remove stores to undef as they
17943 // may be used as data sinks.
17944 if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
17945 CombineTo(ST1, ST1->getChain());
17946 return SDValue();
17947 }
17948 }
17949 }
17950 }
17951
17952 // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
17953 // truncating store. We can do this even if this is already a truncstore.
17954 if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
17955 && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
17956 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
17957 ST->getMemoryVT())) {
17958 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
17959 Ptr, ST->getMemoryVT(), ST->getMemOperand());
17960 }
17961
17962 // Always perform this optimization before types are legal. If the target
17963 // prefers, also try this after legalization to catch stores that were created
17964 // by intrinsics or other nodes.
17965 if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
17966 while (true) {
17967 // There can be multiple store sequences on the same chain.
17968 // Keep trying to merge store sequences until we are unable to do so
17969 // or until we merge the last store on the chain.
17970 bool Changed = mergeConsecutiveStores(ST);
17971 if (!Changed) break;
17972 // Return N as merge only uses CombineTo and no worklist clean
17973 // up is necessary.
17974 if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
17975 return SDValue(N, 0);
17976 }
17977 }
17978
17979 // Try transforming N to an indexed store.
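// (For example, on a target with post-increment addressing, a store whose
// pointer is separately incremented afterwards may be folded into a single
// post-indexed store that also produces the updated pointer.)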
17980 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
17981 return SDValue(N, 0);
17982
17983 // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
17984 //
17985 // Make sure to do this only after attempting to merge stores in order to
17986 // avoid changing the types of some subset of stores due to visit order,
17987 // preventing their merging.
17988 if (isa<ConstantFPSDNode>(ST->getValue())) {
17989 if (SDValue NewSt = replaceStoreOfFPConstant(ST))
17990 return NewSt;
17991 }
17992
17993 if (SDValue NewSt = splitMergedValStore(ST))
17994 return NewSt;
17995
17996 return ReduceLoadOpStoreWidth(N);
17997 }
17998
17999 SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
18000 const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
18001 if (!LifetimeEnd->hasOffset())
18002 return SDValue();
18003
18004 const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
18005 LifetimeEnd->getOffset(), false);
18006
18007 // We walk up the chains to find stores.
18008 SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
18009 while (!Chains.empty()) {
18010 SDValue Chain = Chains.pop_back_val();
18011 if (!Chain.hasOneUse())
18012 continue;
18013 switch (Chain.getOpcode()) {
18014 case ISD::TokenFactor:
18015 for (unsigned Nops = Chain.getNumOperands(); Nops;)
18016 Chains.push_back(Chain.getOperand(--Nops));
18017 break;
18018 case ISD::LIFETIME_START:
18019 case ISD::LIFETIME_END:
18020 // We can forward past any lifetime start/end that can be proven not to
18021 // alias the node.
18022 if (!isAlias(Chain.getNode(), N))
18023 Chains.push_back(Chain.getOperand(0));
18024 break;
18025 case ISD::STORE: {
18026 StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
18027 // TODO: Can relax for unordered atomics (see D66309)
18028 if (!ST->isSimple() || ST->isIndexed())
18029 continue;
18030 const TypeSize StoreSize = ST->getMemoryVT().getStoreSize();
18031 // The bounds of a scalable store are not known until runtime, so this
18032 // store cannot be elided.
18033 if (StoreSize.isScalable())
18034 continue;
18035 const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
18036 // If we store purely within object bounds just before its lifetime ends,
18037 // we can remove the store.
18038 if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
18039 StoreSize.getFixedSize() * 8)) {
18040 LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
18041 dbgs() << "\nwithin LIFETIME_END of : ";
18042 LifetimeEndBase.dump(); dbgs() << "\n");
18043 CombineTo(ST, ST->getChain());
18044 return SDValue(N, 0);
18045 }
18046 }
18047 }
18048 }
18049 return SDValue();
18050 }
18051
18052 /// For the store instruction sequence below, the F and I values
18053 /// are bundled together as an i64 value before being stored into memory.
18054 /// Sometimes it is more efficient to generate separate stores for F and I,
18055 /// which can remove the bitwise instructions or sink them to colder places.
18056 ///
18057 /// (store (or (zext (bitcast F to i32) to i64),
18058 /// (shl (zext I to i64), 32)), addr) -->
18059 /// (store F, addr) and (store I, addr+4)
18060 ///
18061 /// Similarly, splitting other merged stores can also be beneficial, like:
18062 /// For pair of {i32, i32}, i64 store --> two i32 stores.
18063 /// For pair of {i32, i16}, i64 store --> two i32 stores.
18064 /// For pair of {i16, i16}, i32 store --> two i16 stores.
18065 /// For pair of {i16, i8}, i32 store --> two i16 stores.
18066 /// For pair of {i8, i8}, i16 store --> two i8 stores.
18067 /// 18068 /// We allow each target to determine specifically which kind of splitting is 18069 /// supported. 18070 /// 18071 /// The store patterns are commonly seen from the simple code snippet below 18072 /// if only std::make_pair(...) is sroa transformed before inlined into hoo. 18073 /// void goo(const std::pair<int, float> &); 18074 /// hoo() { 18075 /// ... 18076 /// goo(std::make_pair(tmp, ftmp)); 18077 /// ... 18078 /// } 18079 /// 18080 SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { 18081 if (OptLevel == CodeGenOpt::None) 18082 return SDValue(); 18083 18084 // Can't change the number of memory accesses for a volatile store or break 18085 // atomicity for an atomic one. 18086 if (!ST->isSimple()) 18087 return SDValue(); 18088 18089 SDValue Val = ST->getValue(); 18090 SDLoc DL(ST); 18091 18092 // Match OR operand. 18093 if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR) 18094 return SDValue(); 18095 18096 // Match SHL operand and get Lower and Higher parts of Val. 18097 SDValue Op1 = Val.getOperand(0); 18098 SDValue Op2 = Val.getOperand(1); 18099 SDValue Lo, Hi; 18100 if (Op1.getOpcode() != ISD::SHL) { 18101 std::swap(Op1, Op2); 18102 if (Op1.getOpcode() != ISD::SHL) 18103 return SDValue(); 18104 } 18105 Lo = Op2; 18106 Hi = Op1.getOperand(0); 18107 if (!Op1.hasOneUse()) 18108 return SDValue(); 18109 18110 // Match shift amount to HalfValBitSize. 18111 unsigned HalfValBitSize = Val.getValueSizeInBits() / 2; 18112 ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1)); 18113 if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize) 18114 return SDValue(); 18115 18116 // Lo and Hi are zero-extended from int with size less equal than 32 18117 // to i64. 18118 if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() || 18119 !Lo.getOperand(0).getValueType().isScalarInteger() || 18120 Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize || 18121 Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() || 18122 !Hi.getOperand(0).getValueType().isScalarInteger() || 18123 Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize) 18124 return SDValue(); 18125 18126 // Use the EVT of low and high parts before bitcast as the input 18127 // of target query. 18128 EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST) 18129 ? Lo.getOperand(0).getValueType() 18130 : Lo.getValueType(); 18131 EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST) 18132 ? Hi.getOperand(0).getValueType() 18133 : Hi.getValueType(); 18134 if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy)) 18135 return SDValue(); 18136 18137 // Start to split store. 18138 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); 18139 AAMDNodes AAInfo = ST->getAAInfo(); 18140 18141 // Change the sizes of Lo and Hi's value types to HalfValBitSize. 18142 EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize); 18143 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0)); 18144 Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0)); 18145 18146 SDValue Chain = ST->getChain(); 18147 SDValue Ptr = ST->getBasePtr(); 18148 // Lower value store. 18149 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), 18150 ST->getOriginalAlign(), MMOFlags, AAInfo); 18151 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(HalfValBitSize / 8), DL); 18152 // Higher value store. 
18153 SDValue St1 = DAG.getStore(
18154 St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
18155 ST->getOriginalAlign(), MMOFlags, AAInfo);
18156 return St1;
18157 }
18158
18159 /// Convert a disguised subvector insertion into a shuffle:
18160 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
18161 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
18162 "Expected insert_vector_elt");
18163 SDValue InsertVal = N->getOperand(1);
18164 SDValue Vec = N->getOperand(0);
18165
18166 // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N),
18167 // InsIndex)
18168 // --> (vector_shuffle X, Y) and variations where shuffle operands may be
18169 // CONCAT_VECTORS.
18170 if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() &&
18171 InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
18172 isa<ConstantSDNode>(InsertVal.getOperand(1))) {
18173 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode());
18174 ArrayRef<int> Mask = SVN->getMask();
18175
18176 SDValue X = Vec.getOperand(0);
18177 SDValue Y = Vec.getOperand(1);
18178
18179 // Vec's operand 0 is using indices from 0 to N-1 and
18180 // operand 1 from N to 2N - 1, where N is the number of
18181 // elements in the vectors.
18182 SDValue InsertVal0 = InsertVal.getOperand(0);
18183 int ElementOffset = -1;
18184
18185 // We explore the inputs of the shuffle in order to see if we find the
18186 // source of the extract_vector_elt. If so, we can use it to modify the
18187 // shuffle rather than perform an insert_vector_elt.
18188 SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
18189 ArgWorkList.emplace_back(Mask.size(), Y);
18190 ArgWorkList.emplace_back(0, X);
18191
18192 while (!ArgWorkList.empty()) {
18193 int ArgOffset;
18194 SDValue ArgVal;
18195 std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();
18196
18197 if (ArgVal == InsertVal0) {
18198 ElementOffset = ArgOffset;
18199 break;
18200 }
18201
18202 // Peek through concat_vectors.
18203 if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
18204 int CurrentArgOffset =
18205 ArgOffset + ArgVal.getValueType().getVectorNumElements();
18206 int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
18207 for (SDValue Op : reverse(ArgVal->ops())) {
18208 CurrentArgOffset -= Step;
18209 ArgWorkList.emplace_back(CurrentArgOffset, Op);
18210 }
18211
18212 // Make sure we went through all the elements and did not screw up index
18213 // computation.
18214 assert(CurrentArgOffset == ArgOffset);
18215 }
18216 }
18217
18218 if (ElementOffset != -1) {
18219 SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());
18220
18221 auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
18222 NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
18223 assert(NewMask[InsIndex] <
18224 (int)(2 * Vec.getValueType().getVectorNumElements()) &&
18225 NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bounds");
18226
18227 SDValue LegalShuffle =
18228 TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X,
18229 Y, NewMask, DAG);
18230 if (LegalShuffle)
18231 return LegalShuffle;
18232 }
18233 }
18234
18235 // insert_vector_elt V, (bitcast X from vector type), IdxC -->
18236 // bitcast(shuffle (bitcast V), (extended X), Mask)
18237 // Note: We do not use an insert_subvector node because that requires a
18238 // legal subvector type.
18239 if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
18240 !InsertVal.getOperand(0).getValueType().isVector())
18241 return SDValue();
18242
18243 SDValue SubVec = InsertVal.getOperand(0);
18244 SDValue DestVec = N->getOperand(0);
18245 EVT SubVecVT = SubVec.getValueType();
18246 EVT VT = DestVec.getValueType();
18247 unsigned NumSrcElts = SubVecVT.getVectorNumElements();
18248 // If the source has only a single vector element, the cost of creating and
18249 // adding it to a vector is likely to exceed the cost of an insert_vector_elt.
18250 if (NumSrcElts == 1)
18251 return SDValue();
18252 unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
18253 unsigned NumMaskVals = ExtendRatio * NumSrcElts;
18254
18255 // Step 1: Create a shuffle mask that implements this insert operation. The
18256 // vector that we are inserting into will be operand 0 of the shuffle, so
18257 // those elements are just 'i'. The inserted subvector is in the first
18258 // positions of operand 1 of the shuffle. Example:
18259 // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
18260 SmallVector<int, 16> Mask(NumMaskVals);
18261 for (unsigned i = 0; i != NumMaskVals; ++i) {
18262 if (i / NumSrcElts == InsIndex)
18263 Mask[i] = (i % NumSrcElts) + NumMaskVals;
18264 else
18265 Mask[i] = i;
18266 }
18267
18268 // Bail out if the target cannot handle the shuffle we want to create.
18269 EVT SubVecEltVT = SubVecVT.getVectorElementType();
18270 EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
18271 if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
18272 return SDValue();
18273
18274 // Step 2: Create a wide vector from the inserted source vector by appending
18275 // undefined elements. This is the same size as our destination vector.
18276 SDLoc DL(N);
18277 SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
18278 ConcatOps[0] = SubVec;
18279 SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
18280
18281 // Step 3: Shuffle in the padded subvector.
18282 SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
18283 SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
18284 AddToWorklist(PaddedSubV.getNode());
18285 AddToWorklist(DestVecBC.getNode());
18286 AddToWorklist(Shuf.getNode());
18287 return DAG.getBitcast(VT, Shuf);
18288 }
18289
18290 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
18291 SDValue InVec = N->getOperand(0);
18292 SDValue InVal = N->getOperand(1);
18293 SDValue EltNo = N->getOperand(2);
18294 SDLoc DL(N);
18295
18296 EVT VT = InVec.getValueType();
18297 auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
18298
18299 // Inserting into an out-of-bounds element is undefined.
18300 if (IndexC && VT.isFixedLengthVector() &&
18301 IndexC->getZExtValue() >= VT.getVectorNumElements())
18302 return DAG.getUNDEF(VT);
18303
18304 // Remove redundant insertions:
18305 // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
18306 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
18307 InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
18308 return InVec;
18309
18310 if (!IndexC) {
18311 // If this is a variable insert into an undef vector, it might be better to splat:
18312 // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ...
> 18313 if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) { 18314 if (VT.isScalableVector()) 18315 return DAG.getSplatVector(VT, DL, InVal); 18316 else { 18317 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal); 18318 return DAG.getBuildVector(VT, DL, Ops); 18319 } 18320 } 18321 return SDValue(); 18322 } 18323 18324 if (VT.isScalableVector()) 18325 return SDValue(); 18326 18327 unsigned NumElts = VT.getVectorNumElements(); 18328 18329 // We must know which element is being inserted for folds below here. 18330 unsigned Elt = IndexC->getZExtValue(); 18331 if (SDValue Shuf = combineInsertEltToShuffle(N, Elt)) 18332 return Shuf; 18333 18334 // Canonicalize insert_vector_elt dag nodes. 18335 // Example: 18336 // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1) 18337 // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0) 18338 // 18339 // Do this only if the child insert_vector node has one use; also 18340 // do this only if indices are both constants and Idx1 < Idx0. 18341 if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() 18342 && isa<ConstantSDNode>(InVec.getOperand(2))) { 18343 unsigned OtherElt = InVec.getConstantOperandVal(2); 18344 if (Elt < OtherElt) { 18345 // Swap nodes. 18346 SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, 18347 InVec.getOperand(0), InVal, EltNo); 18348 AddToWorklist(NewOp.getNode()); 18349 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()), 18350 VT, NewOp, InVec.getOperand(1), InVec.getOperand(2)); 18351 } 18352 } 18353 18354 // If we can't generate a legal BUILD_VECTOR, exit 18355 if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) 18356 return SDValue(); 18357 18358 // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially 18359 // be converted to a BUILD_VECTOR). Fill in the Ops vector with the 18360 // vector elements. 18361 SmallVector<SDValue, 8> Ops; 18362 // Do not combine these two vectors if the output vector will not replace 18363 // the input vector. 18364 if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) { 18365 Ops.append(InVec.getNode()->op_begin(), 18366 InVec.getNode()->op_end()); 18367 } else if (InVec.isUndef()) { 18368 Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType())); 18369 } else { 18370 return SDValue(); 18371 } 18372 assert(Ops.size() == NumElts && "Unexpected vector size"); 18373 18374 // Insert the element 18375 if (Elt < Ops.size()) { 18376 // All the operands of BUILD_VECTOR must have the same type; 18377 // we enforce that here. 18378 EVT OpVT = Ops[0].getValueType(); 18379 Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; 18380 } 18381 18382 // Return the new vector 18383 return DAG.getBuildVector(VT, DL, Ops); 18384 } 18385 18386 SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, 18387 SDValue EltNo, 18388 LoadSDNode *OriginalLoad) { 18389 assert(OriginalLoad->isSimple()); 18390 18391 EVT ResultVT = EVE->getValueType(0); 18392 EVT VecEltVT = InVecVT.getVectorElementType(); 18393 18394 // If the vector element type is not a multiple of a byte then we are unable 18395 // to correctly compute an address to load only the extracted element as a 18396 // scalar. 
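// For example (illustrative), an extract of one i1 element from a v8i1 load
// cannot be turned into a narrow scalar load, since i1 is not byte-sized.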
18397 if (!VecEltVT.isByteSized()) 18398 return SDValue(); 18399 18400 Align Alignment = OriginalLoad->getAlign(); 18401 Align NewAlign = DAG.getDataLayout().getABITypeAlign( 18402 VecEltVT.getTypeForEVT(*DAG.getContext())); 18403 18404 if (NewAlign > Alignment || 18405 !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) 18406 return SDValue(); 18407 18408 ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? 18409 ISD::NON_EXTLOAD : ISD::EXTLOAD; 18410 if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) 18411 return SDValue(); 18412 18413 Alignment = NewAlign; 18414 18415 SDValue NewPtr = OriginalLoad->getBasePtr(); 18416 SDValue Offset; 18417 EVT PtrType = NewPtr.getValueType(); 18418 MachinePointerInfo MPI; 18419 SDLoc DL(EVE); 18420 if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) { 18421 int Elt = ConstEltNo->getZExtValue(); 18422 unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; 18423 Offset = DAG.getConstant(PtrOff, DL, PtrType); 18424 MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); 18425 } else { 18426 Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType); 18427 Offset = DAG.getNode( 18428 ISD::MUL, DL, PtrType, Offset, 18429 DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType)); 18430 // Discard the pointer info except the address space because the memory 18431 // operand can't represent this new access since the offset is variable. 18432 MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace()); 18433 } 18434 NewPtr = DAG.getMemBasePlusOffset(NewPtr, Offset, DL); 18435 18436 // The replacement we need to do here is a little tricky: we need to 18437 // replace an extractelement of a load with a load. 18438 // Use ReplaceAllUsesOfValuesWith to do the replacement. 18439 // Note that this replacement assumes that the extractvalue is the only 18440 // use of the load; that's okay because we don't want to perform this 18441 // transformation in other cases anyway. 18442 SDValue Load; 18443 SDValue Chain; 18444 if (ResultVT.bitsGT(VecEltVT)) { 18445 // If the result type of vextract is wider than the load, then issue an 18446 // extending load instead. 18447 ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, 18448 VecEltVT) 18449 ? ISD::ZEXTLOAD 18450 : ISD::EXTLOAD; 18451 Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, 18452 OriginalLoad->getChain(), NewPtr, MPI, VecEltVT, 18453 Alignment, OriginalLoad->getMemOperand()->getFlags(), 18454 OriginalLoad->getAAInfo()); 18455 Chain = Load.getValue(1); 18456 } else { 18457 Load = DAG.getLoad( 18458 VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Alignment, 18459 OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); 18460 Chain = Load.getValue(1); 18461 if (ResultVT.bitsLT(VecEltVT)) 18462 Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); 18463 else 18464 Load = DAG.getBitcast(ResultVT, Load); 18465 } 18466 WorklistRemover DeadNodes(*this); 18467 SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; 18468 SDValue To[] = { Load, Chain }; 18469 DAG.ReplaceAllUsesOfValuesWith(From, To, 2); 18470 // Make sure to revisit this node to clean it up; it will usually be dead. 18471 AddToWorklist(EVE); 18472 // Since we're explicitly calling ReplaceAllUses, add the new node to the 18473 // worklist explicitly as well. 
18474 AddToWorklistWithUsers(Load.getNode()); 18475 ++OpsNarrowed; 18476 return SDValue(EVE, 0); 18477 } 18478 18479 /// Transform a vector binary operation into a scalar binary operation by moving 18480 /// the math/logic after an extract element of a vector. 18481 static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, 18482 bool LegalOperations) { 18483 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18484 SDValue Vec = ExtElt->getOperand(0); 18485 SDValue Index = ExtElt->getOperand(1); 18486 auto *IndexC = dyn_cast<ConstantSDNode>(Index); 18487 if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() || 18488 Vec.getNode()->getNumValues() != 1) 18489 return SDValue(); 18490 18491 // Targets may want to avoid this to prevent an expensive register transfer. 18492 if (!TLI.shouldScalarizeBinop(Vec)) 18493 return SDValue(); 18494 18495 // Extracting an element of a vector constant is constant-folded, so this 18496 // transform is just replacing a vector op with a scalar op while moving the 18497 // extract. 18498 SDValue Op0 = Vec.getOperand(0); 18499 SDValue Op1 = Vec.getOperand(1); 18500 if (isAnyConstantBuildVector(Op0, true) || 18501 isAnyConstantBuildVector(Op1, true)) { 18502 // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C' 18503 // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC) 18504 SDLoc DL(ExtElt); 18505 EVT VT = ExtElt->getValueType(0); 18506 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index); 18507 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index); 18508 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1); 18509 } 18510 18511 return SDValue(); 18512 } 18513 18514 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { 18515 SDValue VecOp = N->getOperand(0); 18516 SDValue Index = N->getOperand(1); 18517 EVT ScalarVT = N->getValueType(0); 18518 EVT VecVT = VecOp.getValueType(); 18519 if (VecOp.isUndef()) 18520 return DAG.getUNDEF(ScalarVT); 18521 18522 // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val 18523 // 18524 // This only really matters if the index is non-constant since other combines 18525 // on the constant elements already work. 18526 SDLoc DL(N); 18527 if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT && 18528 Index == VecOp.getOperand(2)) { 18529 SDValue Elt = VecOp.getOperand(1); 18530 return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt; 18531 } 18532 18533 // (vextract (scalar_to_vector val, 0) -> val 18534 if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) { 18535 // Only 0'th element of SCALAR_TO_VECTOR is defined. 18536 if (DAG.isKnownNeverZero(Index)) 18537 return DAG.getUNDEF(ScalarVT); 18538 18539 // Check if the result type doesn't match the inserted element type. A 18540 // SCALAR_TO_VECTOR may truncate the inserted element and the 18541 // EXTRACT_VECTOR_ELT may widen the extracted vector. 
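// For example (illustrative), with a truncating insert such as
// (i16 (extract_vector_elt (v4i16 (scalar_to_vector (i32 x))), 0)),
// the code below produces (i16 (trunc (i32 x))).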
18542 SDValue InOp = VecOp.getOperand(0); 18543 if (InOp.getValueType() != ScalarVT) { 18544 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); 18545 return DAG.getSExtOrTrunc(InOp, DL, ScalarVT); 18546 } 18547 return InOp; 18548 } 18549 18550 // extract_vector_elt of out-of-bounds element -> UNDEF 18551 auto *IndexC = dyn_cast<ConstantSDNode>(Index); 18552 if (IndexC && VecVT.isFixedLengthVector() && 18553 IndexC->getAPIntValue().uge(VecVT.getVectorNumElements())) 18554 return DAG.getUNDEF(ScalarVT); 18555 18556 // extract_vector_elt (build_vector x, y), 1 -> y 18557 if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) || 18558 VecOp.getOpcode() == ISD::SPLAT_VECTOR) && 18559 TLI.isTypeLegal(VecVT) && 18560 (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) { 18561 assert((VecOp.getOpcode() != ISD::BUILD_VECTOR || 18562 VecVT.isFixedLengthVector()) && 18563 "BUILD_VECTOR used for scalable vectors"); 18564 unsigned IndexVal = 18565 VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0; 18566 SDValue Elt = VecOp.getOperand(IndexVal); 18567 EVT InEltVT = Elt.getValueType(); 18568 18569 // Sometimes build_vector's scalar input types do not match result type. 18570 if (ScalarVT == InEltVT) 18571 return Elt; 18572 18573 // TODO: It may be useful to truncate if free if the build_vector implicitly 18574 // converts. 18575 } 18576 18577 if (VecVT.isScalableVector()) 18578 return SDValue(); 18579 18580 // All the code from this point onwards assumes fixed width vectors, but it's 18581 // possible that some of the combinations could be made to work for scalable 18582 // vectors too. 18583 unsigned NumElts = VecVT.getVectorNumElements(); 18584 unsigned VecEltBitWidth = VecVT.getScalarSizeInBits(); 18585 18586 // TODO: These transforms should not require the 'hasOneUse' restriction, but 18587 // there are regressions on multiple targets without it. We can end up with a 18588 // mess of scalar and vector code if we reduce only part of the DAG to scalar. 18589 if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() && 18590 VecOp.hasOneUse()) { 18591 // The vector index of the LSBs of the source depend on the endian-ness. 18592 bool IsLE = DAG.getDataLayout().isLittleEndian(); 18593 unsigned ExtractIndex = IndexC->getZExtValue(); 18594 // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x) 18595 unsigned BCTruncElt = IsLE ? 0 : NumElts - 1; 18596 SDValue BCSrc = VecOp.getOperand(0); 18597 if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger()) 18598 return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc); 18599 18600 if (LegalTypes && BCSrc.getValueType().isInteger() && 18601 BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) { 18602 // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt --> 18603 // trunc i64 X to i32 18604 SDValue X = BCSrc.getOperand(0); 18605 assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() && 18606 "Extract element and scalar to vector can't change element type " 18607 "from FP to integer."); 18608 unsigned XBitWidth = X.getValueSizeInBits(); 18609 BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1; 18610 18611 // An extract element return value type can be wider than its vector 18612 // operand element type. In that case, the high bits are undefined, so 18613 // it's possible that we may need to extend rather than truncate. 
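// For example (illustrative), on a little-endian target:
// (i32 (extract_vector_elt (v8i16 (bitcast (v2i64 (scalar_to_vector i64:X)))), 0))
// --> (i32 (trunc i64:X)), handled by the any-extend-or-truncate below.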
18614 if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) { 18615 assert(XBitWidth % VecEltBitWidth == 0 && 18616 "Scalar bitwidth must be a multiple of vector element bitwidth"); 18617 return DAG.getAnyExtOrTrunc(X, DL, ScalarVT); 18618 } 18619 } 18620 } 18621 18622 if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations)) 18623 return BO; 18624 18625 // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. 18626 // We only perform this optimization before the op legalization phase because 18627 // we may introduce new vector instructions which are not backed by TD 18628 // patterns. For example on AVX, extracting elements from a wide vector 18629 // without using extract_subvector. However, if we can find an underlying 18630 // scalar value, then we can always use that. 18631 if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) { 18632 auto *Shuf = cast<ShuffleVectorSDNode>(VecOp); 18633 // Find the new index to extract from. 18634 int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue()); 18635 18636 // Extracting an undef index is undef. 18637 if (OrigElt == -1) 18638 return DAG.getUNDEF(ScalarVT); 18639 18640 // Select the right vector half to extract from. 18641 SDValue SVInVec; 18642 if (OrigElt < (int)NumElts) { 18643 SVInVec = VecOp.getOperand(0); 18644 } else { 18645 SVInVec = VecOp.getOperand(1); 18646 OrigElt -= NumElts; 18647 } 18648 18649 if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { 18650 SDValue InOp = SVInVec.getOperand(OrigElt); 18651 if (InOp.getValueType() != ScalarVT) { 18652 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); 18653 InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT); 18654 } 18655 18656 return InOp; 18657 } 18658 18659 // FIXME: We should handle recursing on other vector shuffles and 18660 // scalar_to_vector here as well. 18661 18662 if (!LegalOperations || 18663 // FIXME: Should really be just isOperationLegalOrCustom. 18664 TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) || 18665 TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) { 18666 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec, 18667 DAG.getVectorIdxConstant(OrigElt, DL)); 18668 } 18669 } 18670 18671 // If only EXTRACT_VECTOR_ELT nodes use the source vector we can 18672 // simplify it based on the (valid) extraction indices. 18673 if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) { 18674 return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 18675 Use->getOperand(0) == VecOp && 18676 isa<ConstantSDNode>(Use->getOperand(1)); 18677 })) { 18678 APInt DemandedElts = APInt::getNullValue(NumElts); 18679 for (SDNode *Use : VecOp->uses()) { 18680 auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1)); 18681 if (CstElt->getAPIntValue().ult(NumElts)) 18682 DemandedElts.setBit(CstElt->getZExtValue()); 18683 } 18684 if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) { 18685 // We simplified the vector operand of this extract element. If this 18686 // extract is not dead, visit it again so it is folded properly. 18687 if (N->getOpcode() != ISD::DELETED_NODE) 18688 AddToWorklist(N); 18689 return SDValue(N, 0); 18690 } 18691 APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth); 18692 if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) { 18693 // We simplified the vector operand of this extract element. If this 18694 // extract is not dead, visit it again so it is folded properly. 
18695 if (N->getOpcode() != ISD::DELETED_NODE) 18696 AddToWorklist(N); 18697 return SDValue(N, 0); 18698 } 18699 } 18700 18701 // Everything under here is trying to match an extract of a loaded value. 18702 // If the result of load has to be truncated, then it's not necessarily 18703 // profitable. 18704 bool BCNumEltsChanged = false; 18705 EVT ExtVT = VecVT.getVectorElementType(); 18706 EVT LVT = ExtVT; 18707 if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT)) 18708 return SDValue(); 18709 18710 if (VecOp.getOpcode() == ISD::BITCAST) { 18711 // Don't duplicate a load with other uses. 18712 if (!VecOp.hasOneUse()) 18713 return SDValue(); 18714 18715 EVT BCVT = VecOp.getOperand(0).getValueType(); 18716 if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) 18717 return SDValue(); 18718 if (NumElts != BCVT.getVectorNumElements()) 18719 BCNumEltsChanged = true; 18720 VecOp = VecOp.getOperand(0); 18721 ExtVT = BCVT.getVectorElementType(); 18722 } 18723 18724 // extract (vector load $addr), i --> load $addr + i * size 18725 if (!LegalOperations && !IndexC && VecOp.hasOneUse() && 18726 ISD::isNormalLoad(VecOp.getNode()) && 18727 !Index->hasPredecessor(VecOp.getNode())) { 18728 auto *VecLoad = dyn_cast<LoadSDNode>(VecOp); 18729 if (VecLoad && VecLoad->isSimple()) 18730 return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad); 18731 } 18732 18733 // Perform only after legalization to ensure build_vector / vector_shuffle 18734 // optimizations have already been done. 18735 if (!LegalOperations || !IndexC) 18736 return SDValue(); 18737 18738 // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size) 18739 // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size) 18740 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr) 18741 int Elt = IndexC->getZExtValue(); 18742 LoadSDNode *LN0 = nullptr; 18743 if (ISD::isNormalLoad(VecOp.getNode())) { 18744 LN0 = cast<LoadSDNode>(VecOp); 18745 } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 18746 VecOp.getOperand(0).getValueType() == ExtVT && 18747 ISD::isNormalLoad(VecOp.getOperand(0).getNode())) { 18748 // Don't duplicate a load with other uses. 18749 if (!VecOp.hasOneUse()) 18750 return SDValue(); 18751 18752 LN0 = cast<LoadSDNode>(VecOp.getOperand(0)); 18753 } 18754 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) { 18755 // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1) 18756 // => 18757 // (load $addr+1*size) 18758 18759 // Don't duplicate a load with other uses. 18760 if (!VecOp.hasOneUse()) 18761 return SDValue(); 18762 18763 // If the bit convert changed the number of elements, it is unsafe 18764 // to examine the mask. 18765 if (BCNumEltsChanged) 18766 return SDValue(); 18767 18768 // Select the input vector, guarding against out of range extract vector. 18769 int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt); 18770 VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1); 18771 18772 if (VecOp.getOpcode() == ISD::BITCAST) { 18773 // Don't duplicate a load with other uses. 18774 if (!VecOp.hasOneUse()) 18775 return SDValue(); 18776 18777 VecOp = VecOp.getOperand(0); 18778 } 18779 if (ISD::isNormalLoad(VecOp.getNode())) { 18780 LN0 = cast<LoadSDNode>(VecOp); 18781 Elt = (Idx < (int)NumElts) ? 
Idx : Idx - (int)NumElts; 18782 Index = DAG.getConstant(Elt, DL, Index.getValueType()); 18783 } 18784 } else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged && 18785 VecVT.getVectorElementType() == ScalarVT && 18786 (!LegalTypes || 18787 TLI.isTypeLegal( 18788 VecOp.getOperand(0).getValueType().getVectorElementType()))) { 18789 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0 18790 // -> extract_vector_elt a, 0 18791 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1 18792 // -> extract_vector_elt a, 1 18793 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2 18794 // -> extract_vector_elt b, 0 18795 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3 18796 // -> extract_vector_elt b, 1 18797 SDLoc SL(N); 18798 EVT ConcatVT = VecOp.getOperand(0).getValueType(); 18799 unsigned ConcatNumElts = ConcatVT.getVectorNumElements(); 18800 SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL, 18801 Index.getValueType()); 18802 18803 SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts); 18804 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, 18805 ConcatVT.getVectorElementType(), 18806 ConcatOp, NewIdx); 18807 return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt); 18808 } 18809 18810 // Make sure we found a non-volatile load and the extractelement is 18811 // the only use. 18812 if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple()) 18813 return SDValue(); 18814 18815 // If Idx was -1 above, Elt is going to be -1, so just return undef. 18816 if (Elt == -1) 18817 return DAG.getUNDEF(LVT); 18818 18819 return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0); 18820 } 18821 18822 // Simplify (build_vec (ext )) to (bitcast (build_vec )) 18823 SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { 18824 // We perform this optimization post type-legalization because 18825 // the type-legalizer often scalarizes integer-promoted vectors. 18826 // Performing this optimization before may create bit-casts which 18827 // will be type-legalized to complex code sequences. 18828 // We perform this optimization only before the operation legalizer because we 18829 // may introduce illegal operations. 18830 if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes) 18831 return SDValue(); 18832 18833 unsigned NumInScalars = N->getNumOperands(); 18834 SDLoc DL(N); 18835 EVT VT = N->getValueType(0); 18836 18837 // Check to see if this is a BUILD_VECTOR of a bunch of values 18838 // which come from any_extend or zero_extend nodes. If so, we can create 18839 // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR 18840 // optimizations. We do not handle sign-extend because we can't fill the sign 18841 // using shuffles. 18842 EVT SourceType = MVT::Other; 18843 bool AllAnyExt = true; 18844 18845 for (unsigned i = 0; i != NumInScalars; ++i) { 18846 SDValue In = N->getOperand(i); 18847 // Ignore undef inputs. 18848 if (In.isUndef()) continue; 18849 18850 bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; 18851 bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; 18852 18853 // Abort if the element is not an extension. 18854 if (!ZeroExt && !AnyExt) { 18855 SourceType = MVT::Other; 18856 break; 18857 } 18858 18859 // The input is a ZeroExt or AnyExt. Check the original type. 18860 EVT InTy = In.getOperand(0).getValueType(); 18861 18862 // Check that all of the widened source types are the same. 18863 if (SourceType == MVT::Other) 18864 // First time. 
18865 SourceType = InTy;
18866 else if (InTy != SourceType) {
18867 // Multiple incoming types. Abort.
18868 SourceType = MVT::Other;
18869 break;
18870 }
18871
18872 // Check if all of the extends are ANY_EXTENDs.
18873 AllAnyExt &= AnyExt;
18874 }
18875
18876 // In order to have valid types, all of the inputs must be extended from the
18877 // same source type and all of the inputs must be any or zero extend.
18878 // Scalar sizes must be a power of two.
18879 EVT OutScalarTy = VT.getScalarType();
18880 bool ValidTypes = SourceType != MVT::Other &&
18881 isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
18882 isPowerOf2_32(SourceType.getSizeInBits());
18883
18884 // Create a new simpler BUILD_VECTOR sequence which other optimizations can
18885 // turn into a single shuffle instruction.
18886 if (!ValidTypes)
18887 return SDValue();
18888
18889 // If we already have a splat buildvector, then don't fold it if it means
18890 // introducing zeros.
18891 if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true))
18892 return SDValue();
18893
18894 bool isLE = DAG.getDataLayout().isLittleEndian();
18895 unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
18896 assert(ElemRatio > 1 && "Invalid element size ratio");
18897 SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
18898 DAG.getConstant(0, DL, SourceType);
18899
18900 unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
18901 SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
18902
18903 // Populate the new build_vector
18904 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
18905 SDValue Cast = N->getOperand(i);
18906 assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
18907 Cast.getOpcode() == ISD::ZERO_EXTEND ||
18908 Cast.isUndef()) && "Invalid cast opcode");
18909 SDValue In;
18910 if (Cast.isUndef())
18911 In = DAG.getUNDEF(SourceType);
18912 else
18913 In = Cast->getOperand(0);
18914 unsigned Index = isLE ? (i * ElemRatio) :
18915 (i * ElemRatio + (ElemRatio - 1));
18916
18917 assert(Index < Ops.size() && "Invalid index");
18918 Ops[Index] = In;
18919 }
18920
18921 // The type of the new BUILD_VECTOR node.
18922 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
18923 assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
18924 "Invalid vector size");
18925 // Check if the new vector type is legal.
18926 if (!isTypeLegal(VecVT) ||
18927 (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
18928 TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
18929 return SDValue();
18930
18931 // Make the new BUILD_VECTOR.
18932 SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
18933
18934 // The new BUILD_VECTOR node has the potential to be further optimized.
18935 AddToWorklist(BV.getNode());
18936 // Bitcast to the desired type.
18937 return DAG.getBitcast(VT, BV); 18938 } 18939 18940 // Simplify (build_vec (trunc $1) 18941 // (trunc (srl $1 half-width)) 18942 // (trunc (srl $1 (2 * half-width))) …) 18943 // to (bitcast $1) 18944 SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) { 18945 assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector"); 18946 18947 // Only for little endian 18948 if (!DAG.getDataLayout().isLittleEndian()) 18949 return SDValue(); 18950 18951 SDLoc DL(N); 18952 EVT VT = N->getValueType(0); 18953 EVT OutScalarTy = VT.getScalarType(); 18954 uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits(); 18955 18956 // Only for power of two types to be sure that bitcast works well 18957 if (!isPowerOf2_64(ScalarTypeBitsize)) 18958 return SDValue(); 18959 18960 unsigned NumInScalars = N->getNumOperands(); 18961 18962 // Look through bitcasts 18963 auto PeekThroughBitcast = [](SDValue Op) { 18964 if (Op.getOpcode() == ISD::BITCAST) 18965 return Op.getOperand(0); 18966 return Op; 18967 }; 18968 18969 // The source value where all the parts are extracted. 18970 SDValue Src; 18971 for (unsigned i = 0; i != NumInScalars; ++i) { 18972 SDValue In = PeekThroughBitcast(N->getOperand(i)); 18973 // Ignore undef inputs. 18974 if (In.isUndef()) continue; 18975 18976 if (In.getOpcode() != ISD::TRUNCATE) 18977 return SDValue(); 18978 18979 In = PeekThroughBitcast(In.getOperand(0)); 18980 18981 if (In.getOpcode() != ISD::SRL) { 18982 // For now only build_vec without shuffling, handle shifts here in the 18983 // future. 18984 if (i != 0) 18985 return SDValue(); 18986 18987 Src = In; 18988 } else { 18989 // In is SRL 18990 SDValue part = PeekThroughBitcast(In.getOperand(0)); 18991 18992 if (!Src) { 18993 Src = part; 18994 } else if (Src != part) { 18995 // Vector parts do not stem from the same variable 18996 return SDValue(); 18997 } 18998 18999 SDValue ShiftAmtVal = In.getOperand(1); 19000 if (!isa<ConstantSDNode>(ShiftAmtVal)) 19001 return SDValue(); 19002 19003 uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1); 19004 19005 // The extracted value is not extracted at the right position 19006 if (ShiftAmt != i * ScalarTypeBitsize) 19007 return SDValue(); 19008 } 19009 } 19010 19011 // Only cast if the size is the same 19012 if (Src.getValueType().getSizeInBits() != VT.getSizeInBits()) 19013 return SDValue(); 19014 19015 return DAG.getBitcast(VT, Src); 19016 } 19017 19018 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, 19019 ArrayRef<int> VectorMask, 19020 SDValue VecIn1, SDValue VecIn2, 19021 unsigned LeftIdx, bool DidSplitVec) { 19022 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); 19023 19024 EVT VT = N->getValueType(0); 19025 EVT InVT1 = VecIn1.getValueType(); 19026 EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1; 19027 19028 unsigned NumElems = VT.getVectorNumElements(); 19029 unsigned ShuffleNumElems = NumElems; 19030 19031 // If we artificially split a vector in two already, then the offsets in the 19032 // operands will all be based off of VecIn1, even those in VecIn2. 19033 unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements(); 19034 19035 uint64_t VTSize = VT.getFixedSizeInBits(); 19036 uint64_t InVT1Size = InVT1.getFixedSizeInBits(); 19037 uint64_t InVT2Size = InVT2.getFixedSizeInBits(); 19038 19039 // We can't generate a shuffle node with mismatched input and output types. 19040 // Try to make the types match the type of the output. 
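// For example (illustrative): two v2i32 inputs for a v4i32 result are
// concatenated below (a missing second input is padded with undef), while a
// single v8i32 input for a v4i32 result is split into two v4i32 halves.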
19041 if (InVT1 != VT || InVT2 != VT) { 19042 if ((VTSize % InVT1Size == 0) && InVT1 == InVT2) { 19043 // If the output vector length is a multiple of both input lengths, 19044 // we can concatenate them and pad the rest with undefs. 19045 unsigned NumConcats = VTSize / InVT1Size; 19046 assert(NumConcats >= 2 && "Concat needs at least two inputs!"); 19047 SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1)); 19048 ConcatOps[0] = VecIn1; 19049 ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1); 19050 VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); 19051 VecIn2 = SDValue(); 19052 } else if (InVT1Size == VTSize * 2) { 19053 if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems)) 19054 return SDValue(); 19055 19056 if (!VecIn2.getNode()) { 19057 // If we only have one input vector, and it's twice the size of the 19058 // output, split it in two. 19059 VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, 19060 DAG.getVectorIdxConstant(NumElems, DL)); 19061 VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx); 19062 // Since we now have shorter input vectors, adjust the offset of the 19063 // second vector's start. 19064 Vec2Offset = NumElems; 19065 } else if (InVT2Size <= InVT1Size) { 19066 // VecIn1 is wider than the output, and we have another, possibly 19067 // smaller input. Pad the smaller input with undefs, shuffle at the 19068 // input vector width, and extract the output. 19069 // The shuffle type is different than VT, so check legality again. 19070 if (LegalOperations && 19071 !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1)) 19072 return SDValue(); 19073 19074 // Legalizing INSERT_SUBVECTOR is tricky - you basically have to 19075 // lower it back into a BUILD_VECTOR. So if the inserted type is 19076 // illegal, don't even try. 19077 if (InVT1 != InVT2) { 19078 if (!TLI.isTypeLegal(InVT2)) 19079 return SDValue(); 19080 VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1, 19081 DAG.getUNDEF(InVT1), VecIn2, ZeroIdx); 19082 } 19083 ShuffleNumElems = NumElems * 2; 19084 } else { 19085 // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider 19086 // than VecIn1. We can't handle this for now - this case will disappear 19087 // when we start sorting the vectors by type. 19088 return SDValue(); 19089 } 19090 } else if (InVT2Size * 2 == VTSize && InVT1Size == VTSize) { 19091 SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2)); 19092 ConcatOps[0] = VecIn2; 19093 VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); 19094 } else { 19095 // TODO: Support cases where the length mismatch isn't exactly by a 19096 // factor of 2. 19097 // TODO: Move this check upwards, so that if we have bad type 19098 // mismatches, we don't create any DAG nodes. 19099 return SDValue(); 19100 } 19101 } 19102 19103 // Initialize mask to undef. 19104 SmallVector<int, 8> Mask(ShuffleNumElems, -1); 19105 19106 // Only need to run up to the number of elements actually used, not the 19107 // total number of elements in the shuffle - if we are shuffling a wider 19108 // vector, the high lanes should be set to undef. 19109 for (unsigned i = 0; i != NumElems; ++i) { 19110 if (VectorMask[i] <= 0) 19111 continue; 19112 19113 unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1); 19114 if (VectorMask[i] == (int)LeftIdx) { 19115 Mask[i] = ExtIndex; 19116 } else if (VectorMask[i] == (int)LeftIdx + 1) { 19117 Mask[i] = Vec2Offset + ExtIndex; 19118 } 19119 } 19120 19121 // The type the input vectors may have changed above. 
19122 InVT1 = VecIn1.getValueType(); 19123 19124 // If we already have a VecIn2, it should have the same type as VecIn1. 19125 // If we don't, get an undef/zero vector of the appropriate type. 19126 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1); 19127 assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type."); 19128 19129 SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask); 19130 if (ShuffleNumElems > NumElems) 19131 Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx); 19132 19133 return Shuffle; 19134 } 19135 19136 static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) { 19137 assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector"); 19138 19139 // First, determine where the build vector is not undef. 19140 // TODO: We could extend this to handle zero elements as well as undefs. 19141 int NumBVOps = BV->getNumOperands(); 19142 int ZextElt = -1; 19143 for (int i = 0; i != NumBVOps; ++i) { 19144 SDValue Op = BV->getOperand(i); 19145 if (Op.isUndef()) 19146 continue; 19147 if (ZextElt == -1) 19148 ZextElt = i; 19149 else 19150 return SDValue(); 19151 } 19152 // Bail out if there's no non-undef element. 19153 if (ZextElt == -1) 19154 return SDValue(); 19155 19156 // The build vector contains some number of undef elements and exactly 19157 // one other element. That other element must be a zero-extended scalar 19158 // extracted from a vector at a constant index to turn this into a shuffle. 19159 // Also, require that the build vector does not implicitly truncate/extend 19160 // its elements. 19161 // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND. 19162 EVT VT = BV->getValueType(0); 19163 SDValue Zext = BV->getOperand(ZextElt); 19164 if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() || 19165 Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 19166 !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) || 19167 Zext.getValueSizeInBits() != VT.getScalarSizeInBits()) 19168 return SDValue(); 19169 19170 // The zero-extend must be a multiple of the source size, and we must be 19171 // building a vector of the same size as the source of the extract element. 19172 SDValue Extract = Zext.getOperand(0); 19173 unsigned DestSize = Zext.getValueSizeInBits(); 19174 unsigned SrcSize = Extract.getValueSizeInBits(); 19175 if (DestSize % SrcSize != 0 || 19176 Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits()) 19177 return SDValue(); 19178 19179 // Create a shuffle mask that will combine the extracted element with zeros 19180 // and undefs. 19181 int ZextRatio = DestSize / SrcSize; 19182 int NumMaskElts = NumBVOps * ZextRatio; 19183 SmallVector<int, 32> ShufMask(NumMaskElts, -1); 19184 for (int i = 0; i != NumMaskElts; ++i) { 19185 if (i / ZextRatio == ZextElt) { 19186 // The low bits of the (potentially translated) extracted element map to 19187 // the source vector. The high bits map to zero. We will use a zero vector 19188 // as the 2nd source operand of the shuffle, so use the 1st element of 19189 // that vector (mask value is number-of-elements) for the high bits. 19190 if (i % ZextRatio == 0) 19191 ShufMask[i] = Extract.getConstantOperandVal(1); 19192 else 19193 ShufMask[i] = NumMaskElts; 19194 } 19195 19196 // Undef elements of the build vector remain undef because we initialize 19197 // the shuffle mask with -1. 19198 } 19199 19200 // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... 
--> 19201 // bitcast (shuffle V, ZeroVec, VectorMask) 19202 SDLoc DL(BV); 19203 EVT VecVT = Extract.getOperand(0).getValueType(); 19204 SDValue ZeroVec = DAG.getConstant(0, DL, VecVT); 19205 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19206 SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0), 19207 ZeroVec, ShufMask, DAG); 19208 if (!Shuf) 19209 return SDValue(); 19210 return DAG.getBitcast(VT, Shuf); 19211 } 19212 19213 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT 19214 // operations. If the types of the vectors we're extracting from allow it, 19215 // turn this into a vector_shuffle node. 19216 SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { 19217 SDLoc DL(N); 19218 EVT VT = N->getValueType(0); 19219 19220 // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. 19221 if (!isTypeLegal(VT)) 19222 return SDValue(); 19223 19224 if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG)) 19225 return V; 19226 19227 // May only combine to shuffle after legalize if shuffle is legal. 19228 if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT)) 19229 return SDValue(); 19230 19231 bool UsesZeroVector = false; 19232 unsigned NumElems = N->getNumOperands(); 19233 19234 // Record, for each element of the newly built vector, which input vector 19235 // that element comes from. -1 stands for undef, 0 for the zero vector, 19236 // and positive values for the input vectors. 19237 // VectorMask maps each element to its vector number, and VecIn maps vector 19238 // numbers to their initial SDValues. 19239 19240 SmallVector<int, 8> VectorMask(NumElems, -1); 19241 SmallVector<SDValue, 8> VecIn; 19242 VecIn.push_back(SDValue()); 19243 19244 for (unsigned i = 0; i != NumElems; ++i) { 19245 SDValue Op = N->getOperand(i); 19246 19247 if (Op.isUndef()) 19248 continue; 19249 19250 // See if we can use a blend with a zero vector. 19251 // TODO: Should we generalize this to a blend with an arbitrary constant 19252 // vector? 19253 if (isNullConstant(Op) || isNullFPConstant(Op)) { 19254 UsesZeroVector = true; 19255 VectorMask[i] = 0; 19256 continue; 19257 } 19258 19259 // Not an undef or zero. If the input is something other than an 19260 // EXTRACT_VECTOR_ELT with an in-range constant index, bail out. 19261 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 19262 !isa<ConstantSDNode>(Op.getOperand(1))) 19263 return SDValue(); 19264 SDValue ExtractedFromVec = Op.getOperand(0); 19265 19266 if (ExtractedFromVec.getValueType().isScalableVector()) 19267 return SDValue(); 19268 19269 const APInt &ExtractIdx = Op.getConstantOperandAPInt(1); 19270 if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements())) 19271 return SDValue(); 19272 19273 // All inputs must have the same element type as the output. 19274 if (VT.getVectorElementType() != 19275 ExtractedFromVec.getValueType().getVectorElementType()) 19276 return SDValue(); 19277 19278 // Have we seen this input vector before? 19279 // The vectors are expected to be tiny (usually 1 or 2 elements), so using 19280 // a map back from SDValues to numbers isn't worth it. 19281 unsigned Idx = std::distance(VecIn.begin(), find(VecIn, ExtractedFromVec)); 19282 if (Idx == VecIn.size()) 19283 VecIn.push_back(ExtractedFromVec); 19284 19285 VectorMask[i] = Idx; 19286 } 19287 19288 // If we didn't find at least one input vector, bail out. 
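// (VecIn[0] is a null placeholder that stands for the zero vector, so a size
// of less than 2 means no real input vector was recorded.)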
19289 if (VecIn.size() < 2) 19290 return SDValue(); 19291 19292 // If all the Operands of BUILD_VECTOR extract from same 19293 // vector, then split the vector efficiently based on the maximum 19294 // vector access index and adjust the VectorMask and 19295 // VecIn accordingly. 19296 bool DidSplitVec = false; 19297 if (VecIn.size() == 2) { 19298 unsigned MaxIndex = 0; 19299 unsigned NearestPow2 = 0; 19300 SDValue Vec = VecIn.back(); 19301 EVT InVT = Vec.getValueType(); 19302 SmallVector<unsigned, 8> IndexVec(NumElems, 0); 19303 19304 for (unsigned i = 0; i < NumElems; i++) { 19305 if (VectorMask[i] <= 0) 19306 continue; 19307 unsigned Index = N->getOperand(i).getConstantOperandVal(1); 19308 IndexVec[i] = Index; 19309 MaxIndex = std::max(MaxIndex, Index); 19310 } 19311 19312 NearestPow2 = PowerOf2Ceil(MaxIndex); 19313 if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 && 19314 NumElems * 2 < NearestPow2) { 19315 unsigned SplitSize = NearestPow2 / 2; 19316 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), 19317 InVT.getVectorElementType(), SplitSize); 19318 if (TLI.isTypeLegal(SplitVT)) { 19319 SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, 19320 DAG.getVectorIdxConstant(SplitSize, DL)); 19321 SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, 19322 DAG.getVectorIdxConstant(0, DL)); 19323 VecIn.pop_back(); 19324 VecIn.push_back(VecIn1); 19325 VecIn.push_back(VecIn2); 19326 DidSplitVec = true; 19327 19328 for (unsigned i = 0; i < NumElems; i++) { 19329 if (VectorMask[i] <= 0) 19330 continue; 19331 VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2; 19332 } 19333 } 19334 } 19335 } 19336 19337 // TODO: We want to sort the vectors by descending length, so that adjacent 19338 // pairs have similar length, and the longer vector is always first in the 19339 // pair. 19340 19341 // TODO: Should this fire if some of the input vectors has illegal type (like 19342 // it does now), or should we let legalization run its course first? 19343 19344 // Shuffle phase: 19345 // Take pairs of vectors, and shuffle them so that the result has elements 19346 // from these vectors in the correct places. 19347 // For example, given: 19348 // t10: i32 = extract_vector_elt t1, Constant:i64<0> 19349 // t11: i32 = extract_vector_elt t2, Constant:i64<0> 19350 // t12: i32 = extract_vector_elt t3, Constant:i64<0> 19351 // t13: i32 = extract_vector_elt t1, Constant:i64<1> 19352 // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13 19353 // We will generate: 19354 // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2 19355 // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef 19356 SmallVector<SDValue, 4> Shuffles; 19357 for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) { 19358 unsigned LeftIdx = 2 * In + 1; 19359 SDValue VecLeft = VecIn[LeftIdx]; 19360 SDValue VecRight = 19361 (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue(); 19362 19363 if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft, 19364 VecRight, LeftIdx, DidSplitVec)) 19365 Shuffles.push_back(Shuffle); 19366 else 19367 return SDValue(); 19368 } 19369 19370 // If we need the zero vector as an "ingredient" in the blend tree, add it 19371 // to the list of shuffles. 19372 if (UsesZeroVector) 19373 Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT) 19374 : DAG.getConstantFP(0.0, DL, VT)); 19375 19376 // If we only have one shuffle, we're done. 
19377 if (Shuffles.size() == 1) 19378 return Shuffles[0]; 19379 19380 // Update the vector mask to point to the post-shuffle vectors. 19381 for (int &Vec : VectorMask) 19382 if (Vec == 0) 19383 Vec = Shuffles.size() - 1; 19384 else 19385 Vec = (Vec - 1) / 2; 19386 19387 // More than one shuffle. Generate a binary tree of blends, e.g. if from 19388 // the previous step we got the set of shuffles t10, t11, t12, t13, we will 19389 // generate: 19390 // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2 19391 // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4 19392 // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6 19393 // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8 19394 // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11 19395 // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13 19396 // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21 19397 19398 // Make sure the initial size of the shuffle list is even. 19399 if (Shuffles.size() % 2) 19400 Shuffles.push_back(DAG.getUNDEF(VT)); 19401 19402 for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) { 19403 if (CurSize % 2) { 19404 Shuffles[CurSize] = DAG.getUNDEF(VT); 19405 CurSize++; 19406 } 19407 for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) { 19408 int Left = 2 * In; 19409 int Right = 2 * In + 1; 19410 SmallVector<int, 8> Mask(NumElems, -1); 19411 for (unsigned i = 0; i != NumElems; ++i) { 19412 if (VectorMask[i] == Left) { 19413 Mask[i] = i; 19414 VectorMask[i] = In; 19415 } else if (VectorMask[i] == Right) { 19416 Mask[i] = i + NumElems; 19417 VectorMask[i] = In; 19418 } 19419 } 19420 19421 Shuffles[In] = 19422 DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask); 19423 } 19424 } 19425 return Shuffles[0]; 19426 } 19427 19428 // Try to turn a build vector of zero extends of extract vector elts into a 19429 // a vector zero extend and possibly an extract subvector. 19430 // TODO: Support sign extend? 19431 // TODO: Allow undef elements? 19432 SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) { 19433 if (LegalOperations) 19434 return SDValue(); 19435 19436 EVT VT = N->getValueType(0); 19437 19438 bool FoundZeroExtend = false; 19439 SDValue Op0 = N->getOperand(0); 19440 auto checkElem = [&](SDValue Op) -> int64_t { 19441 unsigned Opc = Op.getOpcode(); 19442 FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND); 19443 if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) && 19444 Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 19445 Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0)) 19446 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1))) 19447 return C->getZExtValue(); 19448 return -1; 19449 }; 19450 19451 // Make sure the first element matches 19452 // (zext (extract_vector_elt X, C)) 19453 int64_t Offset = checkElem(Op0); 19454 if (Offset < 0) 19455 return SDValue(); 19456 19457 unsigned NumElems = N->getNumOperands(); 19458 SDValue In = Op0.getOperand(0).getOperand(0); 19459 EVT InSVT = In.getValueType().getScalarType(); 19460 EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems); 19461 19462 // Don't create an illegal input type after type legalization. 19463 if (LegalTypes && !TLI.isTypeLegal(InVT)) 19464 return SDValue(); 19465 19466 // Ensure all the elements come from the same vector and are adjacent. 
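// For example (illustrative), the check accepts
//   (build_vector (zext (extractelt X, 4)), (zext (extractelt X, 5)),
//                 (zext (extractelt X, 6)), (zext (extractelt X, 7)))
// where Offset == 4 and element i extracts from index Offset + i.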
19467 for (unsigned i = 1; i != NumElems; ++i) { 19468 if ((Offset + i) != checkElem(N->getOperand(i))) 19469 return SDValue(); 19470 } 19471 19472 SDLoc DL(N); 19473 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In, 19474 Op0.getOperand(0).getOperand(1)); 19475 return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL, 19476 VT, In); 19477 } 19478 19479 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { 19480 EVT VT = N->getValueType(0); 19481 19482 // A vector built entirely of undefs is undef. 19483 if (ISD::allOperandsUndef(N)) 19484 return DAG.getUNDEF(VT); 19485 19486 // If this is a splat of a bitcast from another vector, change to a 19487 // concat_vector. 19488 // For example: 19489 // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) -> 19490 // (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X)))) 19491 // 19492 // If X is a build_vector itself, the concat can become a larger build_vector. 19493 // TODO: Maybe this is useful for non-splat too? 19494 if (!LegalOperations) { 19495 if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) { 19496 Splat = peekThroughBitcasts(Splat); 19497 EVT SrcVT = Splat.getValueType(); 19498 if (SrcVT.isVector()) { 19499 unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements(); 19500 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), 19501 SrcVT.getVectorElementType(), NumElts); 19502 if (!LegalTypes || TLI.isTypeLegal(NewVT)) { 19503 SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat); 19504 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), 19505 NewVT, Ops); 19506 return DAG.getBitcast(VT, Concat); 19507 } 19508 } 19509 } 19510 } 19511 19512 // A splat of a single element is a SPLAT_VECTOR if supported on the target. 19513 if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand) 19514 if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) { 19515 assert(!V.isUndef() && "Splat of undef should have been handled earlier"); 19516 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V); 19517 } 19518 19519 // Check if we can express BUILD VECTOR via subvector extract. 19520 if (!LegalTypes && (N->getNumOperands() > 1)) { 19521 SDValue Op0 = N->getOperand(0); 19522 auto checkElem = [&](SDValue Op) -> uint64_t { 19523 if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && 19524 (Op0.getOperand(0) == Op.getOperand(0))) 19525 if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 19526 return CNode->getZExtValue(); 19527 return -1; 19528 }; 19529 19530 int Offset = checkElem(Op0); 19531 for (unsigned i = 0; i < N->getNumOperands(); ++i) { 19532 if (Offset + i != checkElem(N->getOperand(i))) { 19533 Offset = -1; 19534 break; 19535 } 19536 } 19537 19538 if ((Offset == 0) && 19539 (Op0.getOperand(0).getValueType() == N->getValueType(0))) 19540 return Op0.getOperand(0); 19541 if ((Offset != -1) && 19542 ((Offset % N->getValueType(0).getVectorNumElements()) == 19543 0)) // IDX must be multiple of output size. 
19544 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), 19545 Op0.getOperand(0), Op0.getOperand(1)); 19546 } 19547 19548 if (SDValue V = convertBuildVecZextToZext(N)) 19549 return V; 19550 19551 if (SDValue V = reduceBuildVecExtToExtBuildVec(N)) 19552 return V; 19553 19554 if (SDValue V = reduceBuildVecTruncToBitCast(N)) 19555 return V; 19556 19557 if (SDValue V = reduceBuildVecToShuffle(N)) 19558 return V; 19559 19560 return SDValue(); 19561 } 19562 19563 static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { 19564 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19565 EVT OpVT = N->getOperand(0).getValueType(); 19566 19567 // If the operands are legal vectors, leave them alone. 19568 if (TLI.isTypeLegal(OpVT)) 19569 return SDValue(); 19570 19571 SDLoc DL(N); 19572 EVT VT = N->getValueType(0); 19573 SmallVector<SDValue, 8> Ops; 19574 19575 EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits()); 19576 SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); 19577 19578 // Keep track of what we encounter. 19579 bool AnyInteger = false; 19580 bool AnyFP = false; 19581 for (const SDValue &Op : N->ops()) { 19582 if (ISD::BITCAST == Op.getOpcode() && 19583 !Op.getOperand(0).getValueType().isVector()) 19584 Ops.push_back(Op.getOperand(0)); 19585 else if (ISD::UNDEF == Op.getOpcode()) 19586 Ops.push_back(ScalarUndef); 19587 else 19588 return SDValue(); 19589 19590 // Note whether we encounter an integer or floating point scalar. 19591 // If it's neither, bail out, it could be something weird like x86mmx. 19592 EVT LastOpVT = Ops.back().getValueType(); 19593 if (LastOpVT.isFloatingPoint()) 19594 AnyFP = true; 19595 else if (LastOpVT.isInteger()) 19596 AnyInteger = true; 19597 else 19598 return SDValue(); 19599 } 19600 19601 // If any of the operands is a floating point scalar bitcast to a vector, 19602 // use floating point types throughout, and bitcast everything. 19603 // Replace UNDEFs by another scalar UNDEF node, of the final desired type. 19604 if (AnyFP) { 19605 SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits()); 19606 ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); 19607 if (AnyInteger) { 19608 for (SDValue &Op : Ops) { 19609 if (Op.getValueType() == SVT) 19610 continue; 19611 if (Op.isUndef()) 19612 Op = ScalarUndef; 19613 else 19614 Op = DAG.getBitcast(SVT, Op); 19615 } 19616 } 19617 } 19618 19619 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT, 19620 VT.getSizeInBits() / SVT.getSizeInBits()); 19621 return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops)); 19622 } 19623 19624 // Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR 19625 // operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at 19626 // most two distinct vectors the same size as the result, attempt to turn this 19627 // into a legal shuffle. 19628 static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { 19629 EVT VT = N->getValueType(0); 19630 EVT OpVT = N->getOperand(0).getValueType(); 19631 19632 // We currently can't generate an appropriate shuffle for a scalable vector. 19633 if (VT.isScalableVector()) 19634 return SDValue(); 19635 19636 int NumElts = VT.getVectorNumElements(); 19637 int NumOpElts = OpVT.getVectorNumElements(); 19638 19639 SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT); 19640 SmallVector<int, 8> Mask; 19641 19642 for (SDValue Op : N->ops()) { 19643 Op = peekThroughBitcasts(Op); 19644 19645 // UNDEF nodes convert to UNDEF shuffle mask values. 
19646 if (Op.isUndef()) { 19647 Mask.append((unsigned)NumOpElts, -1); 19648 continue; 19649 } 19650 19651 if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) 19652 return SDValue(); 19653 19654 // What vector are we extracting the subvector from and at what index? 19655 SDValue ExtVec = Op.getOperand(0); 19656 int ExtIdx = Op.getConstantOperandVal(1); 19657 19658 // We want the EVT of the original extraction to correctly scale the 19659 // extraction index. 19660 EVT ExtVT = ExtVec.getValueType(); 19661 ExtVec = peekThroughBitcasts(ExtVec); 19662 19663 // UNDEF nodes convert to UNDEF shuffle mask values. 19664 if (ExtVec.isUndef()) { 19665 Mask.append((unsigned)NumOpElts, -1); 19666 continue; 19667 } 19668 19669 // Ensure that we are extracting a subvector from a vector the same 19670 // size as the result. 19671 if (ExtVT.getSizeInBits() != VT.getSizeInBits()) 19672 return SDValue(); 19673 19674 // Scale the subvector index to account for any bitcast. 19675 int NumExtElts = ExtVT.getVectorNumElements(); 19676 if (0 == (NumExtElts % NumElts)) 19677 ExtIdx /= (NumExtElts / NumElts); 19678 else if (0 == (NumElts % NumExtElts)) 19679 ExtIdx *= (NumElts / NumExtElts); 19680 else 19681 return SDValue(); 19682 19683 // At most we can reference 2 inputs in the final shuffle. 19684 if (SV0.isUndef() || SV0 == ExtVec) { 19685 SV0 = ExtVec; 19686 for (int i = 0; i != NumOpElts; ++i) 19687 Mask.push_back(i + ExtIdx); 19688 } else if (SV1.isUndef() || SV1 == ExtVec) { 19689 SV1 = ExtVec; 19690 for (int i = 0; i != NumOpElts; ++i) 19691 Mask.push_back(i + ExtIdx + NumElts); 19692 } else { 19693 return SDValue(); 19694 } 19695 } 19696 19697 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19698 return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), 19699 DAG.getBitcast(VT, SV1), Mask, DAG); 19700 } 19701 19702 static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) { 19703 unsigned CastOpcode = N->getOperand(0).getOpcode(); 19704 switch (CastOpcode) { 19705 case ISD::SINT_TO_FP: 19706 case ISD::UINT_TO_FP: 19707 case ISD::FP_TO_SINT: 19708 case ISD::FP_TO_UINT: 19709 // TODO: Allow more opcodes? 19710 // case ISD::BITCAST: 19711 // case ISD::TRUNCATE: 19712 // case ISD::ZERO_EXTEND: 19713 // case ISD::SIGN_EXTEND: 19714 // case ISD::FP_EXTEND: 19715 break; 19716 default: 19717 return SDValue(); 19718 } 19719 19720 EVT SrcVT = N->getOperand(0).getOperand(0).getValueType(); 19721 if (!SrcVT.isVector()) 19722 return SDValue(); 19723 19724 // All operands of the concat must be the same kind of cast from the same 19725 // source type. 19726 SmallVector<SDValue, 4> SrcOps; 19727 for (SDValue Op : N->ops()) { 19728 if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() || 19729 Op.getOperand(0).getValueType() != SrcVT) 19730 return SDValue(); 19731 SrcOps.push_back(Op.getOperand(0)); 19732 } 19733 19734 // The wider cast must be supported by the target. This is unusual because 19735 // the operation support type parameter depends on the opcode. In addition, 19736 // check the other type in the cast to make sure this is really legal. 
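// Note that the legality query is keyed on the integer side of the cast in
// both directions: the (wide) integer source type for int-to-fp, and the
// (wide) integer result type for fp-to-int.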
19737 EVT VT = N->getValueType(0); 19738 EVT SrcEltVT = SrcVT.getVectorElementType(); 19739 ElementCount NumElts = SrcVT.getVectorElementCount() * N->getNumOperands(); 19740 EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts); 19741 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19742 switch (CastOpcode) { 19743 case ISD::SINT_TO_FP: 19744 case ISD::UINT_TO_FP: 19745 if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) || 19746 !TLI.isTypeLegal(VT)) 19747 return SDValue(); 19748 break; 19749 case ISD::FP_TO_SINT: 19750 case ISD::FP_TO_UINT: 19751 if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) || 19752 !TLI.isTypeLegal(ConcatSrcVT)) 19753 return SDValue(); 19754 break; 19755 default: 19756 llvm_unreachable("Unexpected cast opcode"); 19757 } 19758 19759 // concat (cast X), (cast Y)... -> cast (concat X, Y...) 19760 SDLoc DL(N); 19761 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps); 19762 return DAG.getNode(CastOpcode, DL, VT, NewConcat); 19763 } 19764 19765 SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { 19766 // If we only have one input vector, we don't need to do any concatenation. 19767 if (N->getNumOperands() == 1) 19768 return N->getOperand(0); 19769 19770 // Check if all of the operands are undefs. 19771 EVT VT = N->getValueType(0); 19772 if (ISD::allOperandsUndef(N)) 19773 return DAG.getUNDEF(VT); 19774 19775 // Optimize concat_vectors where all but the first of the vectors are undef. 19776 if (all_of(drop_begin(N->ops()), 19777 [](const SDValue &Op) { return Op.isUndef(); })) { 19778 SDValue In = N->getOperand(0); 19779 assert(In.getValueType().isVector() && "Must concat vectors"); 19780 19781 // If the input is a concat_vectors, just make a larger concat by padding 19782 // with smaller undefs. 19783 if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) { 19784 unsigned NumOps = N->getNumOperands() * In.getNumOperands(); 19785 SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end()); 19786 Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType())); 19787 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); 19788 } 19789 19790 SDValue Scalar = peekThroughOneUseBitcasts(In); 19791 19792 // concat_vectors(scalar_to_vector(scalar), undef) -> 19793 // scalar_to_vector(scalar) 19794 if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR && 19795 Scalar.hasOneUse()) { 19796 EVT SVT = Scalar.getValueType().getVectorElementType(); 19797 if (SVT == Scalar.getOperand(0).getValueType()) 19798 Scalar = Scalar.getOperand(0); 19799 } 19800 19801 // concat_vectors(scalar, undef) -> scalar_to_vector(scalar) 19802 if (!Scalar.getValueType().isVector()) { 19803 // If the bitcast type isn't legal, it might be a trunc of a legal type; 19804 // look through the trunc so we can still do the transform: 19805 // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar) 19806 if (Scalar->getOpcode() == ISD::TRUNCATE && 19807 !TLI.isTypeLegal(Scalar.getValueType()) && 19808 TLI.isTypeLegal(Scalar->getOperand(0).getValueType())) 19809 Scalar = Scalar->getOperand(0); 19810 19811 EVT SclTy = Scalar.getValueType(); 19812 19813 if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) 19814 return SDValue(); 19815 19816 // Bail out if the vector size is not a multiple of the scalar size. 
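// (For example, an i64 scalar cannot be placed into a 96-bit v3i32 result.)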
19817 if (VT.getSizeInBits() % SclTy.getSizeInBits())
19818 return SDValue();
19819
19820 unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
19821 if (VNTNumElms < 2)
19822 return SDValue();
19823
19824 EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
19825 if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
19826 return SDValue();
19827
19828 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
19829 return DAG.getBitcast(VT, Res);
19830 }
19831 }
19832
19833 // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
19834 // We have already tested above for an UNDEF-only concatenation.
19835 // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
19836 // -> (BUILD_VECTOR A, B, ..., C, D, ...)
19837 auto IsBuildVectorOrUndef = [](const SDValue &Op) {
19838 return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
19839 };
19840 if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
19841 SmallVector<SDValue, 8> Opnds;
19842 EVT SVT = VT.getScalarType();
19843
19844 EVT MinVT = SVT;
19845 if (!SVT.isFloatingPoint()) {
19846 // If the BUILD_VECTORs are built from integer operands, they may have
19847 // different operand types. Get the smallest type and truncate all operands to it.
19848 bool FoundMinVT = false;
19849 for (const SDValue &Op : N->ops())
19850 if (ISD::BUILD_VECTOR == Op.getOpcode()) {
19851 EVT OpSVT = Op.getOperand(0).getValueType();
19852 MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
19853 FoundMinVT = true;
19854 }
19855 assert(FoundMinVT && "Concat vector type mismatch");
19856 }
19857
19858 for (const SDValue &Op : N->ops()) {
19859 EVT OpVT = Op.getValueType();
19860 unsigned NumElts = OpVT.getVectorNumElements();
19861
19862 if (ISD::UNDEF == Op.getOpcode())
19863 Opnds.append(NumElts, DAG.getUNDEF(MinVT));
19864
19865 if (ISD::BUILD_VECTOR == Op.getOpcode()) {
19866 if (SVT.isFloatingPoint()) {
19867 assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
19868 Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
19869 } else {
19870 for (unsigned i = 0; i != NumElts; ++i)
19871 Opnds.push_back(
19872 DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
19873 }
19874 }
19875 }
19876
19877 assert(VT.getVectorNumElements() == Opnds.size() &&
19878 "Concat vector type mismatch");
19879 return DAG.getBuildVector(VT, SDLoc(N), Opnds);
19880 }
19881
19882 // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
19883 if (SDValue V = combineConcatVectorOfScalars(N, DAG))
19884 return V;
19885
19886 // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
19887 if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
19888 if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
19889 return V;
19890
19891 if (SDValue V = combineConcatVectorOfCasts(N, DAG))
19892 return V;
19893
19894 // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
19895 // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR
19896 // operands and look for CONCAT operations that place the incoming vectors
19897 // at the exact same location.
19898 //
19899 // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled.
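// For example (illustrative), with X of type v8i32:
//   (v8i32 concat_vectors (v4i32 extract_subvector X, 0),
//                         (v4i32 extract_subvector X, 4))
// is a nop and can be replaced by X.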
19900 SDValue SingleSource = SDValue(); 19901 unsigned PartNumElem = 19902 N->getOperand(0).getValueType().getVectorMinNumElements(); 19903 19904 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 19905 SDValue Op = N->getOperand(i); 19906 19907 if (Op.isUndef()) 19908 continue; 19909 19910 // Check if this is the identity extract: 19911 if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) 19912 return SDValue(); 19913 19914 // Find the single incoming vector for the extract_subvector. 19915 if (SingleSource.getNode()) { 19916 if (Op.getOperand(0) != SingleSource) 19917 return SDValue(); 19918 } else { 19919 SingleSource = Op.getOperand(0); 19920 19921 // Check the source type is the same as the type of the result. 19922 // If not, this concat may extend the vector, so we can not 19923 // optimize it away. 19924 if (SingleSource.getValueType() != N->getValueType(0)) 19925 return SDValue(); 19926 } 19927 19928 // Check that we are reading from the identity index. 19929 unsigned IdentityIndex = i * PartNumElem; 19930 if (Op.getConstantOperandAPInt(1) != IdentityIndex) 19931 return SDValue(); 19932 } 19933 19934 if (SingleSource.getNode()) 19935 return SingleSource; 19936 19937 return SDValue(); 19938 } 19939 19940 // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find 19941 // if the subvector can be sourced for free. 19942 static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) { 19943 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && 19944 V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) { 19945 return V.getOperand(1); 19946 } 19947 auto *IndexC = dyn_cast<ConstantSDNode>(Index); 19948 if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS && 19949 V.getOperand(0).getValueType() == SubVT && 19950 (IndexC->getZExtValue() % SubVT.getVectorMinNumElements()) == 0) { 19951 uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorMinNumElements(); 19952 return V.getOperand(SubIdx); 19953 } 19954 return SDValue(); 19955 } 19956 19957 static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, 19958 SelectionDAG &DAG, 19959 bool LegalOperations) { 19960 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19961 SDValue BinOp = Extract->getOperand(0); 19962 unsigned BinOpcode = BinOp.getOpcode(); 19963 if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1) 19964 return SDValue(); 19965 19966 EVT VecVT = BinOp.getValueType(); 19967 SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1); 19968 if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType()) 19969 return SDValue(); 19970 19971 SDValue Index = Extract->getOperand(1); 19972 EVT SubVT = Extract->getValueType(0); 19973 if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT, LegalOperations)) 19974 return SDValue(); 19975 19976 SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT); 19977 SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT); 19978 19979 // TODO: We could handle the case where only 1 operand is being inserted by 19980 // creating an extract of the other operand, but that requires checking 19981 // number of uses and/or costs. 19982 if (!Sub0 || !Sub1) 19983 return SDValue(); 19984 19985 // We are inserting both operands of the wide binop only to extract back 19986 // to the narrow vector size. 
Eliminate all of the insert/extract: 19987 // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y 19988 return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1, 19989 BinOp->getFlags()); 19990 } 19991 19992 /// If we are extracting a subvector produced by a wide binary operator try 19993 /// to use a narrow binary operator and/or avoid concatenation and extraction. 19994 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG, 19995 bool LegalOperations) { 19996 // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share 19997 // some of these bailouts with other transforms. 19998 19999 if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG, LegalOperations)) 20000 return V; 20001 20002 // The extract index must be a constant, so we can map it to a concat operand. 20003 auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); 20004 if (!ExtractIndexC) 20005 return SDValue(); 20006 20007 // We are looking for an optionally bitcasted wide vector binary operator 20008 // feeding an extract subvector. 20009 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20010 SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0)); 20011 unsigned BOpcode = BinOp.getOpcode(); 20012 if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1) 20013 return SDValue(); 20014 20015 // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be 20016 // reduced to the unary fneg when it is visited, and we probably want to deal 20017 // with fneg in a target-specific way. 20018 if (BOpcode == ISD::FSUB) { 20019 auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true); 20020 if (C && C->getValueAPF().isNegZero()) 20021 return SDValue(); 20022 } 20023 20024 // The binop must be a vector type, so we can extract some fraction of it. 20025 EVT WideBVT = BinOp.getValueType(); 20026 // The optimisations below currently assume we are dealing with fixed length 20027 // vectors. It is possible to add support for scalable vectors, but at the 20028 // moment we've done no analysis to prove whether they are profitable or not. 20029 if (!WideBVT.isFixedLengthVector()) 20030 return SDValue(); 20031 20032 EVT VT = Extract->getValueType(0); 20033 unsigned ExtractIndex = ExtractIndexC->getZExtValue(); 20034 assert(ExtractIndex % VT.getVectorNumElements() == 0 && 20035 "Extract index is not a multiple of the vector length."); 20036 20037 // Bail out if this is not a proper multiple width extraction. 20038 unsigned WideWidth = WideBVT.getSizeInBits(); 20039 unsigned NarrowWidth = VT.getSizeInBits(); 20040 if (WideWidth % NarrowWidth != 0) 20041 return SDValue(); 20042 20043 // Bail out if we are extracting a fraction of a single operation. This can 20044 // occur because we potentially looked through a bitcast of the binop. 20045 unsigned NarrowingRatio = WideWidth / NarrowWidth; 20046 unsigned WideNumElts = WideBVT.getVectorNumElements(); 20047 if (WideNumElts % NarrowingRatio != 0) 20048 return SDValue(); 20049 20050 // Bail out if the target does not support a narrower version of the binop. 20051 EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), 20052 WideNumElts / NarrowingRatio); 20053 if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) 20054 return SDValue(); 20055 20056 // If extraction is cheap, we don't need to look at the binop operands 20057 // for concat ops. The narrow binop alone makes this transform profitable. 
20058 // We can't just reuse the original extract index operand because we may have 20059 // bitcasted. 20060 unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements(); 20061 unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); 20062 if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) && 20063 BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) { 20064 // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N) 20065 SDLoc DL(Extract); 20066 SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL); 20067 SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 20068 BinOp.getOperand(0), NewExtIndex); 20069 SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 20070 BinOp.getOperand(1), NewExtIndex); 20071 SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, 20072 BinOp.getNode()->getFlags()); 20073 return DAG.getBitcast(VT, NarrowBinOp); 20074 } 20075 20076 // Only handle the case where we are doubling and then halving. A larger ratio 20077 // may require more than two narrow binops to replace the wide binop. 20078 if (NarrowingRatio != 2) 20079 return SDValue(); 20080 20081 // TODO: The motivating case for this transform is an x86 AVX1 target. That 20082 // target has temptingly almost legal versions of bitwise logic ops in 256-bit 20083 // flavors, but no other 256-bit integer support. This could be extended to 20084 // handle any binop, but that may require fixing/adding other folds to avoid 20085 // codegen regressions. 20086 if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) 20087 return SDValue(); 20088 20089 // We need at least one concatenation operation of a binop operand to make 20090 // this transform worthwhile. The concat must double the input vector sizes. 20091 auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue { 20092 if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2) 20093 return V.getOperand(ConcatOpNum); 20094 return SDValue(); 20095 }; 20096 SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0))); 20097 SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1))); 20098 20099 if (SubVecL || SubVecR) { 20100 // If a binop operand was not the result of a concat, we must extract a 20101 // half-sized operand for our new narrow binop: 20102 // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN 20103 // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC) 20104 // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN 20105 SDLoc DL(Extract); 20106 SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL); 20107 SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL) 20108 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 20109 BinOp.getOperand(0), IndexC); 20110 20111 SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR) 20112 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 20113 BinOp.getOperand(1), IndexC); 20114 20115 SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); 20116 return DAG.getBitcast(VT, NarrowBinOp); 20117 } 20118 20119 return SDValue(); 20120 } 20121 20122 /// If we are extracting a subvector from a wide vector load, convert to a 20123 /// narrow load to eliminate the extraction: 20124 /// (extract_subvector (load wide vector)) --> (load narrow vector) 20125 static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { 20126 // TODO: Add support for big-endian. The offset calculation must be adjusted. 
20127 if (DAG.getDataLayout().isBigEndian()) 20128 return SDValue(); 20129 20130 auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0)); 20131 auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); 20132 if (!Ld || Ld->getExtensionType() || !Ld->isSimple() || 20133 !ExtIdx) 20134 return SDValue(); 20135 20136 // Allow targets to opt-out. 20137 EVT VT = Extract->getValueType(0); 20138 20139 // We can only create byte sized loads. 20140 if (!VT.isByteSized()) 20141 return SDValue(); 20142 20143 unsigned Index = ExtIdx->getZExtValue(); 20144 unsigned NumElts = VT.getVectorMinNumElements(); 20145 20146 // The definition of EXTRACT_SUBVECTOR states that the index must be a 20147 // multiple of the minimum number of elements in the result type. 20148 assert(Index % NumElts == 0 && "The extract subvector index is not a " 20149 "multiple of the result's element count"); 20150 20151 // It's fine to use TypeSize here as we know the offset will not be negative. 20152 TypeSize Offset = VT.getStoreSize() * (Index / NumElts); 20153 20154 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20155 if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT)) 20156 return SDValue(); 20157 20158 // The narrow load will be offset from the base address of the old load if 20159 // we are extracting from something besides index 0 (little-endian). 20160 SDLoc DL(Extract); 20161 20162 // TODO: Use "BaseIndexOffset" to make this more effective. 20163 SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL); 20164 20165 uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize()); 20166 MachineFunction &MF = DAG.getMachineFunction(); 20167 MachineMemOperand *MMO; 20168 if (Offset.isScalable()) { 20169 MachinePointerInfo MPI = 20170 MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()); 20171 MMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize); 20172 } else 20173 MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset.getFixedSize(), 20174 StoreSize); 20175 20176 SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); 20177 DAG.makeEquivalentMemoryOrdering(Ld, NewLd); 20178 return NewLd; 20179 } 20180 20181 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { 20182 EVT NVT = N->getValueType(0); 20183 SDValue V = N->getOperand(0); 20184 uint64_t ExtIdx = N->getConstantOperandVal(1); 20185 20186 // Extract from UNDEF is UNDEF. 20187 if (V.isUndef()) 20188 return DAG.getUNDEF(NVT); 20189 20190 if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) 20191 if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) 20192 return NarrowLoad; 20193 20194 // Combine an extract of an extract into a single extract_subvector. 
20195 // ext (ext X, C), 0 --> ext X, C 20196 if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) { 20197 if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), 20198 V.getConstantOperandVal(1)) && 20199 TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) { 20200 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0), 20201 V.getOperand(1)); 20202 } 20203 } 20204 20205 // Try to move vector bitcast after extract_subv by scaling extraction index: 20206 // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') 20207 if (V.getOpcode() == ISD::BITCAST && 20208 V.getOperand(0).getValueType().isVector()) { 20209 SDValue SrcOp = V.getOperand(0); 20210 EVT SrcVT = SrcOp.getValueType(); 20211 unsigned SrcNumElts = SrcVT.getVectorMinNumElements(); 20212 unsigned DestNumElts = V.getValueType().getVectorMinNumElements(); 20213 if ((SrcNumElts % DestNumElts) == 0) { 20214 unsigned SrcDestRatio = SrcNumElts / DestNumElts; 20215 ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio; 20216 EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), 20217 NewExtEC); 20218 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { 20219 SDLoc DL(N); 20220 SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL); 20221 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, 20222 V.getOperand(0), NewIndex); 20223 return DAG.getBitcast(NVT, NewExtract); 20224 } 20225 } 20226 if ((DestNumElts % SrcNumElts) == 0) { 20227 unsigned DestSrcRatio = DestNumElts / SrcNumElts; 20228 if (NVT.getVectorElementCount().isKnownMultipleOf(DestSrcRatio)) { 20229 ElementCount NewExtEC = 20230 NVT.getVectorElementCount().divideCoefficientBy(DestSrcRatio); 20231 EVT ScalarVT = SrcVT.getScalarType(); 20232 if ((ExtIdx % DestSrcRatio) == 0) { 20233 SDLoc DL(N); 20234 unsigned IndexValScaled = ExtIdx / DestSrcRatio; 20235 EVT NewExtVT = 20236 EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC); 20237 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { 20238 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); 20239 SDValue NewExtract = 20240 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, 20241 V.getOperand(0), NewIndex); 20242 return DAG.getBitcast(NVT, NewExtract); 20243 } 20244 if (NewExtEC.isScalar() && 20245 TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) { 20246 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); 20247 SDValue NewExtract = 20248 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, 20249 V.getOperand(0), NewIndex); 20250 return DAG.getBitcast(NVT, NewExtract); 20251 } 20252 } 20253 } 20254 } 20255 } 20256 20257 if (V.getOpcode() == ISD::CONCAT_VECTORS) { 20258 unsigned ExtNumElts = NVT.getVectorMinNumElements(); 20259 EVT ConcatSrcVT = V.getOperand(0).getValueType(); 20260 assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() && 20261 "Concat and extract subvector do not change element type"); 20262 assert((ExtIdx % ExtNumElts) == 0 && 20263 "Extract index is not a multiple of the input vector length."); 20264 20265 unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements(); 20266 unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts; 20267 20268 // If the concatenated source types match this extract, it's a direct 20269 // simplification: 20270 // extract_subvec (concat V1, V2, ...), i --> Vi 20271 if (ConcatSrcNumElts == ExtNumElts) 20272 return V.getOperand(ConcatOpIdx); 20273 
20274 // If the concatenated source vectors are a multiple of the length of this extract,
20275 // then extract a fraction of one of those source vectors directly from a
20276 // concat operand. Example:
20277 // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 -->
20278 // v2i8 extract_subvec v8i8 Y, 6
20279 if (NVT.isFixedLengthVector() && ConcatSrcNumElts % ExtNumElts == 0) {
20280 SDLoc DL(N);
20281 unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
20282 assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
20283 "Trying to extract from >1 concat operand?");
20284 assert(NewExtIdx % ExtNumElts == 0 &&
20285 "Extract index is not a multiple of the input vector length.");
20286 SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
20287 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
20288 V.getOperand(ConcatOpIdx), NewIndexC);
20289 }
20290 }
20291
20292 V = peekThroughBitcasts(V);
20293
20294 // If the input is a build vector, try to make a smaller build vector.
20295 if (V.getOpcode() == ISD::BUILD_VECTOR) {
20296 EVT InVT = V.getValueType();
20297 unsigned ExtractSize = NVT.getSizeInBits();
20298 unsigned EltSize = InVT.getScalarSizeInBits();
20299 // Only do this if we won't split any elements.
20300 if (ExtractSize % EltSize == 0) {
20301 unsigned NumElems = ExtractSize / EltSize;
20302 EVT EltVT = InVT.getVectorElementType();
20303 EVT ExtractVT =
20304 NumElems == 1 ? EltVT
20305 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
20306 if ((Level < AfterLegalizeDAG ||
20307 (NumElems == 1 ||
20308 TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
20309 (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
20310 unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize;
20311
20312 if (NumElems == 1) {
20313 SDValue Src = V->getOperand(IdxVal);
20314 if (EltVT != Src.getValueType())
20315 Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
20316 return DAG.getBitcast(NVT, Src);
20317 }
20318
20319 // Extract the pieces from the original build_vector.
20320 SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
20321 V->ops().slice(IdxVal, NumElems));
20322 return DAG.getBitcast(NVT, BuildVec);
20323 }
20324 }
20325 }
20326
20327 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
20328 // Handle only the simple case where the vector being inserted and the
20329 // vector being extracted are of the same size.
20330 EVT SmallVT = V.getOperand(1).getValueType();
20331 if (!NVT.bitsEq(SmallVT))
20332 return SDValue();
20333
20334 // Combine:
20335 // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
20336 // Into:
20337 // indices are equal or bit offsets are equal => V2
20338 // otherwise => (extract_subvec V1, ExtIdx)
20339 uint64_t InsIdx = V.getConstantOperandVal(2);
20340 if (InsIdx * SmallVT.getScalarSizeInBits() ==
20341 ExtIdx * NVT.getScalarSizeInBits())
20342 return DAG.getBitcast(NVT, V.getOperand(1));
20343 return DAG.getNode(
20344 ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
20345 DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
20346 N->getOperand(1));
20347 }
20348
20349 if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))
20350 return NarrowBOp;
20351
20352 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
20353 return SDValue(N, 0);
20354
20355 return SDValue();
20356 }
20357
20358 /// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
20359 /// followed by concatenation.
Narrow vector ops may have better performance 20360 /// than wide ops, and this can unlock further narrowing of other vector ops. 20361 /// Targets can invert this transform later if it is not profitable. 20362 static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf, 20363 SelectionDAG &DAG) { 20364 SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1); 20365 if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || 20366 N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 || 20367 !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) 20368 return SDValue(); 20369 20370 // Split the wide shuffle mask into halves. Any mask element that is accessing 20371 // operand 1 is offset down to account for narrowing of the vectors. 20372 ArrayRef<int> Mask = Shuf->getMask(); 20373 EVT VT = Shuf->getValueType(0); 20374 unsigned NumElts = VT.getVectorNumElements(); 20375 unsigned HalfNumElts = NumElts / 2; 20376 SmallVector<int, 16> Mask0(HalfNumElts, -1); 20377 SmallVector<int, 16> Mask1(HalfNumElts, -1); 20378 for (unsigned i = 0; i != NumElts; ++i) { 20379 if (Mask[i] == -1) 20380 continue; 20381 int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts; 20382 if (i < HalfNumElts) 20383 Mask0[i] = M; 20384 else 20385 Mask1[i - HalfNumElts] = M; 20386 } 20387 20388 // Ask the target if this is a valid transform. 20389 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20390 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 20391 HalfNumElts); 20392 if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) || 20393 !TLI.isShuffleMaskLegal(Mask1, HalfVT)) 20394 return SDValue(); 20395 20396 // shuffle (concat X, undef), (concat Y, undef), Mask --> 20397 // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1) 20398 SDValue X = N0.getOperand(0), Y = N1.getOperand(0); 20399 SDLoc DL(Shuf); 20400 SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0); 20401 SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1); 20402 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1); 20403 } 20404 20405 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, 20406 // or turn a shuffle of a single concat into simpler shuffle then concat. 20407 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { 20408 EVT VT = N->getValueType(0); 20409 unsigned NumElts = VT.getVectorNumElements(); 20410 20411 SDValue N0 = N->getOperand(0); 20412 SDValue N1 = N->getOperand(1); 20413 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 20414 ArrayRef<int> Mask = SVN->getMask(); 20415 20416 SmallVector<SDValue, 4> Ops; 20417 EVT ConcatVT = N0.getOperand(0).getValueType(); 20418 unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); 20419 unsigned NumConcats = NumElts / NumElemsPerConcat; 20420 20421 auto IsUndefMaskElt = [](int i) { return i == -1; }; 20422 20423 // Special case: shuffle(concat(A,B)) can be more efficiently represented 20424 // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high 20425 // half vector elements. 20426 if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() && 20427 llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat), 20428 IsUndefMaskElt)) { 20429 N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), 20430 N0.getOperand(1), 20431 Mask.slice(0, NumElemsPerConcat)); 20432 N1 = DAG.getUNDEF(ConcatVT); 20433 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); 20434 } 20435 20436 // Look at every vector that's inserted. 
We're looking for exact 20437 // subvector-sized copies from a concatenated vector 20438 for (unsigned I = 0; I != NumConcats; ++I) { 20439 unsigned Begin = I * NumElemsPerConcat; 20440 ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat); 20441 20442 // Make sure we're dealing with a copy. 20443 if (llvm::all_of(SubMask, IsUndefMaskElt)) { 20444 Ops.push_back(DAG.getUNDEF(ConcatVT)); 20445 continue; 20446 } 20447 20448 int OpIdx = -1; 20449 for (int i = 0; i != (int)NumElemsPerConcat; ++i) { 20450 if (IsUndefMaskElt(SubMask[i])) 20451 continue; 20452 if ((SubMask[i] % (int)NumElemsPerConcat) != i) 20453 return SDValue(); 20454 int EltOpIdx = SubMask[i] / NumElemsPerConcat; 20455 if (0 <= OpIdx && EltOpIdx != OpIdx) 20456 return SDValue(); 20457 OpIdx = EltOpIdx; 20458 } 20459 assert(0 <= OpIdx && "Unknown concat_vectors op"); 20460 20461 if (OpIdx < (int)N0.getNumOperands()) 20462 Ops.push_back(N0.getOperand(OpIdx)); 20463 else 20464 Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands())); 20465 } 20466 20467 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); 20468 } 20469 20470 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - 20471 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. 20472 // 20473 // SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always 20474 // a simplification in some sense, but it isn't appropriate in general: some 20475 // BUILD_VECTORs are substantially cheaper than others. The general case 20476 // of a BUILD_VECTOR requires inserting each element individually (or 20477 // performing the equivalent in a temporary stack variable). A BUILD_VECTOR of 20478 // all constants is a single constant pool load. A BUILD_VECTOR where each 20479 // element is identical is a splat. A BUILD_VECTOR where most of the operands 20480 // are undef lowers to a small number of element insertions. 20481 // 20482 // To deal with this, we currently use a bunch of mostly arbitrary heuristics. 20483 // We don't fold shuffles where one side is a non-zero constant, and we don't 20484 // fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate 20485 // non-constant operands. This seems to work out reasonably well in practice. 20486 static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, 20487 SelectionDAG &DAG, 20488 const TargetLowering &TLI) { 20489 EVT VT = SVN->getValueType(0); 20490 unsigned NumElts = VT.getVectorNumElements(); 20491 SDValue N0 = SVN->getOperand(0); 20492 SDValue N1 = SVN->getOperand(1); 20493 20494 if (!N0->hasOneUse()) 20495 return SDValue(); 20496 20497 // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as 20498 // discussed above. 20499 if (!N1.isUndef()) { 20500 if (!N1->hasOneUse()) 20501 return SDValue(); 20502 20503 bool N0AnyConst = isAnyConstantBuildVector(N0); 20504 bool N1AnyConst = isAnyConstantBuildVector(N1); 20505 if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode())) 20506 return SDValue(); 20507 if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode())) 20508 return SDValue(); 20509 } 20510 20511 // If both inputs are splats of the same value then we can safely merge this 20512 // to a single BUILD_VECTOR with undef elements based on the shuffle mask. 
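// For example (illustrative):
//   shuffle (build_vector X,X,X,X), (build_vector X,X,X,X), <0,6,u,3>
//     --> build_vector X,X,undef,X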
20513 bool IsSplat = false; 20514 auto *BV0 = dyn_cast<BuildVectorSDNode>(N0); 20515 auto *BV1 = dyn_cast<BuildVectorSDNode>(N1); 20516 if (BV0 && BV1) 20517 if (SDValue Splat0 = BV0->getSplatValue()) 20518 IsSplat = (Splat0 == BV1->getSplatValue()); 20519 20520 SmallVector<SDValue, 8> Ops; 20521 SmallSet<SDValue, 16> DuplicateOps; 20522 for (int M : SVN->getMask()) { 20523 SDValue Op = DAG.getUNDEF(VT.getScalarType()); 20524 if (M >= 0) { 20525 int Idx = M < (int)NumElts ? M : M - NumElts; 20526 SDValue &S = (M < (int)NumElts ? N0 : N1); 20527 if (S.getOpcode() == ISD::BUILD_VECTOR) { 20528 Op = S.getOperand(Idx); 20529 } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) { 20530 SDValue Op0 = S.getOperand(0); 20531 Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType()); 20532 } else { 20533 // Operand can't be combined - bail out. 20534 return SDValue(); 20535 } 20536 } 20537 20538 // Don't duplicate a non-constant BUILD_VECTOR operand unless we're 20539 // generating a splat; semantically, this is fine, but it's likely to 20540 // generate low-quality code if the target can't reconstruct an appropriate 20541 // shuffle. 20542 if (!Op.isUndef() && !isIntOrFPConstant(Op)) 20543 if (!IsSplat && !DuplicateOps.insert(Op).second) 20544 return SDValue(); 20545 20546 Ops.push_back(Op); 20547 } 20548 20549 // BUILD_VECTOR requires all inputs to be of the same type, find the 20550 // maximum type and extend them all. 20551 EVT SVT = VT.getScalarType(); 20552 if (SVT.isInteger()) 20553 for (SDValue &Op : Ops) 20554 SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); 20555 if (SVT != VT.getScalarType()) 20556 for (SDValue &Op : Ops) 20557 Op = TLI.isZExtFree(Op.getValueType(), SVT) 20558 ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT) 20559 : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT); 20560 return DAG.getBuildVector(VT, SDLoc(SVN), Ops); 20561 } 20562 20563 // Match shuffles that can be converted to any_vector_extend_in_reg. 20564 // This is often generated during legalization. 20565 // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) 20566 // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. 20567 static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, 20568 SelectionDAG &DAG, 20569 const TargetLowering &TLI, 20570 bool LegalOperations) { 20571 EVT VT = SVN->getValueType(0); 20572 bool IsBigEndian = DAG.getDataLayout().isBigEndian(); 20573 20574 // TODO Add support for big-endian when we have a test case. 20575 if (!VT.isInteger() || IsBigEndian) 20576 return SDValue(); 20577 20578 unsigned NumElts = VT.getVectorNumElements(); 20579 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 20580 ArrayRef<int> Mask = SVN->getMask(); 20581 SDValue N0 = SVN->getOperand(0); 20582 20583 // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)) 20584 auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) { 20585 for (unsigned i = 0; i != NumElts; ++i) { 20586 if (Mask[i] < 0) 20587 continue; 20588 if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale)) 20589 continue; 20590 return false; 20591 } 20592 return true; 20593 }; 20594 20595 // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for 20596 // power-of-2 extensions as they are the most likely. 
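// For example (illustrative), with NumElts == 8 we try Scale == 2, which
// matches masks like <0,u,1,u,2,u,3,u>, and Scale == 4, which matches masks
// like <0,u,u,u,1,u,u,u>.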
20597 for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { 20598 // Check for non power of 2 vector sizes 20599 if (NumElts % Scale != 0) 20600 continue; 20601 if (!isAnyExtend(Scale)) 20602 continue; 20603 20604 EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale); 20605 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); 20606 // Never create an illegal type. Only create unsupported operations if we 20607 // are pre-legalization. 20608 if (TLI.isTypeLegal(OutVT)) 20609 if (!LegalOperations || 20610 TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) 20611 return DAG.getBitcast(VT, 20612 DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, 20613 SDLoc(SVN), OutVT, N0)); 20614 } 20615 20616 return SDValue(); 20617 } 20618 20619 // Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of 20620 // each source element of a large type into the lowest elements of a smaller 20621 // destination type. This is often generated during legalization. 20622 // If the source node itself was a '*_extend_vector_inreg' node then we should 20623 // then be able to remove it. 20624 static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, 20625 SelectionDAG &DAG) { 20626 EVT VT = SVN->getValueType(0); 20627 bool IsBigEndian = DAG.getDataLayout().isBigEndian(); 20628 20629 // TODO Add support for big-endian when we have a test case. 20630 if (!VT.isInteger() || IsBigEndian) 20631 return SDValue(); 20632 20633 SDValue N0 = peekThroughBitcasts(SVN->getOperand(0)); 20634 20635 unsigned Opcode = N0.getOpcode(); 20636 if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && 20637 Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && 20638 Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) 20639 return SDValue(); 20640 20641 SDValue N00 = N0.getOperand(0); 20642 ArrayRef<int> Mask = SVN->getMask(); 20643 unsigned NumElts = VT.getVectorNumElements(); 20644 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 20645 unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); 20646 unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits(); 20647 20648 if (ExtDstSizeInBits % ExtSrcSizeInBits != 0) 20649 return SDValue(); 20650 unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits; 20651 20652 // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> 20653 // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> 20654 // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1> 20655 auto isTruncate = [&Mask, &NumElts](unsigned Scale) { 20656 for (unsigned i = 0; i != NumElts; ++i) { 20657 if (Mask[i] < 0) 20658 continue; 20659 if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale)) 20660 continue; 20661 return false; 20662 } 20663 return true; 20664 }; 20665 20666 // At the moment we just handle the case where we've truncated back to the 20667 // same size as before the extension. 20668 // TODO: handle more extension/truncation cases as cases arise. 20669 if (EltSizeInBits != ExtSrcSizeInBits) 20670 return SDValue(); 20671 20672 // We can remove *extend_vector_inreg only if the truncation happens at 20673 // the same scale as the extension. 
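// For example (illustrative, little-endian):
//   (v8i16 shuffle<0,2,4,6,u,u,u,u>
//       (v8i16 bitcast (v4i32 any_extend_vector_inreg (v8i16 X))))
// folds to X, because the defined result lanes are exactly the low elements
// of X and the remaining lanes are undef.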
20674 if (isTruncate(ExtScale)) 20675 return DAG.getBitcast(VT, N00); 20676 20677 return SDValue(); 20678 } 20679 20680 // Combine shuffles of splat-shuffles of the form: 20681 // shuffle (shuffle V, undef, splat-mask), undef, M 20682 // If splat-mask contains undef elements, we need to be careful about 20683 // introducing undef's in the folded mask which are not the result of composing 20684 // the masks of the shuffles. 20685 static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf, 20686 SelectionDAG &DAG) { 20687 if (!Shuf->getOperand(1).isUndef()) 20688 return SDValue(); 20689 auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0)); 20690 if (!Splat || !Splat->isSplat()) 20691 return SDValue(); 20692 20693 ArrayRef<int> ShufMask = Shuf->getMask(); 20694 ArrayRef<int> SplatMask = Splat->getMask(); 20695 assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch"); 20696 20697 // Prefer simplifying to the splat-shuffle, if possible. This is legal if 20698 // every undef mask element in the splat-shuffle has a corresponding undef 20699 // element in the user-shuffle's mask or if the composition of mask elements 20700 // would result in undef. 20701 // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask): 20702 // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u] 20703 // In this case it is not legal to simplify to the splat-shuffle because we 20704 // may be exposing the users of the shuffle an undef element at index 1 20705 // which was not there before the combine. 20706 // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u] 20707 // In this case the composition of masks yields SplatMask, so it's ok to 20708 // simplify to the splat-shuffle. 20709 // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u] 20710 // In this case the composed mask includes all undef elements of SplatMask 20711 // and in addition sets element zero to undef. It is safe to simplify to 20712 // the splat-shuffle. 20713 auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask, 20714 ArrayRef<int> SplatMask) { 20715 for (unsigned i = 0, e = UserMask.size(); i != e; ++i) 20716 if (UserMask[i] != -1 && SplatMask[i] == -1 && 20717 SplatMask[UserMask[i]] != -1) 20718 return false; 20719 return true; 20720 }; 20721 if (CanSimplifyToExistingSplat(ShufMask, SplatMask)) 20722 return Shuf->getOperand(0); 20723 20724 // Create a new shuffle with a mask that is composed of the two shuffles' 20725 // masks. 20726 SmallVector<int, 32> NewMask; 20727 for (int Idx : ShufMask) 20728 NewMask.push_back(Idx == -1 ? 
-1 : SplatMask[Idx]); 20729 20730 return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), 20731 Splat->getOperand(0), Splat->getOperand(1), 20732 NewMask); 20733 } 20734 20735 /// Combine shuffle of shuffle of the form: 20736 /// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X 20737 static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf, 20738 SelectionDAG &DAG) { 20739 if (!OuterShuf->getOperand(1).isUndef()) 20740 return SDValue(); 20741 auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0)); 20742 if (!InnerShuf || !InnerShuf->getOperand(1).isUndef()) 20743 return SDValue(); 20744 20745 ArrayRef<int> OuterMask = OuterShuf->getMask(); 20746 ArrayRef<int> InnerMask = InnerShuf->getMask(); 20747 unsigned NumElts = OuterMask.size(); 20748 assert(NumElts == InnerMask.size() && "Mask length mismatch"); 20749 SmallVector<int, 32> CombinedMask(NumElts, -1); 20750 int SplatIndex = -1; 20751 for (unsigned i = 0; i != NumElts; ++i) { 20752 // Undef lanes remain undef. 20753 int OuterMaskElt = OuterMask[i]; 20754 if (OuterMaskElt == -1) 20755 continue; 20756 20757 // Peek through the shuffle masks to get the underlying source element. 20758 int InnerMaskElt = InnerMask[OuterMaskElt]; 20759 if (InnerMaskElt == -1) 20760 continue; 20761 20762 // Initialize the splatted element. 20763 if (SplatIndex == -1) 20764 SplatIndex = InnerMaskElt; 20765 20766 // Non-matching index - this is not a splat. 20767 if (SplatIndex != InnerMaskElt) 20768 return SDValue(); 20769 20770 CombinedMask[i] = InnerMaskElt; 20771 } 20772 assert((all_of(CombinedMask, [](int M) { return M == -1; }) || 20773 getSplatIndex(CombinedMask) != -1) && 20774 "Expected a splat mask"); 20775 20776 // TODO: The transform may be a win even if the mask is not legal. 20777 EVT VT = OuterShuf->getValueType(0); 20778 assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types"); 20779 if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT)) 20780 return SDValue(); 20781 20782 return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0), 20783 InnerShuf->getOperand(1), CombinedMask); 20784 } 20785 20786 /// If the shuffle mask is taking exactly one element from the first vector 20787 /// operand and passing through all other elements from the second vector 20788 /// operand, return the index of the mask element that is choosing an element 20789 /// from the first operand. Otherwise, return -1. 20790 static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) { 20791 int MaskSize = Mask.size(); 20792 int EltFromOp0 = -1; 20793 // TODO: This does not match if there are undef elements in the shuffle mask. 20794 // Should we ignore undefs in the shuffle mask instead? The trade-off is 20795 // removing an instruction (a shuffle), but losing the knowledge that some 20796 // vector lanes are not needed. 20797 for (int i = 0; i != MaskSize; ++i) { 20798 if (Mask[i] >= 0 && Mask[i] < MaskSize) { 20799 // We're looking for a shuffle of exactly one element from operand 0. 20800 if (EltFromOp0 != -1) 20801 return -1; 20802 EltFromOp0 = i; 20803 } else if (Mask[i] != i + MaskSize) { 20804 // Nothing from operand 1 can change lanes. 20805 return -1; 20806 } 20807 } 20808 return EltFromOp0; 20809 } 20810 20811 /// If a shuffle inserts exactly one element from a source vector operand into 20812 /// another vector operand and we can access the specified element as a scalar, 20813 /// then we can eliminate the shuffle. 
20814 static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, 20815 SelectionDAG &DAG) { 20816 // First, check if we are taking one element of a vector and shuffling that 20817 // element into another vector. 20818 ArrayRef<int> Mask = Shuf->getMask(); 20819 SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end()); 20820 SDValue Op0 = Shuf->getOperand(0); 20821 SDValue Op1 = Shuf->getOperand(1); 20822 int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask); 20823 if (ShufOp0Index == -1) { 20824 // Commute mask and check again. 20825 ShuffleVectorSDNode::commuteMask(CommutedMask); 20826 ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask); 20827 if (ShufOp0Index == -1) 20828 return SDValue(); 20829 // Commute operands to match the commuted shuffle mask. 20830 std::swap(Op0, Op1); 20831 Mask = CommutedMask; 20832 } 20833 20834 // The shuffle inserts exactly one element from operand 0 into operand 1. 20835 // Now see if we can access that element as a scalar via a real insert element 20836 // instruction. 20837 // TODO: We can try harder to locate the element as a scalar. Examples: it 20838 // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant. 20839 assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() && 20840 "Shuffle mask value must be from operand 0"); 20841 if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT) 20842 return SDValue(); 20843 20844 auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2)); 20845 if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index]) 20846 return SDValue(); 20847 20848 // There's an existing insertelement with constant insertion index, so we 20849 // don't need to check the legality/profitability of a replacement operation 20850 // that differs at most in the constant value. The target should be able to 20851 // lower any of those in a similar way. If not, legalization will expand this 20852 // to a scalar-to-vector plus shuffle. 20853 // 20854 // Note that the shuffle may move the scalar from the position that the insert 20855 // element used. Therefore, our new insert element occurs at the shuffle's 20856 // mask index value, not the insert's index value. 20857 // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C' 20858 SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf)); 20859 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(), 20860 Op1, Op0.getOperand(1), NewInsIndex); 20861 } 20862 20863 /// If we have a unary shuffle of a shuffle, see if it can be folded away 20864 /// completely. This has the potential to lose undef knowledge because the first 20865 /// shuffle may not have an undef mask element where the second one does. So 20866 /// only call this after doing simplifications based on demanded elements. 20867 static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) { 20868 // shuf (shuf0 X, Y, Mask0), undef, Mask 20869 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0)); 20870 if (!Shuf0 || !Shuf->getOperand(1).isUndef()) 20871 return SDValue(); 20872 20873 ArrayRef<int> Mask = Shuf->getMask(); 20874 ArrayRef<int> Mask0 = Shuf0->getMask(); 20875 for (int i = 0, e = (int)Mask.size(); i != e; ++i) { 20876 // Ignore undef elements. 
20877 if (Mask[i] == -1) 20878 continue; 20879 assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value"); 20880 20881 // Is the element of the shuffle operand chosen by this shuffle the same as 20882 // the element chosen by the shuffle operand itself? 20883 if (Mask0[Mask[i]] != Mask0[i]) 20884 return SDValue(); 20885 } 20886 // Every element of this shuffle is identical to the result of the previous 20887 // shuffle, so we can replace this value. 20888 return Shuf->getOperand(0); 20889 } 20890 20891 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { 20892 EVT VT = N->getValueType(0); 20893 unsigned NumElts = VT.getVectorNumElements(); 20894 20895 SDValue N0 = N->getOperand(0); 20896 SDValue N1 = N->getOperand(1); 20897 20898 assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG"); 20899 20900 // Canonicalize shuffle undef, undef -> undef 20901 if (N0.isUndef() && N1.isUndef()) 20902 return DAG.getUNDEF(VT); 20903 20904 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 20905 20906 // Canonicalize shuffle v, v -> v, undef 20907 if (N0 == N1) { 20908 SmallVector<int, 8> NewMask; 20909 for (unsigned i = 0; i != NumElts; ++i) { 20910 int Idx = SVN->getMaskElt(i); 20911 if (Idx >= (int)NumElts) Idx -= NumElts; 20912 NewMask.push_back(Idx); 20913 } 20914 return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask); 20915 } 20916 20917 // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. 20918 if (N0.isUndef()) 20919 return DAG.getCommutedVectorShuffle(*SVN); 20920 20921 // Remove references to rhs if it is undef 20922 if (N1.isUndef()) { 20923 bool Changed = false; 20924 SmallVector<int, 8> NewMask; 20925 for (unsigned i = 0; i != NumElts; ++i) { 20926 int Idx = SVN->getMaskElt(i); 20927 if (Idx >= (int)NumElts) { 20928 Idx = -1; 20929 Changed = true; 20930 } 20931 NewMask.push_back(Idx); 20932 } 20933 if (Changed) 20934 return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); 20935 } 20936 20937 if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG)) 20938 return InsElt; 20939 20940 // A shuffle of a single vector that is a splatted value can always be folded. 20941 if (SDValue V = combineShuffleOfSplatVal(SVN, DAG)) 20942 return V; 20943 20944 if (SDValue V = formSplatFromShuffles(SVN, DAG)) 20945 return V; 20946 20947 // If it is a splat, check if the argument vector is another splat or a 20948 // build_vector. 
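// For example:
// splat (add V1, V2), 2 --> splat of the scalar (add (extelt V1, 2), (extelt V2, 2))
// splat (build_vector a, b, c, d), 2 --> build_vector c, c, c, c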
20949 if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { 20950 int SplatIndex = SVN->getSplatIndex(); 20951 if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) && 20952 TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) { 20953 // splat (vector_bo L, R), Index --> 20954 // splat (scalar_bo (extelt L, Index), (extelt R, Index)) 20955 SDValue L = N0.getOperand(0), R = N0.getOperand(1); 20956 SDLoc DL(N); 20957 EVT EltVT = VT.getScalarType(); 20958 SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL); 20959 SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index); 20960 SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index); 20961 SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, 20962 N0.getNode()->getFlags()); 20963 SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO); 20964 SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0); 20965 return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask); 20966 } 20967 20968 // If this is a bit convert that changes the element type of the vector but 20969 // not the number of vector elements, look through it. Be careful not to 20970 // look though conversions that change things like v4f32 to v2f64. 20971 SDNode *V = N0.getNode(); 20972 if (V->getOpcode() == ISD::BITCAST) { 20973 SDValue ConvInput = V->getOperand(0); 20974 if (ConvInput.getValueType().isVector() && 20975 ConvInput.getValueType().getVectorNumElements() == NumElts) 20976 V = ConvInput.getNode(); 20977 } 20978 20979 if (V->getOpcode() == ISD::BUILD_VECTOR) { 20980 assert(V->getNumOperands() == NumElts && 20981 "BUILD_VECTOR has wrong number of operands"); 20982 SDValue Base; 20983 bool AllSame = true; 20984 for (unsigned i = 0; i != NumElts; ++i) { 20985 if (!V->getOperand(i).isUndef()) { 20986 Base = V->getOperand(i); 20987 break; 20988 } 20989 } 20990 // Splat of <u, u, u, u>, return <u, u, u, u> 20991 if (!Base.getNode()) 20992 return N0; 20993 for (unsigned i = 0; i != NumElts; ++i) { 20994 if (V->getOperand(i) != Base) { 20995 AllSame = false; 20996 break; 20997 } 20998 } 20999 // Splat of <x, x, x, x>, return <x, x, x, x> 21000 if (AllSame) 21001 return N0; 21002 21003 // Canonicalize any other splat as a build_vector. 21004 SDValue Splatted = V->getOperand(SplatIndex); 21005 SmallVector<SDValue, 8> Ops(NumElts, Splatted); 21006 SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops); 21007 21008 // We may have jumped through bitcasts, so the type of the 21009 // BUILD_VECTOR may not match the type of the shuffle. 21010 if (V->getValueType(0) != VT) 21011 NewBV = DAG.getBitcast(VT, NewBV); 21012 return NewBV; 21013 } 21014 } 21015 21016 // Simplify source operands based on shuffle mask. 21017 if (SimplifyDemandedVectorElts(SDValue(N, 0))) 21018 return SDValue(N, 0); 21019 21020 // This is intentionally placed after demanded elements simplification because 21021 // it could eliminate knowledge of undef elements created by this shuffle. 21022 if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN)) 21023 return ShufOp; 21024 21025 // Match shuffles that can be converted to any_vector_extend_in_reg. 21026 if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) 21027 return V; 21028 21029 // Combine "truncate_vector_in_reg" style shuffles. 
21030 if (SDValue V = combineTruncationShuffle(SVN, DAG)) 21031 return V; 21032 21033 if (N0.getOpcode() == ISD::CONCAT_VECTORS && 21034 Level < AfterLegalizeVectorOps && 21035 (N1.isUndef() || 21036 (N1.getOpcode() == ISD::CONCAT_VECTORS && 21037 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) { 21038 if (SDValue V = partitionShuffleOfConcats(N, DAG)) 21039 return V; 21040 } 21041 21042 // A shuffle of a concat of the same narrow vector can be reduced to use 21043 // only low-half elements of a concat with undef: 21044 // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask' 21045 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() && 21046 N0.getNumOperands() == 2 && 21047 N0.getOperand(0) == N0.getOperand(1)) { 21048 int HalfNumElts = (int)NumElts / 2; 21049 SmallVector<int, 8> NewMask; 21050 for (unsigned i = 0; i != NumElts; ++i) { 21051 int Idx = SVN->getMaskElt(i); 21052 if (Idx >= HalfNumElts) { 21053 assert(Idx < (int)NumElts && "Shuffle mask chooses undef op"); 21054 Idx -= HalfNumElts; 21055 } 21056 NewMask.push_back(Idx); 21057 } 21058 if (TLI.isShuffleMaskLegal(NewMask, VT)) { 21059 SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType()); 21060 SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 21061 N0.getOperand(0), UndefVec); 21062 return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask); 21063 } 21064 } 21065 21066 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - 21067 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. 21068 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) 21069 if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI)) 21070 return Res; 21071 21072 // If this shuffle only has a single input that is a bitcasted shuffle, 21073 // attempt to merge the 2 shuffles and suitably bitcast the inputs/output 21074 // back to their original types. 21075 if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && 21076 N1.isUndef() && Level < AfterLegalizeVectorOps && 21077 TLI.isTypeLegal(VT)) { 21078 21079 SDValue BC0 = peekThroughOneUseBitcasts(N0); 21080 if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) { 21081 EVT SVT = VT.getScalarType(); 21082 EVT InnerVT = BC0->getValueType(0); 21083 EVT InnerSVT = InnerVT.getScalarType(); 21084 21085 // Determine which shuffle works with the smaller scalar type. 21086 EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT; 21087 EVT ScaleSVT = ScaleVT.getScalarType(); 21088 21089 if (TLI.isTypeLegal(ScaleVT) && 21090 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) && 21091 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) { 21092 int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits(); 21093 int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits(); 21094 21095 // Scale the shuffle masks to the smaller scalar type. 21096 ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0); 21097 SmallVector<int, 8> InnerMask; 21098 SmallVector<int, 8> OuterMask; 21099 narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask); 21100 narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask); 21101 21102 // Merge the shuffle masks. 21103 SmallVector<int, 8> NewMask; 21104 for (int M : OuterMask) 21105 NewMask.push_back(M < 0 ? -1 : InnerMask[M]); 21106 21107 // Test for shuffle mask legality over both commutations. 
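// For example (illustrative types): a v4i32 shuffle with mask <0,1,u,u> of a
// bitcast of a v2i64 shuffle with mask <1,u> scales the inner mask to
// <2,3,u,u> and composes the two into the merged v4i32 mask <2,3,u,u>, which
// is what gets tested for legality below (directly and in commuted form).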
21108 SDValue SV0 = BC0->getOperand(0); 21109 SDValue SV1 = BC0->getOperand(1); 21110 bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); 21111 if (!LegalMask) { 21112 std::swap(SV0, SV1); 21113 ShuffleVectorSDNode::commuteMask(NewMask); 21114 LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); 21115 } 21116 21117 if (LegalMask) { 21118 SV0 = DAG.getBitcast(ScaleVT, SV0); 21119 SV1 = DAG.getBitcast(ScaleVT, SV1); 21120 return DAG.getBitcast( 21121 VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask)); 21122 } 21123 } 21124 } 21125 } 21126 21127 // Compute the combined shuffle mask for a shuffle with SV0 as the first 21128 // operand, and SV1 as the second operand. 21129 // i.e. Merge SVN(OtherSVN, N1) -> shuffle(SV0, SV1, Mask) iff Commute = false 21130 // Merge SVN(N1, OtherSVN) -> shuffle(SV0, SV1, Mask') iff Commute = true 21131 auto MergeInnerShuffle = 21132 [NumElts, &VT](bool Commute, ShuffleVectorSDNode *SVN, 21133 ShuffleVectorSDNode *OtherSVN, SDValue N1, 21134 const TargetLowering &TLI, SDValue &SV0, SDValue &SV1, 21135 SmallVectorImpl<int> &Mask) -> bool { 21136 // Don't try to fold splats; they're likely to simplify somehow, or they 21137 // might be free. 21138 if (OtherSVN->isSplat()) 21139 return false; 21140 21141 SV0 = SV1 = SDValue(); 21142 Mask.clear(); 21143 21144 for (unsigned i = 0; i != NumElts; ++i) { 21145 int Idx = SVN->getMaskElt(i); 21146 if (Idx < 0) { 21147 // Propagate Undef. 21148 Mask.push_back(Idx); 21149 continue; 21150 } 21151 21152 if (Commute) 21153 Idx = (Idx < (int)NumElts) ? (Idx + NumElts) : (Idx - NumElts); 21154 21155 SDValue CurrentVec; 21156 if (Idx < (int)NumElts) { 21157 // This shuffle index refers to the inner shuffle N0. Lookup the inner 21158 // shuffle mask to identify which vector is actually referenced. 21159 Idx = OtherSVN->getMaskElt(Idx); 21160 if (Idx < 0) { 21161 // Propagate Undef. 21162 Mask.push_back(Idx); 21163 continue; 21164 } 21165 CurrentVec = (Idx < (int)NumElts) ? OtherSVN->getOperand(0) 21166 : OtherSVN->getOperand(1); 21167 } else { 21168 // This shuffle index references an element within N1. 21169 CurrentVec = N1; 21170 } 21171 21172 // Simple case where 'CurrentVec' is UNDEF. 21173 if (CurrentVec.isUndef()) { 21174 Mask.push_back(-1); 21175 continue; 21176 } 21177 21178 // Canonicalize the shuffle index. We don't know yet if CurrentVec 21179 // will be the first or second operand of the combined shuffle. 21180 Idx = Idx % NumElts; 21181 if (!SV0.getNode() || SV0 == CurrentVec) { 21182 // Ok. CurrentVec is the left hand side. 21183 // Update the mask accordingly. 21184 SV0 = CurrentVec; 21185 Mask.push_back(Idx); 21186 continue; 21187 } 21188 if (!SV1.getNode() || SV1 == CurrentVec) { 21189 // Ok. CurrentVec is the right hand side. 21190 // Update the mask accordingly. 21191 SV1 = CurrentVec; 21192 Mask.push_back(Idx + NumElts); 21193 continue; 21194 } 21195 21196 // Last chance - see if the vector is another shuffle and if it 21197 // uses one of the existing candidate shuffle ops. 21198 if (auto *CurrentSVN = dyn_cast<ShuffleVectorSDNode>(CurrentVec)) { 21199 int InnerIdx = CurrentSVN->getMaskElt(Idx); 21200 if (InnerIdx < 0) { 21201 Mask.push_back(-1); 21202 continue; 21203 } 21204 SDValue InnerVec = (InnerIdx < (int)NumElts) 21205 ? 
CurrentSVN->getOperand(0) 21206 : CurrentSVN->getOperand(1); 21207 if (InnerVec.isUndef()) { 21208 Mask.push_back(-1); 21209 continue; 21210 } 21211 InnerIdx %= NumElts; 21212 if (InnerVec == SV0) { 21213 Mask.push_back(InnerIdx); 21214 continue; 21215 } 21216 if (InnerVec == SV1) { 21217 Mask.push_back(InnerIdx + NumElts); 21218 continue; 21219 } 21220 } 21221 21222 // Bail out if we cannot convert the shuffle pair into a single shuffle. 21223 return false; 21224 } 21225 21226 if (llvm::all_of(Mask, [](int M) { return M < 0; })) 21227 return true; 21228 21229 // Avoid introducing shuffles with illegal mask. 21230 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) 21231 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) 21232 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) 21233 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) 21234 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) 21235 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) 21236 if (TLI.isShuffleMaskLegal(Mask, VT)) 21237 return true; 21238 21239 std::swap(SV0, SV1); 21240 ShuffleVectorSDNode::commuteMask(Mask); 21241 return TLI.isShuffleMaskLegal(Mask, VT); 21242 }; 21243 21244 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { 21245 // Canonicalize shuffles according to rules: 21246 // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) 21247 // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) 21248 // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 21249 if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && 21250 N0.getOpcode() != ISD::VECTOR_SHUFFLE) { 21251 // The incoming shuffle must be of the same type as the result of the 21252 // current shuffle. 21253 assert(N1->getOperand(0).getValueType() == VT && 21254 "Shuffle types don't match"); 21255 21256 SDValue SV0 = N1->getOperand(0); 21257 SDValue SV1 = N1->getOperand(1); 21258 bool HasSameOp0 = N0 == SV0; 21259 bool IsSV1Undef = SV1.isUndef(); 21260 if (HasSameOp0 || IsSV1Undef || N0 == SV1) 21261 // Commute the operands of this shuffle so merging below will trigger. 21262 return DAG.getCommutedVectorShuffle(*SVN); 21263 } 21264 21265 // Canonicalize splat shuffles to the RHS to improve merging below. 21266 // shuffle(splat(A,u), shuffle(C,D)) -> shuffle'(shuffle(C,D), splat(A,u)) 21267 if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && 21268 N1.getOpcode() == ISD::VECTOR_SHUFFLE && 21269 cast<ShuffleVectorSDNode>(N0)->isSplat() && 21270 !cast<ShuffleVectorSDNode>(N1)->isSplat()) { 21271 return DAG.getCommutedVectorShuffle(*SVN); 21272 } 21273 21274 // Try to fold according to rules: 21275 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) 21276 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) 21277 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) 21278 // Don't try to fold shuffles with illegal type. 21279 // Only fold if this shuffle is the only user of the other shuffle. 21280 // Try matching shuffle(C,shuffle(A,B)) commutted patterns as well. 21281 for (int i = 0; i != 2; ++i) { 21282 if (N->getOperand(i).getOpcode() == ISD::VECTOR_SHUFFLE && 21283 N->isOnlyUserOf(N->getOperand(i).getNode())) { 21284 // The incoming shuffle must be of the same type as the result of the 21285 // current shuffle. 
21286 auto *OtherSV = cast<ShuffleVectorSDNode>(N->getOperand(i));
21287 assert(OtherSV->getOperand(0).getValueType() == VT &&
21288 "Shuffle types don't match");
21289
21290 SDValue SV0, SV1;
21291 SmallVector<int, 4> Mask;
21292 if (MergeInnerShuffle(i != 0, SVN, OtherSV, N->getOperand(1 - i), TLI,
21293 SV0, SV1, Mask)) {
21294 // Check if all indices in Mask are Undef. If so, propagate Undef.
21295 if (llvm::all_of(Mask, [](int M) { return M < 0; }))
21296 return DAG.getUNDEF(VT);
21297
21298 return DAG.getVectorShuffle(VT, SDLoc(N),
21299 SV0 ? SV0 : DAG.getUNDEF(VT),
21300 SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
21301 }
21302 }
21303 }
21304
21305 // Merge shuffles through binops if we are able to merge it with at least
21306 // one other shuffle.
21307 // shuffle(bop(shuffle(x,y),shuffle(z,w)),undef)
21308 // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
21309 unsigned SrcOpcode = N0.getOpcode();
21310 if (TLI.isBinOp(SrcOpcode) && N->isOnlyUserOf(N0.getNode()) &&
21311 (N1.isUndef() ||
21312 (SrcOpcode == N1.getOpcode() && N->isOnlyUserOf(N1.getNode())))) {
21313 // Get binop source ops, or just pass on the undef.
21314 SDValue Op00 = N0.getOperand(0);
21315 SDValue Op01 = N0.getOperand(1);
21316 SDValue Op10 = N1.isUndef() ? N1 : N1.getOperand(0);
21317 SDValue Op11 = N1.isUndef() ? N1 : N1.getOperand(1);
21318 // TODO: We might be able to relax the VT check but we don't currently
21319 // have any isBinOp() that has different result/ops VTs so play safe until
21320 // we have test coverage.
21321 if (Op00.getValueType() == VT && Op10.getValueType() == VT &&
21322 Op01.getValueType() == VT && Op11.getValueType() == VT &&
21323 (Op00.getOpcode() == ISD::VECTOR_SHUFFLE ||
21324 Op10.getOpcode() == ISD::VECTOR_SHUFFLE ||
21325 Op01.getOpcode() == ISD::VECTOR_SHUFFLE ||
21326 Op11.getOpcode() == ISD::VECTOR_SHUFFLE)) {
21327 auto CanMergeInnerShuffle = [&](SDValue &SV0, SDValue &SV1,
21328 SmallVectorImpl<int> &Mask, bool LeftOp,
21329 bool Commute) {
21330 SDValue InnerN = Commute ? N1 : N0;
21331 SDValue Op0 = LeftOp ? Op00 : Op01;
21332 SDValue Op1 = LeftOp ? Op10 : Op11;
21333 if (Commute)
21334 std::swap(Op0, Op1);
21335 // Only accept the merged shuffle if we don't introduce undef elements,
21336 // or the inner shuffle already contained undef elements.
21337 auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(Op0);
21338 return SVN0 && InnerN->isOnlyUserOf(SVN0) &&
21339 MergeInnerShuffle(Commute, SVN, SVN0, Op1, TLI, SV0, SV1,
21340 Mask) &&
21341 (llvm::any_of(SVN0->getMask(), [](int M) { return M < 0; }) ||
21342 llvm::none_of(Mask, [](int M) { return M < 0; }));
21343 };
21344
21345 // Ensure we don't increase the number of shuffles - we must merge a
21346 // shuffle from at least one of the LHS and RHS ops.
21347 bool MergedLeft = false;
21348 SDValue LeftSV0, LeftSV1;
21349 SmallVector<int, 4> LeftMask;
21350 if (CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, false) ||
21351 CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, true)) {
21352 MergedLeft = true;
21353 } else {
21354 LeftMask.assign(SVN->getMask().begin(), SVN->getMask().end());
21355 LeftSV0 = Op00, LeftSV1 = Op10;
21356 }
21357
21358 bool MergedRight = false;
21359 SDValue RightSV0, RightSV1;
21360 SmallVector<int, 4> RightMask;
21361 if (CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, false) ||
21362 CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, true)) {
21363 MergedRight = true;
21364 } else {
21365 RightMask.assign(SVN->getMask().begin(), SVN->getMask().end());
21366 RightSV0 = Op01, RightSV1 = Op11;
21367 }
21368
21369 if (MergedLeft || MergedRight) {
21370 SDLoc DL(N);
21371 SDValue LHS = DAG.getVectorShuffle(
21372 VT, DL, LeftSV0 ? LeftSV0 : DAG.getUNDEF(VT),
21373 LeftSV1 ? LeftSV1 : DAG.getUNDEF(VT), LeftMask);
21374 SDValue RHS = DAG.getVectorShuffle(
21375 VT, DL, RightSV0 ? RightSV0 : DAG.getUNDEF(VT),
21376 RightSV1 ? RightSV1 : DAG.getUNDEF(VT), RightMask);
21377 return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
21378 }
21379 }
21380 }
21381 }
21382
21383 if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
21384 return V;
21385
21386 return SDValue();
21387 }
21388
21389 SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
21390 SDValue InVal = N->getOperand(0);
21391 EVT VT = N->getValueType(0);
21392
21393 // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
21394 // with a VECTOR_SHUFFLE and possible truncate.
21395 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21396 VT.isFixedLengthVector() &&
21397 InVal->getOperand(0).getValueType().isFixedLengthVector()) {
21398 SDValue InVec = InVal->getOperand(0);
21399 SDValue EltNo = InVal->getOperand(1);
21400 auto InVecT = InVec.getValueType();
21401 if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
21402 SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
21403 int Elt = C0->getZExtValue();
21404 NewMask[0] = Elt;
21405 // If we have an implicit truncate, do the truncate here as long as it's legal.
21406 // If it's not legal, fall through to the shuffle-based lowering below.
21407 if (VT.getScalarType() != InVal.getValueType() &&
21408 InVal.getValueType().isScalarInteger() &&
21409 isTypeLegal(VT.getScalarType())) {
21410 SDValue Val =
21411 DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
21412 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
21413 }
21414 if (VT.getScalarType() == InVecT.getScalarType() &&
21415 VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
21416 SDValue LegalShuffle =
21417 TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
21418 DAG.getUNDEF(InVecT), NewMask, DAG);
21419 if (LegalShuffle) {
21420 // If the initial vector is the correct size, this shuffle is a
21421 // valid result.
21422 if (VT == InVecT)
21423 return LegalShuffle;
21424 // If not, we must truncate the vector.
21425 if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) { 21426 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N)); 21427 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), 21428 InVecT.getVectorElementType(), 21429 VT.getVectorNumElements()); 21430 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, 21431 LegalShuffle, ZeroIdx); 21432 } 21433 } 21434 } 21435 } 21436 } 21437 21438 return SDValue(); 21439 } 21440 21441 SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { 21442 EVT VT = N->getValueType(0); 21443 SDValue N0 = N->getOperand(0); 21444 SDValue N1 = N->getOperand(1); 21445 SDValue N2 = N->getOperand(2); 21446 uint64_t InsIdx = N->getConstantOperandVal(2); 21447 21448 // If inserting an UNDEF, just return the original vector. 21449 if (N1.isUndef()) 21450 return N0; 21451 21452 // If this is an insert of an extracted vector into an undef vector, we can 21453 // just use the input to the extract. 21454 if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && 21455 N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) 21456 return N1.getOperand(0); 21457 21458 // If we are inserting a bitcast value into an undef, with the same 21459 // number of elements, just use the bitcast input of the extract. 21460 // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 -> 21461 // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2) 21462 if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST && 21463 N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && 21464 N1.getOperand(0).getOperand(1) == N2 && 21465 N1.getOperand(0).getOperand(0).getValueType().getVectorElementCount() == 21466 VT.getVectorElementCount() && 21467 N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() == 21468 VT.getSizeInBits()) { 21469 return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); 21470 } 21471 21472 // If both N1 and N2 are bitcast values on which insert_subvector 21473 // would makes sense, pull the bitcast through. 21474 // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 -> 21475 // BITCAST (INSERT_SUBVECTOR N0 N1 N2) 21476 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) { 21477 SDValue CN0 = N0.getOperand(0); 21478 SDValue CN1 = N1.getOperand(0); 21479 EVT CN0VT = CN0.getValueType(); 21480 EVT CN1VT = CN1.getValueType(); 21481 if (CN0VT.isVector() && CN1VT.isVector() && 21482 CN0VT.getVectorElementType() == CN1VT.getVectorElementType() && 21483 CN0VT.getVectorElementCount() == VT.getVectorElementCount()) { 21484 SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), 21485 CN0.getValueType(), CN0, CN1, N2); 21486 return DAG.getBitcast(VT, NewINSERT); 21487 } 21488 } 21489 21490 // Combine INSERT_SUBVECTORs where we are inserting to the same index. 
21491 // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx ) 21492 // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) 21493 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && 21494 N0.getOperand(1).getValueType() == N1.getValueType() && 21495 N0.getOperand(2) == N2) 21496 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), 21497 N1, N2); 21498 21499 // Eliminate an intermediate insert into an undef vector: 21500 // insert_subvector undef, (insert_subvector undef, X, 0), N2 --> 21501 // insert_subvector undef, X, N2 21502 if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR && 21503 N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2))) 21504 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0, 21505 N1.getOperand(1), N2); 21506 21507 // Push subvector bitcasts to the output, adjusting the index as we go. 21508 // insert_subvector(bitcast(v), bitcast(s), c1) 21509 // -> bitcast(insert_subvector(v, s, c2)) 21510 if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) && 21511 N1.getOpcode() == ISD::BITCAST) { 21512 SDValue N0Src = peekThroughBitcasts(N0); 21513 SDValue N1Src = peekThroughBitcasts(N1); 21514 EVT N0SrcSVT = N0Src.getValueType().getScalarType(); 21515 EVT N1SrcSVT = N1Src.getValueType().getScalarType(); 21516 if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) && 21517 N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) { 21518 EVT NewVT; 21519 SDLoc DL(N); 21520 SDValue NewIdx; 21521 LLVMContext &Ctx = *DAG.getContext(); 21522 ElementCount NumElts = VT.getVectorElementCount(); 21523 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 21524 if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) { 21525 unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits(); 21526 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale); 21527 NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL); 21528 } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) { 21529 unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits; 21530 if (NumElts.isKnownMultipleOf(Scale) && (InsIdx % Scale) == 0) { 21531 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, 21532 NumElts.divideCoefficientBy(Scale)); 21533 NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL); 21534 } 21535 } 21536 if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) { 21537 SDValue Res = DAG.getBitcast(NewVT, N0Src); 21538 Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx); 21539 return DAG.getBitcast(VT, Res); 21540 } 21541 } 21542 } 21543 21544 // Canonicalize insert_subvector dag nodes. 21545 // Example: 21546 // (insert_subvector (insert_subvector A, Idx0), Idx1) 21547 // -> (insert_subvector (insert_subvector A, Idx1), Idx0) 21548 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && 21549 N1.getValueType() == N0.getOperand(1).getValueType()) { 21550 unsigned OtherIdx = N0.getConstantOperandVal(2); 21551 if (InsIdx < OtherIdx) { 21552 // Swap nodes. 21553 SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, 21554 N0.getOperand(0), N1, N2); 21555 AddToWorklist(NewOp.getNode()); 21556 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), 21557 VT, NewOp, N0.getOperand(1), N0.getOperand(2)); 21558 } 21559 } 21560 21561 // If the input vector is a concatenation, and the insert replaces 21562 // one of the pieces, we can optimize into a single concat_vectors. 
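// For example, with VT = v8i32, N0 = concat_vectors(A, B, C, D) of v2i32
// pieces and N1 = X of type v2i32, inserting at index 4 yields
// concat_vectors(A, B, X, D).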
21563 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && 21564 N0.getOperand(0).getValueType() == N1.getValueType() && 21565 N0.getOperand(0).getValueType().isScalableVector() == 21566 N1.getValueType().isScalableVector()) { 21567 unsigned Factor = N1.getValueType().getVectorMinNumElements(); 21568 SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); 21569 Ops[InsIdx / Factor] = N1; 21570 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); 21571 } 21572 21573 // Simplify source operands based on insertion. 21574 if (SimplifyDemandedVectorElts(SDValue(N, 0))) 21575 return SDValue(N, 0); 21576 21577 return SDValue(); 21578 } 21579 21580 SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) { 21581 SDValue N0 = N->getOperand(0); 21582 21583 // fold (fp_to_fp16 (fp16_to_fp op)) -> op 21584 if (N0->getOpcode() == ISD::FP16_TO_FP) 21585 return N0->getOperand(0); 21586 21587 return SDValue(); 21588 } 21589 21590 SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { 21591 SDValue N0 = N->getOperand(0); 21592 21593 // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) 21594 if (!TLI.shouldKeepZExtForFP16Conv() && N0->getOpcode() == ISD::AND) { 21595 ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1)); 21596 if (AndConst && AndConst->getAPIntValue() == 0xffff) { 21597 return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), 21598 N0.getOperand(0)); 21599 } 21600 } 21601 21602 return SDValue(); 21603 } 21604 21605 SDValue DAGCombiner::visitVECREDUCE(SDNode *N) { 21606 SDValue N0 = N->getOperand(0); 21607 EVT VT = N0.getValueType(); 21608 unsigned Opcode = N->getOpcode(); 21609 21610 // VECREDUCE over 1-element vector is just an extract. 21611 if (VT.getVectorElementCount().isScalar()) { 21612 SDLoc dl(N); 21613 SDValue Res = 21614 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0, 21615 DAG.getVectorIdxConstant(0, dl)); 21616 if (Res.getValueType() != N->getValueType(0)) 21617 Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res); 21618 return Res; 21619 } 21620 21621 // On an boolean vector an and/or reduction is the same as a umin/umax 21622 // reduction. Convert them if the latter is legal while the former isn't. 21623 if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) { 21624 unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND 21625 ? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX; 21626 if (!TLI.isOperationLegalOrCustom(Opcode, VT) && 21627 TLI.isOperationLegalOrCustom(NewOpcode, VT) && 21628 DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits()) 21629 return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0); 21630 } 21631 21632 return SDValue(); 21633 } 21634 21635 /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle 21636 /// with the destination vector and a zero vector. 21637 /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> 21638 /// vector_shuffle V, Zero, <0, 4, 2, 4> 21639 SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { 21640 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); 21641 21642 EVT VT = N->getValueType(0); 21643 SDValue LHS = N->getOperand(0); 21644 SDValue RHS = peekThroughBitcasts(N->getOperand(1)); 21645 SDLoc DL(N); 21646 21647 // Make sure we're not running after operation legalization where it 21648 // may have custom lowered the vector shuffles. 
21649 if (LegalOperations) 21650 return SDValue(); 21651 21652 if (RHS.getOpcode() != ISD::BUILD_VECTOR) 21653 return SDValue(); 21654 21655 EVT RVT = RHS.getValueType(); 21656 unsigned NumElts = RHS.getNumOperands(); 21657 21658 // Attempt to create a valid clear mask, splitting the mask into 21659 // sub elements and checking to see if each is 21660 // all zeros or all ones - suitable for shuffle masking. 21661 auto BuildClearMask = [&](int Split) { 21662 int NumSubElts = NumElts * Split; 21663 int NumSubBits = RVT.getScalarSizeInBits() / Split; 21664 21665 SmallVector<int, 8> Indices; 21666 for (int i = 0; i != NumSubElts; ++i) { 21667 int EltIdx = i / Split; 21668 int SubIdx = i % Split; 21669 SDValue Elt = RHS.getOperand(EltIdx); 21670 // X & undef --> 0 (not undef). So this lane must be converted to choose 21671 // from the zero constant vector (same as if the element had all 0-bits). 21672 if (Elt.isUndef()) { 21673 Indices.push_back(i + NumSubElts); 21674 continue; 21675 } 21676 21677 APInt Bits; 21678 if (isa<ConstantSDNode>(Elt)) 21679 Bits = cast<ConstantSDNode>(Elt)->getAPIntValue(); 21680 else if (isa<ConstantFPSDNode>(Elt)) 21681 Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt(); 21682 else 21683 return SDValue(); 21684 21685 // Extract the sub element from the constant bit mask. 21686 if (DAG.getDataLayout().isBigEndian()) 21687 Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits); 21688 else 21689 Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits); 21690 21691 if (Bits.isAllOnesValue()) 21692 Indices.push_back(i); 21693 else if (Bits == 0) 21694 Indices.push_back(i + NumSubElts); 21695 else 21696 return SDValue(); 21697 } 21698 21699 // Let's see if the target supports this vector_shuffle. 21700 EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits); 21701 EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts); 21702 if (!TLI.isVectorClearMaskLegal(Indices, ClearVT)) 21703 return SDValue(); 21704 21705 SDValue Zero = DAG.getConstant(0, DL, ClearVT); 21706 return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL, 21707 DAG.getBitcast(ClearVT, LHS), 21708 Zero, Indices)); 21709 }; 21710 21711 // Determine maximum split level (byte level masking). 21712 int MaxSplit = 1; 21713 if (RVT.getScalarSizeInBits() % 8 == 0) 21714 MaxSplit = RVT.getScalarSizeInBits() / 8; 21715 21716 for (int Split = 1; Split <= MaxSplit; ++Split) 21717 if (RVT.getScalarSizeInBits() % Split == 0) 21718 if (SDValue S = BuildClearMask(Split)) 21719 return S; 21720 21721 return SDValue(); 21722 } 21723 21724 /// If a vector binop is performed on splat values, it may be profitable to 21725 /// extract, scalarize, and insert/splat. 21726 static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) { 21727 SDValue N0 = N->getOperand(0); 21728 SDValue N1 = N->getOperand(1); 21729 unsigned Opcode = N->getOpcode(); 21730 EVT VT = N->getValueType(0); 21731 EVT EltVT = VT.getVectorElementType(); 21732 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 21733 21734 // TODO: Remove/replace the extract cost check? If the elements are available 21735 // as scalars, then there may be no extract cost. Should we ask if 21736 // inserting a scalar back into a vector is cheap instead? 
21737 int Index0, Index1; 21738 SDValue Src0 = DAG.getSplatSourceVector(N0, Index0); 21739 SDValue Src1 = DAG.getSplatSourceVector(N1, Index1); 21740 if (!Src0 || !Src1 || Index0 != Index1 || 21741 Src0.getValueType().getVectorElementType() != EltVT || 21742 Src1.getValueType().getVectorElementType() != EltVT || 21743 !TLI.isExtractVecEltCheap(VT, Index0) || 21744 !TLI.isOperationLegalOrCustom(Opcode, EltVT)) 21745 return SDValue(); 21746 21747 SDLoc DL(N); 21748 SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); 21749 SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC); 21750 SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC); 21751 SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); 21752 21753 // If all lanes but 1 are undefined, no need to splat the scalar result. 21754 // TODO: Keep track of undefs and use that info in the general case. 21755 if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() && 21756 count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 && 21757 count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) { 21758 // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> 21759 // build_vec ..undef, (bo X, Y), undef... 21760 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT)); 21761 Ops[Index0] = ScalarBO; 21762 return DAG.getBuildVector(VT, DL, Ops); 21763 } 21764 21765 // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index 21766 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO); 21767 return DAG.getBuildVector(VT, DL, Ops); 21768 } 21769 21770 /// Visit a binary vector operation, like ADD. 21771 SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { 21772 assert(N->getValueType(0).isVector() && 21773 "SimplifyVBinOp only works on vectors!"); 21774 21775 SDValue LHS = N->getOperand(0); 21776 SDValue RHS = N->getOperand(1); 21777 SDValue Ops[] = {LHS, RHS}; 21778 EVT VT = N->getValueType(0); 21779 unsigned Opcode = N->getOpcode(); 21780 SDNodeFlags Flags = N->getFlags(); 21781 21782 // See if we can constant fold the vector operation. 21783 if (SDValue Fold = DAG.FoldConstantVectorArithmetic( 21784 Opcode, SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) 21785 return Fold; 21786 21787 // Move unary shuffles with identical masks after a vector binop: 21788 // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask)) 21789 // --> shuffle (VBinOp A, B), Undef, Mask 21790 // This does not require type legality checks because we are creating the 21791 // same types of operations that are in the original sequence. We do have to 21792 // restrict ops like integer div that have immediate UB (eg, div-by-zero) 21793 // though. This code is adapted from the identical transform in instcombine. 
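// For example, with udiv, A = <1, 1>, B = <1, 0> and Mask = <0, 0>:
// (shuffle A, undef, <0,0>) / (shuffle B, undef, <0,0>) is <1,1> / <1,1>,
// but the hoisted form shuffle (A / B), undef, <0,0> would divide by B[1] == 0.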
21794 if (Opcode != ISD::UDIV && Opcode != ISD::SDIV && 21795 Opcode != ISD::UREM && Opcode != ISD::SREM && 21796 Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) { 21797 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS); 21798 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS); 21799 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) && 21800 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() && 21801 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) { 21802 SDLoc DL(N); 21803 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0), 21804 RHS.getOperand(0), Flags); 21805 SDValue UndefV = LHS.getOperand(1); 21806 return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask()); 21807 } 21808 21809 // Try to sink a splat shuffle after a binop with a uniform constant. 21810 // This is limited to cases where neither the shuffle nor the constant have 21811 // undefined elements because that could be poison-unsafe or inhibit 21812 // demanded elements analysis. It is further limited to not change a splat 21813 // of an inserted scalar because that may be optimized better by 21814 // load-folding or other target-specific behaviors. 21815 if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) && 21816 Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() && 21817 Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) { 21818 // binop (splat X), (splat C) --> splat (binop X, C) 21819 SDLoc DL(N); 21820 SDValue X = Shuf0->getOperand(0); 21821 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags); 21822 return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT), 21823 Shuf0->getMask()); 21824 } 21825 if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) && 21826 Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() && 21827 Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) { 21828 // binop (splat C), (splat X) --> splat (binop C, X) 21829 SDLoc DL(N); 21830 SDValue X = Shuf1->getOperand(0); 21831 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags); 21832 return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT), 21833 Shuf1->getMask()); 21834 } 21835 } 21836 21837 // The following pattern is likely to emerge with vector reduction ops. Moving 21838 // the binary operation ahead of insertion may allow using a narrower vector 21839 // instruction that has better performance than the wide version of the op: 21840 // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z 21841 if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() && 21842 RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() && 21843 LHS.getOperand(2) == RHS.getOperand(2) && 21844 (LHS.hasOneUse() || RHS.hasOneUse())) { 21845 SDValue X = LHS.getOperand(1); 21846 SDValue Y = RHS.getOperand(1); 21847 SDValue Z = LHS.getOperand(2); 21848 EVT NarrowVT = X.getValueType(); 21849 if (NarrowVT == Y.getValueType() && 21850 TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT, 21851 LegalOperations)) { 21852 // (binop undef, undef) may not return undef, so compute that result. 21853 SDLoc DL(N); 21854 SDValue VecC = 21855 DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT)); 21856 SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y); 21857 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z); 21858 } 21859 } 21860 21861 // Make sure all but the first op are undef or constant. 
21862 auto ConcatWithConstantOrUndef = [](SDValue Concat) { 21863 return Concat.getOpcode() == ISD::CONCAT_VECTORS && 21864 all_of(drop_begin(Concat->ops()), [](const SDValue &Op) { 21865 return Op.isUndef() || 21866 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); 21867 }); 21868 }; 21869 21870 // The following pattern is likely to emerge with vector reduction ops. Moving 21871 // the binary operation ahead of the concat may allow using a narrower vector 21872 // instruction that has better performance than the wide version of the op: 21873 // VBinOp (concat X, undef/constant), (concat Y, undef/constant) --> 21874 // concat (VBinOp X, Y), VecC 21875 if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) && 21876 (LHS.hasOneUse() || RHS.hasOneUse())) { 21877 EVT NarrowVT = LHS.getOperand(0).getValueType(); 21878 if (NarrowVT == RHS.getOperand(0).getValueType() && 21879 TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) { 21880 SDLoc DL(N); 21881 unsigned NumOperands = LHS.getNumOperands(); 21882 SmallVector<SDValue, 4> ConcatOps; 21883 for (unsigned i = 0; i != NumOperands; ++i) { 21884 // This constant fold for operands 1 and up. 21885 ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i), 21886 RHS.getOperand(i))); 21887 } 21888 21889 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); 21890 } 21891 } 21892 21893 if (SDValue V = scalarizeBinOpOfSplats(N, DAG)) 21894 return V; 21895 21896 return SDValue(); 21897 } 21898 21899 SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, 21900 SDValue N2) { 21901 assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!"); 21902 21903 SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2, 21904 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 21905 21906 // If we got a simplified select_cc node back from SimplifySelectCC, then 21907 // break it down into a new SETCC node, and a new SELECT node, and then return 21908 // the SELECT node, since we were called with a SELECT node. 21909 if (SCC.getNode()) { 21910 // Check to see if we got a select_cc back (to turn into setcc/select). 21911 // Otherwise, just return whatever node we got back, like fabs. 21912 if (SCC.getOpcode() == ISD::SELECT_CC) { 21913 const SDNodeFlags Flags = N0.getNode()->getFlags(); 21914 SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), 21915 N0.getValueType(), 21916 SCC.getOperand(0), SCC.getOperand(1), 21917 SCC.getOperand(4), Flags); 21918 AddToWorklist(SETCC.getNode()); 21919 SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, 21920 SCC.getOperand(2), SCC.getOperand(3)); 21921 SelectNode->setFlags(Flags); 21922 return SelectNode; 21923 } 21924 21925 return SCC; 21926 } 21927 return SDValue(); 21928 } 21929 21930 /// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values 21931 /// being selected between, see if we can simplify the select. Callers of this 21932 /// should assume that TheSelect is deleted if this returns true. As such, they 21933 /// should return the appropriate thing (e.g. the node) back to the top-level of 21934 /// the DAG combiner loop to avoid it being looked at. 21935 bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, 21936 SDValue RHS) { 21937 // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) 21938 // The select + setcc is redundant, because fsqrt returns NaN for X < 0. 
21939 if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) { 21940 if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) { 21941 // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?)) 21942 SDValue Sqrt = RHS; 21943 ISD::CondCode CC; 21944 SDValue CmpLHS; 21945 const ConstantFPSDNode *Zero = nullptr; 21946 21947 if (TheSelect->getOpcode() == ISD::SELECT_CC) { 21948 CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get(); 21949 CmpLHS = TheSelect->getOperand(0); 21950 Zero = isConstOrConstSplatFP(TheSelect->getOperand(1)); 21951 } else { 21952 // SELECT or VSELECT 21953 SDValue Cmp = TheSelect->getOperand(0); 21954 if (Cmp.getOpcode() == ISD::SETCC) { 21955 CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get(); 21956 CmpLHS = Cmp.getOperand(0); 21957 Zero = isConstOrConstSplatFP(Cmp.getOperand(1)); 21958 } 21959 } 21960 if (Zero && Zero->isZero() && 21961 Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT || 21962 CC == ISD::SETULT || CC == ISD::SETLT)) { 21963 // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) 21964 CombineTo(TheSelect, Sqrt); 21965 return true; 21966 } 21967 } 21968 } 21969 // Cannot simplify select with vector condition 21970 if (TheSelect->getOperand(0).getValueType().isVector()) return false; 21971 21972 // If this is a select from two identical things, try to pull the operation 21973 // through the select. 21974 if (LHS.getOpcode() != RHS.getOpcode() || 21975 !LHS.hasOneUse() || !RHS.hasOneUse()) 21976 return false; 21977 21978 // If this is a load and the token chain is identical, replace the select 21979 // of two loads with a load through a select of the address to load from. 21980 // This triggers in things like "select bool X, 10.0, 123.0" after the FP 21981 // constants have been dropped into the constant pool. 21982 if (LHS.getOpcode() == ISD::LOAD) { 21983 LoadSDNode *LLD = cast<LoadSDNode>(LHS); 21984 LoadSDNode *RLD = cast<LoadSDNode>(RHS); 21985 21986 // Token chains must be identical. 21987 if (LHS.getOperand(0) != RHS.getOperand(0) || 21988 // Do not let this transformation reduce the number of volatile loads. 21989 // Be conservative for atomics for the moment 21990 // TODO: This does appear to be legal for unordered atomics (see D66309) 21991 !LLD->isSimple() || !RLD->isSimple() || 21992 // FIXME: If either is a pre/post inc/dec load, 21993 // we'd need to split out the address adjustment. 21994 LLD->isIndexed() || RLD->isIndexed() || 21995 // If this is an EXTLOAD, the VT's must match. 21996 LLD->getMemoryVT() != RLD->getMemoryVT() || 21997 // If this is an EXTLOAD, the kind of extension must match. 21998 (LLD->getExtensionType() != RLD->getExtensionType() && 21999 // The only exception is if one of the extensions is anyext. 22000 LLD->getExtensionType() != ISD::EXTLOAD && 22001 RLD->getExtensionType() != ISD::EXTLOAD) || 22002 // FIXME: this discards src value information. This is 22003 // over-conservative. It would be beneficial to be able to remember 22004 // both potential memory locations. Since we are discarding 22005 // src value info, don't do the transformation if the memory 22006 // locations are not in the default address space. 22007 LLD->getPointerInfo().getAddrSpace() != 0 || 22008 RLD->getPointerInfo().getAddrSpace() != 0 || 22009 // We can't produce a CMOV of a TargetFrameIndex since we won't 22010 // generate the address generation required. 
22011 LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex || 22012 RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex || 22013 !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(), 22014 LLD->getBasePtr().getValueType())) 22015 return false; 22016 22017 // The loads must not depend on one another. 22018 if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD)) 22019 return false; 22020 22021 // Check that the select condition doesn't reach either load. If so, 22022 // folding this will induce a cycle into the DAG. If not, this is safe to 22023 // xform, so create a select of the addresses. 22024 22025 SmallPtrSet<const SDNode *, 32> Visited; 22026 SmallVector<const SDNode *, 16> Worklist; 22027 22028 // Always fail if LLD and RLD are not independent. TheSelect is a 22029 // predecessor to all Nodes in question so we need not search past it. 22030 22031 Visited.insert(TheSelect); 22032 Worklist.push_back(LLD); 22033 Worklist.push_back(RLD); 22034 22035 if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) || 22036 SDNode::hasPredecessorHelper(RLD, Visited, Worklist)) 22037 return false; 22038 22039 SDValue Addr; 22040 if (TheSelect->getOpcode() == ISD::SELECT) { 22041 // We cannot do this optimization if any pair of {RLD, LLD} is a 22042 // predecessor to {RLD, LLD, CondNode}. As we've already compared the 22043 // Loads, we only need to check if CondNode is a successor to one of the 22044 // loads. We can further avoid this if there's no use of their chain 22045 // value. 22046 SDNode *CondNode = TheSelect->getOperand(0).getNode(); 22047 Worklist.push_back(CondNode); 22048 22049 if ((LLD->hasAnyUseOfValue(1) && 22050 SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) || 22051 (RLD->hasAnyUseOfValue(1) && 22052 SDNode::hasPredecessorHelper(RLD, Visited, Worklist))) 22053 return false; 22054 22055 Addr = DAG.getSelect(SDLoc(TheSelect), 22056 LLD->getBasePtr().getValueType(), 22057 TheSelect->getOperand(0), LLD->getBasePtr(), 22058 RLD->getBasePtr()); 22059 } else { // Otherwise SELECT_CC 22060 // We cannot do this optimization if any pair of {RLD, LLD} is a 22061 // predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared 22062 // the Loads, we only need to check if CondLHS/CondRHS is a successor to 22063 // one of the loads. We can further avoid this if there's no use of their 22064 // chain value. 22065 22066 SDNode *CondLHS = TheSelect->getOperand(0).getNode(); 22067 SDNode *CondRHS = TheSelect->getOperand(1).getNode(); 22068 Worklist.push_back(CondLHS); 22069 Worklist.push_back(CondRHS); 22070 22071 if ((LLD->hasAnyUseOfValue(1) && 22072 SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) || 22073 (RLD->hasAnyUseOfValue(1) && 22074 SDNode::hasPredecessorHelper(RLD, Visited, Worklist))) 22075 return false; 22076 22077 Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect), 22078 LLD->getBasePtr().getValueType(), 22079 TheSelect->getOperand(0), 22080 TheSelect->getOperand(1), 22081 LLD->getBasePtr(), RLD->getBasePtr(), 22082 TheSelect->getOperand(4)); 22083 } 22084 22085 SDValue Load; 22086 // It is safe to replace the two loads if they have different alignments, 22087 // but the new load must be the minimum (most restrictive) alignment of the 22088 // inputs. 
22089 Align Alignment = std::min(LLD->getAlign(), RLD->getAlign()); 22090 MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags(); 22091 if (!RLD->isInvariant()) 22092 MMOFlags &= ~MachineMemOperand::MOInvariant; 22093 if (!RLD->isDereferenceable()) 22094 MMOFlags &= ~MachineMemOperand::MODereferenceable; 22095 if (LLD->getExtensionType() == ISD::NON_EXTLOAD) { 22096 // FIXME: Discards pointer and AA info. 22097 Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect), 22098 LLD->getChain(), Addr, MachinePointerInfo(), Alignment, 22099 MMOFlags); 22100 } else { 22101 // FIXME: Discards pointer and AA info. 22102 Load = DAG.getExtLoad( 22103 LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType() 22104 : LLD->getExtensionType(), 22105 SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr, 22106 MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags); 22107 } 22108 22109 // Users of the select now use the result of the load. 22110 CombineTo(TheSelect, Load); 22111 22112 // Users of the old loads now use the new load's chain. We know the 22113 // old-load value is dead now. 22114 CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1)); 22115 CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1)); 22116 return true; 22117 } 22118 22119 return false; 22120 } 22121 22122 /// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and 22123 /// bitwise 'and'. 22124 SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, 22125 SDValue N1, SDValue N2, SDValue N3, 22126 ISD::CondCode CC) { 22127 // If this is a select where the false operand is zero and the compare is a 22128 // check of the sign bit, see if we can perform the "gzip trick": 22129 // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A 22130 // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A 22131 EVT XType = N0.getValueType(); 22132 EVT AType = N2.getValueType(); 22133 if (!isNullConstant(N3) || !XType.bitsGE(AType)) 22134 return SDValue(); 22135 22136 // If the comparison is testing for a positive value, we have to invert 22137 // the sign bit mask, so only do that transform if the target has a bitwise 22138 // 'and not' instruction (the invert is free). 22139 if (CC == ISD::SETGT && TLI.hasAndNot(N2)) { 22140 // (X > -1) ? A : 0 22141 // (X > 0) ? X : 0 <-- This is canonical signed max. 22142 if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2))) 22143 return SDValue(); 22144 } else if (CC == ISD::SETLT) { 22145 // (X < 0) ? A : 0 22146 // (X < 1) ? X : 0 <-- This is un-canonicalized signed min. 22147 if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2))) 22148 return SDValue(); 22149 } else { 22150 return SDValue(); 22151 } 22152 22153 // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit 22154 // constant. 
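  // For example (illustrative): for (select_cc setlt X, 0, 16, 0) with i32 X,
  // the shift amount below is 32 - log2(16) - 1 = 27, so the result becomes
  // ((X >>u 27) & 16): the sign bit of X lands directly on bit 4 and no full
  // sign splat is required.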
22155 EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType()); 22156 auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode()); 22157 if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) { 22158 unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1; 22159 if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) { 22160 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); 22161 SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt); 22162 AddToWorklist(Shift.getNode()); 22163 22164 if (XType.bitsGT(AType)) { 22165 Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); 22166 AddToWorklist(Shift.getNode()); 22167 } 22168 22169 if (CC == ISD::SETGT) 22170 Shift = DAG.getNOT(DL, Shift, AType); 22171 22172 return DAG.getNode(ISD::AND, DL, AType, Shift, N2); 22173 } 22174 } 22175 22176 unsigned ShCt = XType.getSizeInBits() - 1; 22177 if (TLI.shouldAvoidTransformToShift(XType, ShCt)) 22178 return SDValue(); 22179 22180 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); 22181 SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt); 22182 AddToWorklist(Shift.getNode()); 22183 22184 if (XType.bitsGT(AType)) { 22185 Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); 22186 AddToWorklist(Shift.getNode()); 22187 } 22188 22189 if (CC == ISD::SETGT) 22190 Shift = DAG.getNOT(DL, Shift, AType); 22191 22192 return DAG.getNode(ISD::AND, DL, AType, Shift, N2); 22193 } 22194 22195 // Transform (fneg/fabs (bitconvert x)) to avoid loading constant pool values. 22196 SDValue DAGCombiner::foldSignChangeInBitcast(SDNode *N) { 22197 SDValue N0 = N->getOperand(0); 22198 EVT VT = N->getValueType(0); 22199 bool IsFabs = N->getOpcode() == ISD::FABS; 22200 bool IsFree = IsFabs ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); 22201 22202 if (IsFree || N0.getOpcode() != ISD::BITCAST || !N0.hasOneUse()) 22203 return SDValue(); 22204 22205 SDValue Int = N0.getOperand(0); 22206 EVT IntVT = Int.getValueType(); 22207 22208 // The operand to cast should be integer. 22209 if (!IntVT.isInteger() || IntVT.isVector()) 22210 return SDValue(); 22211 22212 // (fneg (bitconvert x)) -> (bitconvert (xor x sign)) 22213 // (fabs (bitconvert x)) -> (bitconvert (and x ~sign)) 22214 APInt SignMask; 22215 if (N0.getValueType().isVector()) { 22216 // For vector, create a sign mask (0x80...) or its inverse (for fabs, 22217 // 0x7f...) per element and splat it. 22218 SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits()); 22219 if (IsFabs) 22220 SignMask = ~SignMask; 22221 SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); 22222 } else { 22223 // For scalar, just use the sign mask (0x80... or the inverse, 0x7f...) 22224 SignMask = APInt::getSignMask(IntVT.getSizeInBits()); 22225 if (IsFabs) 22226 SignMask = ~SignMask; 22227 } 22228 SDLoc DL(N0); 22229 Int = DAG.getNode(IsFabs ? ISD::AND : ISD::XOR, DL, IntVT, Int, 22230 DAG.getConstant(SignMask, DL, IntVT)); 22231 AddToWorklist(Int.getNode()); 22232 return DAG.getBitcast(VT, Int); 22233 } 22234 22235 /// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" 22236 /// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 22237 /// in it. This may be a win when the constant is not otherwise available 22238 /// because it replaces two constant pool loads with one. 
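/// For example (illustrative), "(a < b) ? 1.0f : 2.0f" becomes a load from a
/// two-element f32 constant-pool array holding both values: the setcc result
/// selects a byte offset of 0 or 4 into the array, so one load replaces the
/// two separate constant-pool loads.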
22239 SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset( 22240 const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, 22241 ISD::CondCode CC) { 22242 if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType())) 22243 return SDValue(); 22244 22245 // If we are before legalize types, we want the other legalization to happen 22246 // first (for example, to avoid messing with soft float). 22247 auto *TV = dyn_cast<ConstantFPSDNode>(N2); 22248 auto *FV = dyn_cast<ConstantFPSDNode>(N3); 22249 EVT VT = N2.getValueType(); 22250 if (!TV || !FV || !TLI.isTypeLegal(VT)) 22251 return SDValue(); 22252 22253 // If a constant can be materialized without loads, this does not make sense. 22254 if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal || 22255 TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) || 22256 TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize)) 22257 return SDValue(); 22258 22259 // If both constants have multiple uses, then we won't need to do an extra 22260 // load. The values are likely around in registers for other users. 22261 if (!TV->hasOneUse() && !FV->hasOneUse()) 22262 return SDValue(); 22263 22264 Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()), 22265 const_cast<ConstantFP*>(TV->getConstantFPValue()) }; 22266 Type *FPTy = Elts[0]->getType(); 22267 const DataLayout &TD = DAG.getDataLayout(); 22268 22269 // Create a ConstantArray of the two constants. 22270 Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts); 22271 SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()), 22272 TD.getPrefTypeAlign(FPTy)); 22273 Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign(); 22274 22275 // Get offsets to the 0 and 1 elements of the array, so we can select between 22276 // them. 22277 SDValue Zero = DAG.getIntPtrConstant(0, DL); 22278 unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType()); 22279 SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV)); 22280 SDValue Cond = 22281 DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); 22282 AddToWorklist(Cond.getNode()); 22283 SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero); 22284 AddToWorklist(CstOffset.getNode()); 22285 CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); 22286 AddToWorklist(CPIdx.getNode()); 22287 return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, 22288 MachinePointerInfo::getConstantPool( 22289 DAG.getMachineFunction()), Alignment); 22290 } 22291 22292 /// Simplify an expression of the form (N0 cond N1) ? N2 : N3 22293 /// where 'cond' is the comparison specified by CC. 22294 SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, 22295 SDValue N2, SDValue N3, ISD::CondCode CC, 22296 bool NotExtCompare) { 22297 // (x ? y : y) -> y. 22298 if (N2 == N3) return N2; 22299 22300 EVT CmpOpVT = N0.getValueType(); 22301 EVT CmpResVT = getSetCCResultType(CmpOpVT); 22302 EVT VT = N2.getValueType(); 22303 auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode()); 22304 auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode()); 22305 auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode()); 22306 22307 // Determine if the condition we're dealing with is constant. 
  if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
    AddToWorklist(SCC.getNode());
    if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
      // fold select_cc true, x, y -> x
      // fold select_cc false, x, y -> y
      return !(SCCC->isNullValue()) ? N2 : N3;
    }
  }

  if (SDValue V =
          convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
    return V;

  if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
    return V;

  // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
  // where y has a single bit set.
  // In plain terms: we can turn the SELECT_CC into an AND when the condition
  // can be materialized as an all-ones register. Any single bit-test can be
  // materialized as an all-ones register with shift-left and
  // shift-right-arith.
  if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
      N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
    SDValue AndLHS = N0->getOperand(0);
    auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
      // Shift the tested bit over the sign bit.
      const APInt &AndMask = ConstAndRHS->getAPIntValue();
      unsigned ShCt = AndMask.getBitWidth() - 1;
      if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
        SDValue ShlAmt =
            DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
                            getShiftAmountTy(AndLHS.getValueType()));
        SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);

        // Now arithmetic right shift it all the way over, so the result is
        // either all-ones, or zero.
        SDValue ShrAmt =
            DAG.getConstant(ShCt, SDLoc(Shl),
                            getShiftAmountTy(Shl.getValueType()));
        SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);

        return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
      }
    }
  }

  // fold select C, 16, 0 -> shl C, 4
  bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
  bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();

  if ((Fold || Swap) &&
      TLI.getBooleanContents(CmpOpVT) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {

    if (Swap) {
      CC = ISD::getSetCCInverse(CC, CmpOpVT);
      std::swap(N2C, N3C);
    }

    // If the caller doesn't want us to simplify this into a zext of a compare,
    // don't do it.
22372 if (NotExtCompare && N2C->isOne()) 22373 return SDValue(); 22374 22375 SDValue Temp, SCC; 22376 // zext (setcc n0, n1) 22377 if (LegalTypes) { 22378 SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC); 22379 if (VT.bitsLT(SCC.getValueType())) 22380 Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT); 22381 else 22382 Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC); 22383 } else { 22384 SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC); 22385 Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC); 22386 } 22387 22388 AddToWorklist(SCC.getNode()); 22389 AddToWorklist(Temp.getNode()); 22390 22391 if (N2C->isOne()) 22392 return Temp; 22393 22394 unsigned ShCt = N2C->getAPIntValue().logBase2(); 22395 if (TLI.shouldAvoidTransformToShift(VT, ShCt)) 22396 return SDValue(); 22397 22398 // shl setcc result by log2 n2c 22399 return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp, 22400 DAG.getConstant(ShCt, SDLoc(Temp), 22401 getShiftAmountTy(Temp.getValueType()))); 22402 } 22403 22404 // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X) 22405 // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X) 22406 // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X) 22407 // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X) 22408 // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X) 22409 // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X) 22410 // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X) 22411 // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X) 22412 if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 22413 SDValue ValueOnZero = N2; 22414 SDValue Count = N3; 22415 // If the condition is NE instead of E, swap the operands. 22416 if (CC == ISD::SETNE) 22417 std::swap(ValueOnZero, Count); 22418 // Check if the value on zero is a constant equal to the bits in the type. 22419 if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) { 22420 if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) { 22421 // If the other operand is cttz/cttz_zero_undef of N0, and cttz is 22422 // legal, combine to just cttz. 22423 if ((Count.getOpcode() == ISD::CTTZ || 22424 Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) && 22425 N0 == Count.getOperand(0) && 22426 (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT))) 22427 return DAG.getNode(ISD::CTTZ, DL, VT, N0); 22428 // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is 22429 // legal, combine to just ctlz. 22430 if ((Count.getOpcode() == ISD::CTLZ || 22431 Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) && 22432 N0 == Count.getOperand(0) && 22433 (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT))) 22434 return DAG.getNode(ISD::CTLZ, DL, VT, N0); 22435 } 22436 } 22437 } 22438 22439 return SDValue(); 22440 } 22441 22442 /// This is a stub for TargetLowering::SimplifySetCC. 22443 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, 22444 ISD::CondCode Cond, const SDLoc &DL, 22445 bool foldBooleans) { 22446 TargetLowering::DAGCombinerInfo 22447 DagCombineInfo(DAG, Level, false, this); 22448 return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); 22449 } 22450 22451 /// Given an ISD::SDIV node expressing a divide by constant, return 22452 /// a DAG expression to select that will generate the same value by multiplying 22453 /// by a magic number. 22454 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". 
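/// For example (an illustrative sketch, not the exact node sequence the
/// target emits): a signed 32-bit divide by 3 can be computed as
///   q = (int32_t)(((int64_t)0x55555556 * n) >> 32); // high part of n * magic
///   q += (uint32_t)n >> 31;                         // add 1 when n is negative
/// so the division becomes a widening multiply, a shift, and an add.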
22455 SDValue DAGCombiner::BuildSDIV(SDNode *N) { 22456 // when optimising for minimum size, we don't want to expand a div to a mul 22457 // and a shift. 22458 if (DAG.getMachineFunction().getFunction().hasMinSize()) 22459 return SDValue(); 22460 22461 SmallVector<SDNode *, 8> Built; 22462 if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) { 22463 for (SDNode *N : Built) 22464 AddToWorklist(N); 22465 return S; 22466 } 22467 22468 return SDValue(); 22469 } 22470 22471 /// Given an ISD::SDIV node expressing a divide by constant power of 2, return a 22472 /// DAG expression that will generate the same value by right shifting. 22473 SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { 22474 ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); 22475 if (!C) 22476 return SDValue(); 22477 22478 // Avoid division by zero. 22479 if (C->isNullValue()) 22480 return SDValue(); 22481 22482 SmallVector<SDNode *, 8> Built; 22483 if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) { 22484 for (SDNode *N : Built) 22485 AddToWorklist(N); 22486 return S; 22487 } 22488 22489 return SDValue(); 22490 } 22491 22492 /// Given an ISD::UDIV node expressing a divide by constant, return a DAG 22493 /// expression that will generate the same value by multiplying by a magic 22494 /// number. 22495 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". 22496 SDValue DAGCombiner::BuildUDIV(SDNode *N) { 22497 // when optimising for minimum size, we don't want to expand a div to a mul 22498 // and a shift. 22499 if (DAG.getMachineFunction().getFunction().hasMinSize()) 22500 return SDValue(); 22501 22502 SmallVector<SDNode *, 8> Built; 22503 if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) { 22504 for (SDNode *N : Built) 22505 AddToWorklist(N); 22506 return S; 22507 } 22508 22509 return SDValue(); 22510 } 22511 22512 /// Determines the LogBase2 value for a non-null input value using the 22513 /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). 22514 SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { 22515 EVT VT = V.getValueType(); 22516 SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V); 22517 SDValue Base = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT); 22518 SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz); 22519 return LogBase2; 22520 } 22521 22522 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) 22523 /// For the reciprocal, we need to find the zero of the function: 22524 /// F(X) = A X - 1 [which has a zero at X = 1/A] 22525 /// => 22526 /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form 22527 /// does not require additional intermediate precision] 22528 /// For the last iteration, put numerator N into it to gain more precision: 22529 /// Result = N X_i + X_i (N - N A X_i) 22530 SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op, 22531 SDNodeFlags Flags) { 22532 if (LegalDAG) 22533 return SDValue(); 22534 22535 // TODO: Handle half and/or extended types? 22536 EVT VT = Op.getValueType(); 22537 if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) 22538 return SDValue(); 22539 22540 // If estimates are explicitly disabled for this function, we're done. 22541 MachineFunction &MF = DAG.getMachineFunction(); 22542 int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF); 22543 if (Enabled == TLI.ReciprocalEstimate::Disabled) 22544 return SDValue(); 22545 22546 // Estimates may be explicitly enabled for this type with a custom number of 22547 // refinement steps. 
22548 int Iterations = TLI.getDivRefinementSteps(VT, MF); 22549 if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) { 22550 AddToWorklist(Est.getNode()); 22551 22552 SDLoc DL(Op); 22553 if (Iterations) { 22554 SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); 22555 22556 // Newton iterations: Est = Est + Est (N - Arg * Est) 22557 // If this is the last iteration, also multiply by the numerator. 22558 for (int i = 0; i < Iterations; ++i) { 22559 SDValue MulEst = Est; 22560 22561 if (i == Iterations - 1) { 22562 MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags); 22563 AddToWorklist(MulEst.getNode()); 22564 } 22565 22566 SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags); 22567 AddToWorklist(NewEst.getNode()); 22568 22569 NewEst = DAG.getNode(ISD::FSUB, DL, VT, 22570 (i == Iterations - 1 ? N : FPOne), NewEst, Flags); 22571 AddToWorklist(NewEst.getNode()); 22572 22573 NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); 22574 AddToWorklist(NewEst.getNode()); 22575 22576 Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags); 22577 AddToWorklist(Est.getNode()); 22578 } 22579 } else { 22580 // If no iterations are available, multiply with N. 22581 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags); 22582 AddToWorklist(Est.getNode()); 22583 } 22584 22585 return Est; 22586 } 22587 22588 return SDValue(); 22589 } 22590 22591 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) 22592 /// For the reciprocal sqrt, we need to find the zero of the function: 22593 /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] 22594 /// => 22595 /// X_{i+1} = X_i (1.5 - A X_i^2 / 2) 22596 /// As a result, we precompute A/2 prior to the iteration loop. 22597 SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, 22598 unsigned Iterations, 22599 SDNodeFlags Flags, bool Reciprocal) { 22600 EVT VT = Arg.getValueType(); 22601 SDLoc DL(Arg); 22602 SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); 22603 22604 // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that 22605 // this entire sequence requires only one FP constant. 22606 SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags); 22607 HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags); 22608 22609 // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) 22610 for (unsigned i = 0; i < Iterations; ++i) { 22611 SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); 22612 NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags); 22613 NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags); 22614 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); 22615 } 22616 22617 // If non-reciprocal square root is requested, multiply the result by Arg. 
22618 if (!Reciprocal) 22619 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags); 22620 22621 return Est; 22622 } 22623 22624 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) 22625 /// For the reciprocal sqrt, we need to find the zero of the function: 22626 /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] 22627 /// => 22628 /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) 22629 SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, 22630 unsigned Iterations, 22631 SDNodeFlags Flags, bool Reciprocal) { 22632 EVT VT = Arg.getValueType(); 22633 SDLoc DL(Arg); 22634 SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); 22635 SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT); 22636 22637 // This routine must enter the loop below to work correctly 22638 // when (Reciprocal == false). 22639 assert(Iterations > 0); 22640 22641 // Newton iterations for reciprocal square root: 22642 // E = (E * -0.5) * ((A * E) * E + -3.0) 22643 for (unsigned i = 0; i < Iterations; ++i) { 22644 SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags); 22645 SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags); 22646 SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags); 22647 22648 // When calculating a square root at the last iteration build: 22649 // S = ((A * E) * -0.5) * ((A * E) * E + -3.0) 22650 // (notice a common subexpression) 22651 SDValue LHS; 22652 if (Reciprocal || (i + 1) < Iterations) { 22653 // RSQRT: LHS = (E * -0.5) 22654 LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags); 22655 } else { 22656 // SQRT: LHS = (A * E) * -0.5 22657 LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags); 22658 } 22659 22660 Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags); 22661 } 22662 22663 return Est; 22664 } 22665 22666 /// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case 22667 /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if 22668 /// Op can be zero. 22669 SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, 22670 bool Reciprocal) { 22671 if (LegalDAG) 22672 return SDValue(); 22673 22674 // TODO: Handle half and/or extended types? 22675 EVT VT = Op.getValueType(); 22676 if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) 22677 return SDValue(); 22678 22679 // If estimates are explicitly disabled for this function, we're done. 22680 MachineFunction &MF = DAG.getMachineFunction(); 22681 int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF); 22682 if (Enabled == TLI.ReciprocalEstimate::Disabled) 22683 return SDValue(); 22684 22685 // Estimates may be explicitly enabled for this type with a custom number of 22686 // refinement steps. 22687 int Iterations = TLI.getSqrtRefinementSteps(VT, MF); 22688 22689 bool UseOneConstNR = false; 22690 if (SDValue Est = 22691 TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR, 22692 Reciprocal)) { 22693 AddToWorklist(Est.getNode()); 22694 22695 if (Iterations) 22696 Est = UseOneConstNR 22697 ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) 22698 : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); 22699 if (!Reciprocal) { 22700 SDLoc DL(Op); 22701 // Try the target specific test first. 22702 SDValue Test = TLI.getSqrtInputTest(Op, DAG, DAG.getDenormalMode(VT)); 22703 22704 // The estimate is now completely wrong if the input was exactly 0.0 or 22705 // possibly a denormal. Force the answer to 0.0 or value provided by 22706 // target for those cases. 
22707 Est = DAG.getNode( 22708 Test.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, 22709 Test, TLI.getSqrtResultForDenormInput(Op, DAG), Est); 22710 } 22711 return Est; 22712 } 22713 22714 return SDValue(); 22715 } 22716 22717 SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) { 22718 return buildSqrtEstimateImpl(Op, Flags, true); 22719 } 22720 22721 SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { 22722 return buildSqrtEstimateImpl(Op, Flags, false); 22723 } 22724 22725 /// Return true if there is any possibility that the two addresses overlap. 22726 bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const { 22727 22728 struct MemUseCharacteristics { 22729 bool IsVolatile; 22730 bool IsAtomic; 22731 SDValue BasePtr; 22732 int64_t Offset; 22733 Optional<int64_t> NumBytes; 22734 MachineMemOperand *MMO; 22735 }; 22736 22737 auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics { 22738 if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) { 22739 int64_t Offset = 0; 22740 if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset())) 22741 Offset = (LSN->getAddressingMode() == ISD::PRE_INC) 22742 ? C->getSExtValue() 22743 : (LSN->getAddressingMode() == ISD::PRE_DEC) 22744 ? -1 * C->getSExtValue() 22745 : 0; 22746 uint64_t Size = 22747 MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize()); 22748 return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(), 22749 Offset /*base offset*/, 22750 Optional<int64_t>(Size), 22751 LSN->getMemOperand()}; 22752 } 22753 if (const auto *LN = cast<LifetimeSDNode>(N)) 22754 return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1), 22755 (LN->hasOffset()) ? LN->getOffset() : 0, 22756 (LN->hasOffset()) ? Optional<int64_t>(LN->getSize()) 22757 : Optional<int64_t>(), 22758 (MachineMemOperand *)nullptr}; 22759 // Default. 22760 return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(), 22761 (int64_t)0 /*offset*/, 22762 Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr}; 22763 }; 22764 22765 MemUseCharacteristics MUC0 = getCharacteristics(Op0), 22766 MUC1 = getCharacteristics(Op1); 22767 22768 // If they are to the same address, then they must be aliases. 22769 if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr && 22770 MUC0.Offset == MUC1.Offset) 22771 return true; 22772 22773 // If they are both volatile then they cannot be reordered. 22774 if (MUC0.IsVolatile && MUC1.IsVolatile) 22775 return true; 22776 22777 // Be conservative about atomics for the moment 22778 // TODO: This is way overconservative for unordered atomics (see D66309) 22779 if (MUC0.IsAtomic && MUC1.IsAtomic) 22780 return true; 22781 22782 if (MUC0.MMO && MUC1.MMO) { 22783 if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) || 22784 (MUC1.MMO->isInvariant() && MUC0.MMO->isStore())) 22785 return false; 22786 } 22787 22788 // Try to prove that there is aliasing, or that there is no aliasing. Either 22789 // way, we can return now. If nothing can be proved, proceed with more tests. 22790 bool IsAlias; 22791 if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes, 22792 DAG, IsAlias)) 22793 return IsAlias; 22794 22795 // The following all rely on MMO0 and MMO1 being valid. Fail conservatively if 22796 // either are not known. 22797 if (!MUC0.MMO || !MUC1.MMO) 22798 return true; 22799 22800 // If one operation reads from invariant memory, and the other may store, they 22801 // cannot alias. 
These should really be checking the equivalent of mayWrite, 22802 // but it only matters for memory nodes other than load /store. 22803 if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) || 22804 (MUC1.MMO->isInvariant() && MUC0.MMO->isStore())) 22805 return false; 22806 22807 // If we know required SrcValue1 and SrcValue2 have relatively large 22808 // alignment compared to the size and offset of the access, we may be able 22809 // to prove they do not alias. This check is conservative for now to catch 22810 // cases created by splitting vector types, it only works when the offsets are 22811 // multiples of the size of the data. 22812 int64_t SrcValOffset0 = MUC0.MMO->getOffset(); 22813 int64_t SrcValOffset1 = MUC1.MMO->getOffset(); 22814 Align OrigAlignment0 = MUC0.MMO->getBaseAlign(); 22815 Align OrigAlignment1 = MUC1.MMO->getBaseAlign(); 22816 auto &Size0 = MUC0.NumBytes; 22817 auto &Size1 = MUC1.NumBytes; 22818 if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && 22819 Size0.hasValue() && Size1.hasValue() && *Size0 == *Size1 && 22820 OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 && 22821 SrcValOffset1 % *Size1 == 0) { 22822 int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value(); 22823 int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value(); 22824 22825 // There is no overlap between these relatively aligned accesses of 22826 // similar size. Return no alias. 22827 if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0) 22828 return false; 22829 } 22830 22831 bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 22832 ? CombinerGlobalAA 22833 : DAG.getSubtarget().useAA(); 22834 #ifndef NDEBUG 22835 if (CombinerAAOnlyFunc.getNumOccurrences() && 22836 CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) 22837 UseAA = false; 22838 #endif 22839 22840 if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && 22841 Size0.hasValue() && Size1.hasValue()) { 22842 // Use alias analysis information. 22843 int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); 22844 int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset; 22845 int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset; 22846 if (AA->isNoAlias( 22847 MemoryLocation(MUC0.MMO->getValue(), Overlap0, 22848 UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()), 22849 MemoryLocation(MUC1.MMO->getValue(), Overlap1, 22850 UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes()))) 22851 return false; 22852 } 22853 22854 // Otherwise we have to assume they alias. 22855 return true; 22856 } 22857 22858 /// Walk up chain skipping non-aliasing memory nodes, 22859 /// looking for aliasing nodes and adding them to the Aliases vector. 22860 void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, 22861 SmallVectorImpl<SDValue> &Aliases) { 22862 SmallVector<SDValue, 8> Chains; // List of chains to visit. 22863 SmallPtrSet<SDNode *, 16> Visited; // Visited node set. 22864 22865 // Get alias information for node. 22866 // TODO: relax aliasing for unordered atomics (see D66309) 22867 const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple(); 22868 22869 // Starting off. 22870 Chains.push_back(OriginalChain); 22871 unsigned Depth = 0; 22872 22873 // Attempt to improve chain by a single step 22874 std::function<bool(SDValue &)> ImproveChain = [&](SDValue &C) -> bool { 22875 switch (C.getOpcode()) { 22876 case ISD::EntryToken: 22877 // No need to mark EntryToken. 
      C = SDValue();
      return true;
    case ISD::LOAD:
    case ISD::STORE: {
      // Get alias information for C.
      // TODO: Relax aliasing for unordered atomics (see D66309)
      bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
                      cast<LSBaseSDNode>(C.getNode())->isSimple();
      if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
        // Look further up the chain.
        C = C.getOperand(0);
        return true;
      }
      // Alias, so stop here.
      return false;
    }

    case ISD::CopyFromReg:
      // Always forward past CopyFromReg.
      C = C.getOperand(0);
      return true;

    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END: {
      // We can forward past any lifetime start/end that can be proven not to
      // alias the memory access.
      if (!isAlias(N, C.getNode())) {
        // Look further up the chain.
        C = C.getOperand(0);
        return true;
      }
      return false;
    }
    default:
      return false;
    }
  };

  // Look at each chain and determine if it is an alias. If so, add it to the
  // aliases list. If not, then continue up the chain looking for the next
  // candidate.
  while (!Chains.empty()) {
    SDValue Chain = Chains.pop_back_val();

    // Don't bother if we've seen Chain before.
    if (!Visited.insert(Chain.getNode()).second)
      continue;

    // For TokenFactor nodes, look at each operand and only continue up the
    // chain until we reach the depth limit.
    //
    // FIXME: The depth check could be made to return the last non-aliasing
    // chain we found before we hit a tokenfactor rather than the original
    // chain.
    if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
      Aliases.clear();
      Aliases.push_back(OriginalChain);
      return;
    }

    if (Chain.getOpcode() == ISD::TokenFactor) {
      // We have to check each of the operands of the token factor for "small"
      // token factors, so we queue them up. Adding the operands to the queue
      // (stack) in reverse order maintains the original order and increases
      // the likelihood that getNode will find a matching token factor (CSE).
      if (Chain.getNumOperands() > 16) {
        Aliases.push_back(Chain);
        continue;
      }
      for (unsigned n = Chain.getNumOperands(); n;)
        Chains.push_back(Chain.getOperand(--n));
      ++Depth;
      continue;
    }
    // Everything else.
    if (ImproveChain(Chain)) {
      // Updated chain found; consider the new chain if one exists.
      if (Chain.getNode())
        Chains.push_back(Chain);
      ++Depth;
      continue;
    }
    // No improved chain is possible, so treat it as an alias.
    Aliases.push_back(Chain);
  }
}

/// Walk up chain skipping non-aliasing memory nodes, looking for a better
/// chain (aliasing node).
SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
  if (OptLevel == CodeGenOpt::None)
    return OldChain;

  // Ops for replacing token factor.
  SmallVector<SDValue, 8> Aliases;

  // Accumulate all the aliases to this node.
  GatherAllAliases(N, OldChain, Aliases);

  // If no operands then chain to entry token.
  if (Aliases.size() == 0)
    return DAG.getEntryNode();

  // If a single operand then chain to it. We don't need to revisit it.
  if (Aliases.size() == 1)
    return Aliases[0];

  // Construct a custom tailored token factor.
  return DAG.getTokenFactor(SDLoc(N), Aliases);
}

namespace {
// TODO: Replace with std::monostate when we move to C++17.
struct UnitT { } Unit;
bool operator==(const UnitT &, const UnitT &) { return true; }
bool operator!=(const UnitT &, const UnitT &) { return false; }
} // namespace

// This function tries to collect a bunch of potentially interesting
// nodes to improve the chains of, all at once. This might seem
// redundant, as this function gets called when visiting every store
// node, so why not let the work be done on each store as it's visited?
//
// I believe this is mainly important because mergeConsecutiveStores
// is unable to deal with merging stores of different sizes, so unless
// we improve the chains of all the potential candidates up-front
// before running mergeConsecutiveStores, it might only see some of
// the nodes that will eventually be candidates, and then not be able
// to go from a partially-merged state to the desired final
// fully-merged state.

bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
  SmallVector<StoreSDNode *, 8> ChainedStores;
  StoreSDNode *STChain = St;
  // Intervals records which offsets from BaseIndex have been covered. In the
  // common case, consecutive stores in the chain write to adjacent addresses,
  // so each new interval coalesces with an existing interval at insertion
  // time.

  using IMap =
      llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
  IMap::Allocator A;
  IMap Intervals(A);

  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return false;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return false;

  // BaseIndexOffset assumes that offsets are fixed-size, which
  // is not valid for scalable vectors where the offsets are
  // scaled by `vscale`, so bail out early.
  if (St->getMemoryVT().isScalableVector())
    return false;

  // Add ST's interval.
  Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);

  while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
    if (Chain->getMemoryVT().isScalableVector())
      return false;

    // If the chain has more than one use, then we can't reorder the mem ops.
    if (!SDValue(Chain, 0)->hasOneUse())
      break;
    // TODO: Relax for unordered atomics (see D66309)
    if (!Chain->isSimple() || Chain->isIndexed())
      break;

    // Find the base pointer and offset for this memory node.
    const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
    // Check that the base pointer is the same as the original one.
    int64_t Offset;
    if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
      break;
    int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
    // Make sure we don't overlap with other intervals by checking the ones to
    // the left or right before inserting.
    auto I = Intervals.find(Offset);
    // If there's a next interval, we should end before it.
23064 if (I != Intervals.end() && I.start() < (Offset + Length)) 23065 break; 23066 // If there's a previous interval, we should start after it. 23067 if (I != Intervals.begin() && (--I).stop() <= Offset) 23068 break; 23069 Intervals.insert(Offset, Offset + Length, Unit); 23070 23071 ChainedStores.push_back(Chain); 23072 STChain = Chain; 23073 } 23074 23075 // If we didn't find a chained store, exit. 23076 if (ChainedStores.size() == 0) 23077 return false; 23078 23079 // Improve all chained stores (St and ChainedStores members) starting from 23080 // where the store chain ended and return single TokenFactor. 23081 SDValue NewChain = STChain->getChain(); 23082 SmallVector<SDValue, 8> TFOps; 23083 for (unsigned I = ChainedStores.size(); I;) { 23084 StoreSDNode *S = ChainedStores[--I]; 23085 SDValue BetterChain = FindBetterChain(S, NewChain); 23086 S = cast<StoreSDNode>(DAG.UpdateNodeOperands( 23087 S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3))); 23088 TFOps.push_back(SDValue(S, 0)); 23089 ChainedStores[I] = S; 23090 } 23091 23092 // Improve St's chain. Use a new node to avoid creating a loop from CombineTo. 23093 SDValue BetterChain = FindBetterChain(St, NewChain); 23094 SDValue NewST; 23095 if (St->isTruncatingStore()) 23096 NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(), 23097 St->getBasePtr(), St->getMemoryVT(), 23098 St->getMemOperand()); 23099 else 23100 NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(), 23101 St->getBasePtr(), St->getMemOperand()); 23102 23103 TFOps.push_back(NewST); 23104 23105 // If we improved every element of TFOps, then we've lost the dependence on 23106 // NewChain to successors of St and we need to add it back to TFOps. Do so at 23107 // the beginning to keep relative order consistent with FindBetterChains. 23108 auto hasImprovedChain = [&](SDValue ST) -> bool { 23109 return ST->getOperand(0) != NewChain; 23110 }; 23111 bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain); 23112 if (AddNewChain) 23113 TFOps.insert(TFOps.begin(), NewChain); 23114 23115 SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps); 23116 CombineTo(St, TF); 23117 23118 // Add TF and its operands to the worklist. 23119 AddToWorklist(TF.getNode()); 23120 for (const SDValue &Op : TF->ops()) 23121 AddToWorklist(Op.getNode()); 23122 AddToWorklist(STChain); 23123 return true; 23124 } 23125 23126 bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { 23127 if (OptLevel == CodeGenOpt::None) 23128 return false; 23129 23130 const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); 23131 23132 // We must have a base and an offset. 23133 if (!BasePtr.getBase().getNode()) 23134 return false; 23135 23136 // Do not handle stores to undef base pointers. 23137 if (BasePtr.getBase().isUndef()) 23138 return false; 23139 23140 // Directly improve a chain of disjoint stores starting at St. 23141 if (parallelizeChainedStores(St)) 23142 return true; 23143 23144 // Improve St's Chain.. 23145 SDValue BetterChain = FindBetterChain(St, St->getChain()); 23146 if (St->getChain() != BetterChain) { 23147 replaceStoreChain(St, BetterChain); 23148 return true; 23149 } 23150 return false; 23151 } 23152 23153 /// This is the entry point for the file. 23154 void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, 23155 CodeGenOpt::Level OptLevel) { 23156 /// This is the main entry point to this class. 23157 DAGCombiner(*this, AA, OptLevel).Run(Level); 23158 } 23159