1 //===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run 10 // both before and after the DAG is legalized. 11 // 12 // This pass is not a substitute for the LLVM IR instcombine pass. This pass is 13 // primarily intended to handle simplification opportunities that are implicit 14 // in the LLVM IR and exposed by the various codegen lowering phases. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/ADT/APFloat.h" 19 #include "llvm/ADT/APInt.h" 20 #include "llvm/ADT/ArrayRef.h" 21 #include "llvm/ADT/DenseMap.h" 22 #include "llvm/ADT/IntervalMap.h" 23 #include "llvm/ADT/None.h" 24 #include "llvm/ADT/Optional.h" 25 #include "llvm/ADT/STLExtras.h" 26 #include "llvm/ADT/SetVector.h" 27 #include "llvm/ADT/SmallPtrSet.h" 28 #include "llvm/ADT/SmallSet.h" 29 #include "llvm/ADT/SmallVector.h" 30 #include "llvm/ADT/Statistic.h" 31 #include "llvm/Analysis/AliasAnalysis.h" 32 #include "llvm/Analysis/MemoryLocation.h" 33 #include "llvm/CodeGen/DAGCombine.h" 34 #include "llvm/CodeGen/ISDOpcodes.h" 35 #include "llvm/CodeGen/MachineFrameInfo.h" 36 #include "llvm/CodeGen/MachineFunction.h" 37 #include "llvm/CodeGen/MachineMemOperand.h" 38 #include "llvm/CodeGen/RuntimeLibcalls.h" 39 #include "llvm/CodeGen/SelectionDAG.h" 40 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" 41 #include "llvm/CodeGen/SelectionDAGNodes.h" 42 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 43 #include "llvm/CodeGen/TargetLowering.h" 44 #include "llvm/CodeGen/TargetRegisterInfo.h" 45 #include "llvm/CodeGen/TargetSubtargetInfo.h" 46 
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "dagcombine"

// Statistics (reported under -stats) counting the individual transformations
// performed by this pass.
STATISTIC(NodesCombined   , "Number of dag nodes combined");
STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
STATISTIC(SlicedLoads, "Number of load sliced");
STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops");

// Command-line knobs controlling the combiner's alias analysis behavior.
static cl::opt<bool>
CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                 cl::desc("Enable DAG combiner's use of IR alias analysis"));

static cl::opt<bool>
UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
        cl::desc("Enable DAG combiner's use of TBAA"));

#ifndef NDEBUG
// Debug-only knob to restrict combiner alias analysis to a single function,
// useful when bisecting miscompiles.
static cl::opt<std::string>
CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
                   cl::desc("Only use DAG-combiner alias analysis in this"
                            " function"));
#endif
/// Hidden option to stress test load slicing, i.e., when this option
/// is enabled, load slicing bypasses most of its profitability guards.
static cl::opt<bool>
StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
                  cl::desc("Bypass the profitability model of load slicing"),
                  cl::init(false));

static cl::opt<bool>
  MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
                    cl::desc("DAG combiner may split indexing from loads"));

static cl::opt<bool>
    EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true),
                       cl::desc("DAG combiner enable merging multiple stores "
                                "into a wider store"));

static cl::opt<unsigned> TokenFactorInlineLimit(
    "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
    cl::desc("Limit the number of operands to inline for Token Factors"));

static cl::opt<unsigned> StoreMergeDependenceLimit(
    "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10),
    cl::desc("Limit the number of times for the same StoreNode and RootNode "
             "to bail out in store merging dependence check"));

namespace {

/// Worklist-driven combiner over a SelectionDAG; see the file header comment
/// for the pass's overall purpose.
class DAGCombiner {
  SelectionDAG &DAG;
  const TargetLowering &TLI;
  CombineLevel Level;
  CodeGenOpt::Level OptLevel;
  // Legalization state flags: which phases have already run on this DAG.
  bool LegalDAG = false;
  bool LegalOperations = false;
  bool LegalTypes = false;
  // True when the function is optimized for size (drives size-vs-speed
  // decisions in individual combines).
  bool ForCodeSize;

  /// Worklist of all of the nodes that need to be simplified.
  ///
  /// This must behave as a stack -- new nodes to process are pushed onto the
  /// back and when processing we pop off of the back.
  ///
  /// The worklist will not contain duplicates but may contain null entries
  /// due to nodes being deleted from the underlying DAG.
  SmallVector<SDNode *, 64> Worklist;

  /// Mapping from an SDNode to its position on the worklist.
  ///
  /// This is used to find and remove nodes from the worklist (by nulling
  /// them) when they are deleted from the underlying DAG. It relies on
  /// stable indices of nodes within the worklist.
  DenseMap<SDNode *, unsigned> WorklistMap;

  /// This records all nodes attempted to be added to the worklist since we
  /// considered a new worklist entry. Since we do not add duplicate nodes
  /// to the worklist, this can differ from the tail of the worklist.
  SmallSetVector<SDNode *, 32> PruningList;

  /// Set of nodes which have been combined (at least once).
  ///
  /// This is used to allow us to reliably add any operands of a DAG node
  /// which have not yet been combined to the worklist.
  SmallPtrSet<SDNode *, 32> CombinedNodes;

  /// Map from candidate StoreNode to the pair of RootNode and count.
  /// The count is used to track how many times we have seen the StoreNode
  /// with the same RootNode bail out in dependence check. If we have seen
  /// the bail out for the same pair many times over a limit, we won't
  /// consider the StoreNode with the same RootNode as store merging
  /// candidate again.
  DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap;

  // AA - Used for DAG load/store alias analysis.
  AliasAnalysis *AA;

  /// When an instruction is simplified, add all users of the instruction to
  /// the work lists because they might get more simplified now.
  void AddUsersToWorklist(SDNode *N) {
    for (SDNode *Node : N->uses())
      AddToWorklist(Node);
  }

  /// Convenient shorthand to add a node and all of its users to the worklist.
  void AddToWorklistWithUsers(SDNode *N) {
    AddUsersToWorklist(N);
    AddToWorklist(N);
  }

  // Prune potentially dangling nodes. This is called after
  // any visit to a node, but should also be called during a visit after any
  // failed combine which may have created a DAG node.
  void clearAddedDanglingWorklistEntries() {
    // Check any nodes added to the worklist to see if they are prunable.
    while (!PruningList.empty()) {
      auto *N = PruningList.pop_back_val();
      if (N->use_empty())
        recursivelyDeleteUnusedNodes(N);
    }
  }

  /// Pop the next node to process off the worklist, skipping entries that
  /// were nulled out when their node was deleted. Returns nullptr when the
  /// worklist is exhausted.
  SDNode *getNextWorklistEntry() {
    // Before we do any work, remove nodes that are not in use.
    clearAddedDanglingWorklistEntries();
    SDNode *N = nullptr;
    // The Worklist holds the SDNodes in order, but it may contain null
    // entries.
    while (!N && !Worklist.empty()) {
      N = Worklist.pop_back_val();
    }

    if (N) {
      bool GoodWorklistEntry = WorklistMap.erase(N);
      (void)GoodWorklistEntry;
      assert(GoodWorklistEntry &&
             "Found a worklist entry without a corresponding map entry!");
    }
    return N;
  }

  /// Call the node-specific routine that folds each particular type of node.
  SDValue visit(SDNode *N);

public:
  DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
        OptLevel(OL), AA(AA) {
    ForCodeSize = DAG.shouldOptForSize();

    // Precompute the widest legal store width for this target.
    MaximumLegalStoreInBits = 0;
    // We use the minimum store size here, since that's all we can guarantee
    // for the scalable vector types.
    for (MVT VT : MVT::all_valuetypes())
      if (EVT(VT).isSimple() && VT != MVT::Other &&
          TLI.isTypeLegal(EVT(VT)) &&
          VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits)
        MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize();
  }

  void ConsiderForPruning(SDNode *N) {
    // Mark this for potential pruning.
    PruningList.insert(N);
  }

  /// Add to the worklist making sure its instance is at the back (next to be
  /// processed.)
  void AddToWorklist(SDNode *N) {
    assert(N->getOpcode() != ISD::DELETED_NODE &&
           "Deleted Node added to Worklist");

    // Skip handle nodes as they can't usefully be combined and confuse the
    // zero-use deletion strategy.
    if (N->getOpcode() == ISD::HANDLENODE)
      return;

    ConsiderForPruning(N);

    // Only push if not already present; WorklistMap records the node's
    // (stable) index into Worklist.
    if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
      Worklist.push_back(N);
  }

  /// Remove all instances of N from the worklist.
  void removeFromWorklist(SDNode *N) {
    CombinedNodes.erase(N);
    PruningList.remove(N);
    StoreRootCountMap.erase(N);

    auto It = WorklistMap.find(N);
    if (It == WorklistMap.end())
      return; // Not in the worklist.

    // Null out the entry rather than erasing it to avoid a linear operation.
    Worklist[It->second] = nullptr;
    WorklistMap.erase(It);
  }

  void deleteAndRecombine(SDNode *N);
  bool recursivelyDeleteUnusedNodes(SDNode *N);

  /// Replaces all uses of the results of one DAG node with new values.
  SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                    bool AddTo = true);

  /// Replaces all uses of the results of one DAG node with new values.
  SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
    return CombineTo(N, &Res, 1, AddTo);
  }

  /// Replaces all uses of the results of one DAG node with new values.
  SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
                    bool AddTo = true) {
    SDValue To[] = { Res0, Res1 };
    return CombineTo(N, To, 2, AddTo);
  }

  void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);

private:
  // Widest store width (in bits) that is legal for this target; computed in
  // the constructor.
  unsigned MaximumLegalStoreInBits;

  /// Check the specified integer node value to see if it can be simplified or
  /// if things it uses can be simplified by bit propagation.
  /// If so, return true.
303 bool SimplifyDemandedBits(SDValue Op) { 304 unsigned BitWidth = Op.getScalarValueSizeInBits(); 305 APInt DemandedBits = APInt::getAllOnesValue(BitWidth); 306 return SimplifyDemandedBits(Op, DemandedBits); 307 } 308 309 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) { 310 EVT VT = Op.getValueType(); 311 unsigned NumElts = VT.isVector() ? VT.getVectorNumElements() : 1; 312 APInt DemandedElts = APInt::getAllOnesValue(NumElts); 313 return SimplifyDemandedBits(Op, DemandedBits, DemandedElts); 314 } 315 316 /// Check the specified vector node value to see if it can be simplified or 317 /// if things it uses can be simplified as it only uses some of the 318 /// elements. If so, return true. 319 bool SimplifyDemandedVectorElts(SDValue Op) { 320 unsigned NumElts = Op.getValueType().getVectorNumElements(); 321 APInt DemandedElts = APInt::getAllOnesValue(NumElts); 322 return SimplifyDemandedVectorElts(Op, DemandedElts); 323 } 324 325 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, 326 const APInt &DemandedElts); 327 bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts, 328 bool AssumeSingleUse = false); 329 330 bool CombineToPreIndexedLoadStore(SDNode *N); 331 bool CombineToPostIndexedLoadStore(SDNode *N); 332 SDValue SplitIndexingFromLoad(LoadSDNode *LD); 333 bool SliceUpLoad(SDNode *N); 334 335 // Scalars have size 0 to distinguish from singleton vectors. 336 SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD); 337 bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); 338 bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val); 339 340 /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed 341 /// load. 342 /// 343 /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. 344 /// \param InVecVT type of the input vector to EVE with bitcasts resolved. 345 /// \param EltNo index of the vector element to load. 346 /// \param OriginalLoad load that EVE came from to be replaced. 
  /// \returns EVE on success SDValue() on failure.
  SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
                                       SDValue EltNo,
                                       LoadSDNode *OriginalLoad);
  void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
  // Integer-promotion helpers: rebuild an operation in a wider legal type.
  SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
  SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
  SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
  SDValue PromoteIntBinOp(SDValue Op);
  SDValue PromoteIntShiftOp(SDValue Op);
  SDValue PromoteExtend(SDValue Op);
  bool PromoteLoad(SDValue Op);

  /// Call the node-specific routine that knows how to fold each
  /// particular type of node. If that doesn't do anything, try the
  /// target-specific DAG combines.
  SDValue combine(SDNode *N);

  // Visitation implementation - Implement dag node combining for different
  // node types. The semantics are as follows:
  // Return Value:
  //   SDValue.getNode() == 0 - No change was made
  //   SDValue.getNode() == N - N was replaced, is dead and has been handled.
  //   otherwise              - N should be replaced by the returned Operand.
  //
  SDValue visitTokenFactor(SDNode *N);
  SDValue visitMERGE_VALUES(SDNode *N);
  // Integer arithmetic.
  SDValue visitADD(SDNode *N);
  SDValue visitADDLike(SDNode *N);
  SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference);
  SDValue visitSUB(SDNode *N);
  SDValue visitADDSAT(SDNode *N);
  SDValue visitSUBSAT(SDNode *N);
  SDValue visitADDC(SDNode *N);
  SDValue visitADDO(SDNode *N);
  SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
  SDValue visitSUBC(SDNode *N);
  SDValue visitSUBO(SDNode *N);
  SDValue visitADDE(SDNode *N);
  SDValue visitADDCARRY(SDNode *N);
  SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
  SDValue visitSUBE(SDNode *N);
  SDValue visitSUBCARRY(SDNode *N);
  SDValue visitMUL(SDNode *N);
  SDValue visitMULFIX(SDNode *N);
  SDValue useDivRem(SDNode *N);
  SDValue visitSDIV(SDNode *N);
  SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
  SDValue visitUDIV(SDNode *N);
  SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
  SDValue visitREM(SDNode *N);
  SDValue visitMULHU(SDNode *N);
  SDValue visitMULHS(SDNode *N);
  SDValue visitSMUL_LOHI(SDNode *N);
  SDValue visitUMUL_LOHI(SDNode *N);
  SDValue visitMULO(SDNode *N);
  SDValue visitIMINMAX(SDNode *N);
  // Bitwise logic and shifts.
  SDValue visitAND(SDNode *N);
  SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
  SDValue visitOR(SDNode *N);
  SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
  SDValue visitXOR(SDNode *N);
  SDValue SimplifyVBinOp(SDNode *N);
  SDValue visitSHL(SDNode *N);
  SDValue visitSRA(SDNode *N);
  SDValue visitSRL(SDNode *N);
  SDValue visitFunnelShift(SDNode *N);
  SDValue visitRotate(SDNode *N);
  SDValue visitABS(SDNode *N);
  SDValue visitBSWAP(SDNode *N);
  SDValue visitBITREVERSE(SDNode *N);
  SDValue visitCTLZ(SDNode *N);
  SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
  SDValue visitCTTZ(SDNode *N);
  SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
  SDValue visitCTPOP(SDNode *N);
  // Selects and comparisons.
  SDValue visitSELECT(SDNode *N);
  SDValue visitVSELECT(SDNode *N);
  SDValue visitSELECT_CC(SDNode *N);
  SDValue visitSETCC(SDNode *N);
  SDValue visitSETCCCARRY(SDNode *N);
  // Extensions, truncations and casts.
  SDValue visitSIGN_EXTEND(SDNode *N);
  SDValue visitZERO_EXTEND(SDNode *N);
  SDValue visitANY_EXTEND(SDNode *N);
  SDValue visitAssertExt(SDNode *N);
  SDValue visitSIGN_EXTEND_INREG(SDNode *N);
  SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
  SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
  SDValue visitTRUNCATE(SDNode *N);
  SDValue visitBITCAST(SDNode *N);
  SDValue visitBUILD_PAIR(SDNode *N);
  // Floating point.
  SDValue visitFADD(SDNode *N);
  SDValue visitFSUB(SDNode *N);
  SDValue visitFMUL(SDNode *N);
  SDValue visitFMA(SDNode *N);
  SDValue visitFDIV(SDNode *N);
  SDValue visitFREM(SDNode *N);
  SDValue visitFSQRT(SDNode *N);
  SDValue visitFCOPYSIGN(SDNode *N);
  SDValue visitFPOW(SDNode *N);
  SDValue visitSINT_TO_FP(SDNode *N);
  SDValue visitUINT_TO_FP(SDNode *N);
  SDValue visitFP_TO_SINT(SDNode *N);
  SDValue visitFP_TO_UINT(SDNode *N);
  SDValue visitFP_ROUND(SDNode *N);
  SDValue visitFP_EXTEND(SDNode *N);
  SDValue visitFNEG(SDNode *N);
  SDValue visitFABS(SDNode *N);
  SDValue visitFCEIL(SDNode *N);
  SDValue visitFTRUNC(SDNode *N);
  SDValue visitFFLOOR(SDNode *N);
  SDValue visitFMINNUM(SDNode *N);
  SDValue visitFMAXNUM(SDNode *N);
  SDValue visitFMINIMUM(SDNode *N);
  SDValue visitFMAXIMUM(SDNode *N);
  // Control flow and memory.
  SDValue visitBRCOND(SDNode *N);
  SDValue visitBR_CC(SDNode *N);
  SDValue visitLOAD(SDNode *N);

  SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
  SDValue replaceStoreOfFPConstant(StoreSDNode *ST);

  SDValue visitSTORE(SDNode *N);
  SDValue visitLIFETIME_END(SDNode *N);
  // Vector operations.
  SDValue visitINSERT_VECTOR_ELT(SDNode *N);
  SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
  SDValue visitBUILD_VECTOR(SDNode *N);
  SDValue visitCONCAT_VECTORS(SDNode *N);
  SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
  SDValue visitVECTOR_SHUFFLE(SDNode *N);
  SDValue visitSCALAR_TO_VECTOR(SDNode *N);
  SDValue visitINSERT_SUBVECTOR(SDNode *N);
  SDValue visitMLOAD(SDNode *N);
  SDValue visitMSTORE(SDNode *N);
  SDValue visitMGATHER(SDNode *N);
  SDValue visitMSCATTER(SDNode *N);
  SDValue visitFP_TO_FP16(SDNode *N);
  SDValue visitFP16_TO_FP(SDNode *N);
  SDValue visitVECREDUCE(SDNode *N);

  // FMA formation helpers, called from the FADD/FSUB/FMUL visitors.
  SDValue visitFADDForFMACombine(SDNode *N);
  SDValue visitFSUBForFMACombine(SDNode *N);
  SDValue visitFMULForFMADistributiveCombine(SDNode *N);

  SDValue XformToShuffleWithZero(SDNode *N);
  bool reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                  const SDLoc &DL, SDValue N0,
                                                  SDValue N1);
  SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
                                    SDValue N1);
  SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                         SDValue N1, SDNodeFlags Flags);

  SDValue visitShiftByConstant(SDNode *N);

  // Select folding helpers.
  SDValue foldSelectOfConstants(SDNode *N);
  SDValue foldVSelectOfConstants(SDNode *N);
  SDValue foldBinOpIntoSelect(SDNode *BO);
  bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
  SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
  SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
  SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                           SDValue N2, SDValue N3, ISD::CondCode CC,
                           bool NotExtCompare = false);
  SDValue convertSelectOfFPConstantsToLoadOffset(
      const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
      ISD::CondCode CC);
  SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                 SDValue N2, SDValue N3, ISD::CondCode CC);
  SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                            const SDLoc &DL);
  SDValue unfoldMaskedMerge(SDNode *N);
  SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
  SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                        const SDLoc &DL, bool foldBooleans);
  SDValue rebuildSetCC(SDValue N);

  bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
                         SDValue &CC, bool MatchStrict = false) const;
  bool isOneUseSetCC(SDValue N) const;
  bool isCheaperToUseNegatedFPOps(SDValue X, SDValue Y);

  SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                     unsigned HiOp);
  SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
  SDValue CombineExtLoad(SDNode *N);
  SDValue CombineZExtLogicopShiftLoad(SDNode *N);
  SDValue combineRepeatedFPDivisors(SDNode *N);
  SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
  SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
  // Division expansion helpers.
  SDValue BuildSDIV(SDNode *N);
  SDValue BuildSDIVPow2(SDNode *N);
  SDValue BuildUDIV(SDNode *N);
  SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
  // Reciprocal / square-root estimate expansion (Newton-Raphson).
  SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
  SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
  SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
  SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
  SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
                              SDNodeFlags Flags, bool Reciprocal);
  SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
                              SDNodeFlags Flags, bool Reciprocal);
  // Pattern matchers for byte-swap, rotate and load/store combining.
  SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                             bool DemandHighBits = true);
  SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
  SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
                            SDValue InnerPos, SDValue InnerNeg,
                            unsigned PosOpcode, unsigned NegOpcode,
                            const SDLoc &DL);
  SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
  SDValue MatchLoadCombine(SDNode *N);
  SDValue MatchStoreCombine(StoreSDNode *N);
  SDValue ReduceLoadWidth(SDNode *N);
  SDValue ReduceLoadOpStoreWidth(SDNode *N);
  SDValue splitMergedValStore(StoreSDNode *ST);
  SDValue TransformFPLoadStorePair(SDNode *N);
  // BUILD_VECTOR simplification helpers.
  SDValue convertBuildVecZextToZext(SDNode *N);
  SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
  SDValue reduceBuildVecTruncToBitCast(SDNode *N);
  SDValue reduceBuildVecToShuffle(SDNode *N);
  SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                ArrayRef<int> VectorMask, SDValue VecIn1,
                                SDValue VecIn2, unsigned LeftIdx,
                                bool DidSplitVec);
  SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);

  /// Walk up chain skipping non-aliasing memory nodes,
  /// looking for aliasing nodes and adding them to the Aliases vector.
  void GatherAllAliases(SDNode *N, SDValue OriginalChain,
                        SmallVectorImpl<SDValue> &Aliases);

  /// Return true if there is any possibility that the two addresses overlap.
  bool isAlias(SDNode *Op0, SDNode *Op1) const;

  /// Walk up chain skipping non-aliasing memory nodes, looking for a better
  /// chain (aliasing node.)
  SDValue FindBetterChain(SDNode *N, SDValue Chain);

  /// Try to replace a store and any possibly adjacent stores on
  /// consecutive chains with better chains. Return true only if St is
  /// replaced.
  ///
  /// Notice that other chains may still be replaced even if the function
  /// returns false.
  bool findBetterNeighborChains(StoreSDNode *St);

  // Helper for findBetterNeighborChains. Walk up the store chain and add
  // additional chained stores that do not overlap and can be parallelized.
  bool parallelizeChainedStores(StoreSDNode *St);

  /// Holds a pointer to an LSBaseSDNode as well as information on where it
  /// is located in a sequence of memory operations connected by a chain.
  struct MemOpLink {
    // Ptr to the mem node.
    LSBaseSDNode *MemNode;

    // Offset from the base ptr.
    int64_t OffsetFromBase;

    MemOpLink(LSBaseSDNode *N, int64_t Offset)
        : MemNode(N), OffsetFromBase(Offset) {}
  };

  /// This is a helper function for visitMUL to check the profitability
  /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
  /// MulNode is the original multiply, AddNode is (add x, c1),
  /// and ConstNode is c2.
  bool isMulAddWithConstProfitable(SDNode *MulNode,
                                   SDValue &AddNode,
                                   SDValue &ConstNode);

  /// This is a helper function for visitAND and visitZERO_EXTEND. Returns
  /// true if the (and (load x) c) pattern matches an extload. ExtVT returns
  /// the type of the loaded value to be extended.
  bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                        EVT LoadResultTy, EVT &ExtVT);

  /// Helper function to calculate whether the given Load/Store can have its
  /// width reduced to ExtVT.
  bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
                         EVT &MemVT, unsigned ShAmt = 0);

  /// Used by BackwardsPropagateMask to find suitable loads.
  bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads,
                         SmallPtrSetImpl<SDNode*> &NodesWithConsts,
                         ConstantSDNode *Mask, SDNode *&NodeToMask);
  /// Attempt to propagate a given AND node back to load leaves so that they
  /// can be combined into narrow loads.
  bool BackwardsPropagateMask(SDNode *N);

  /// Helper function for MergeConsecutiveStores which merges the
  /// component store chains.
  SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                              unsigned NumStores);

  /// This is a helper function for MergeConsecutiveStores. When the
  /// source elements of the consecutive stores are all constants or
  /// all extracted vector elements, try to merge them into one
  /// larger store introducing bitcasts if necessary. \return True
  /// if a merged store was created.
  bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
                                       EVT MemVT, unsigned NumStores,
                                       bool IsConstantSrc, bool UseVector,
                                       bool UseTrunc);

  /// This is a helper function for MergeConsecutiveStores. Stores
  /// that potentially may be merged with St are placed in
  /// StoreNodes. RootNode is a chain predecessor to all store
  /// candidates.
  void getStoreMergeCandidates(StoreSDNode *St,
                               SmallVectorImpl<MemOpLink> &StoreNodes,
                               SDNode *&Root);

  /// Helper function for MergeConsecutiveStores. Checks if
  /// candidate stores have indirect dependency through their
  /// operands. RootNode is the predecessor to all stores calculated
  /// by getStoreMergeCandidates and is used to prune the dependency check.
  /// \return True if safe to merge.
  bool checkMergeStoreCandidatesForDependencies(
      SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
      SDNode *RootNode);

  /// Merge consecutive store operations into a wide store.
  /// This optimization uses wide integers or vectors when possible.
  /// \return True if stores were merged into a merged store (the
  /// affected nodes are stored as a prefix in \p StoreNodes).
  bool MergeConsecutiveStores(StoreSDNode *St);

  /// Try to transform a truncation where C is a constant:
  ///   (trunc (and X, C)) -> (and (trunc X), (trunc C))
  ///
  /// \p N needs to be a truncation and its first operand an AND. Other
  /// requirements are checked by the function (e.g. that trunc is
  /// single-use) and if missed an empty SDValue is returned.
  SDValue distributeTruncateThroughAnd(SDNode *N);

  /// Helper function to determine whether the target supports operation
  /// given by \p Opcode for type \p VT, that is, whether the operation
  /// is legal or custom before legalizing operations, and whether is
  /// legal (but not custom) after legalization.
687 bool hasOperation(unsigned Opcode, EVT VT) { 688 if (LegalOperations) 689 return TLI.isOperationLegal(Opcode, VT); 690 return TLI.isOperationLegalOrCustom(Opcode, VT); 691 } 692 693 public: 694 /// Runs the dag combiner on all nodes in the work list 695 void Run(CombineLevel AtLevel); 696 697 SelectionDAG &getDAG() const { return DAG; } 698 699 /// Returns a type large enough to hold any valid shift amount - before type 700 /// legalization these can be huge. 701 EVT getShiftAmountTy(EVT LHSTy) { 702 assert(LHSTy.isInteger() && "Shift amount is not an integer type!"); 703 return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes); 704 } 705 706 /// This method returns true if we are running before type legalization or 707 /// if the specified VT is legal. 708 bool isTypeLegal(const EVT &VT) { 709 if (!LegalTypes) return true; 710 return TLI.isTypeLegal(VT); 711 } 712 713 /// Convenience wrapper around TargetLowering::getSetCCResultType 714 EVT getSetCCResultType(EVT VT) const { 715 return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 716 } 717 718 void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, 719 SDValue OrigLoad, SDValue ExtLoad, 720 ISD::NodeType ExtType); 721 }; 722 723 /// This class is a DAGUpdateListener that removes any deleted 724 /// nodes from the worklist. 
725 class WorklistRemover : public SelectionDAG::DAGUpdateListener { 726 DAGCombiner &DC; 727 728 public: 729 explicit WorklistRemover(DAGCombiner &dc) 730 : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} 731 732 void NodeDeleted(SDNode *N, SDNode *E) override { 733 DC.removeFromWorklist(N); 734 } 735 }; 736 737 class WorklistInserter : public SelectionDAG::DAGUpdateListener { 738 DAGCombiner &DC; 739 740 public: 741 explicit WorklistInserter(DAGCombiner &dc) 742 : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} 743 744 // FIXME: Ideally we could add N to the worklist, but this causes exponential 745 // compile time costs in large DAGs, e.g. Halide. 746 void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); } 747 }; 748 749 } // end anonymous namespace 750 751 //===----------------------------------------------------------------------===// 752 // TargetLowering::DAGCombinerInfo implementation 753 //===----------------------------------------------------------------------===// 754 755 void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) { 756 ((DAGCombiner*)DC)->AddToWorklist(N); 757 } 758 759 SDValue TargetLowering::DAGCombinerInfo:: 760 CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) { 761 return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo); 762 } 763 764 SDValue TargetLowering::DAGCombinerInfo:: 765 CombineTo(SDNode *N, SDValue Res, bool AddTo) { 766 return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo); 767 } 768 769 SDValue TargetLowering::DAGCombinerInfo:: 770 CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { 771 return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); 772 } 773 774 bool TargetLowering::DAGCombinerInfo:: 775 recursivelyDeleteUnusedNodes(SDNode *N) { 776 return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N); 777 } 778 779 void TargetLowering::DAGCombinerInfo:: 780 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { 781 return 
((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); 782 } 783 784 //===----------------------------------------------------------------------===// 785 // Helper Functions 786 //===----------------------------------------------------------------------===// 787 788 void DAGCombiner::deleteAndRecombine(SDNode *N) { 789 removeFromWorklist(N); 790 791 // If the operands of this node are only used by the node, they will now be 792 // dead. Make sure to re-visit them and recursively delete dead nodes. 793 for (const SDValue &Op : N->ops()) 794 // For an operand generating multiple values, one of the values may 795 // become dead allowing further simplification (e.g. split index 796 // arithmetic from an indexed load). 797 if (Op->hasOneUse() || Op->getNumValues() > 1) 798 AddToWorklist(Op.getNode()); 799 800 DAG.DeleteNode(N); 801 } 802 803 // APInts must be the same size for most operations, this helper 804 // function zero extends the shorter of the pair so that they match. 805 // We provide an Offset so that we can create bitwidths that won't overflow. 806 static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) { 807 unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth()); 808 LHS = LHS.zextOrSelf(Bits); 809 RHS = RHS.zextOrSelf(Bits); 810 } 811 812 // Return true if this node is a setcc, or is a select_cc 813 // that selects between the target values used for true and false, making it 814 // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to 815 // the appropriate nodes based on the type of node we are checking. This 816 // simplifies life a bit for the callers. 
817 bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, 818 SDValue &CC, bool MatchStrict) const { 819 if (N.getOpcode() == ISD::SETCC) { 820 LHS = N.getOperand(0); 821 RHS = N.getOperand(1); 822 CC = N.getOperand(2); 823 return true; 824 } 825 826 if (MatchStrict && 827 (N.getOpcode() == ISD::STRICT_FSETCC || 828 N.getOpcode() == ISD::STRICT_FSETCCS)) { 829 LHS = N.getOperand(1); 830 RHS = N.getOperand(2); 831 CC = N.getOperand(3); 832 return true; 833 } 834 835 if (N.getOpcode() != ISD::SELECT_CC || 836 !TLI.isConstTrueVal(N.getOperand(2).getNode()) || 837 !TLI.isConstFalseVal(N.getOperand(3).getNode())) 838 return false; 839 840 if (TLI.getBooleanContents(N.getValueType()) == 841 TargetLowering::UndefinedBooleanContent) 842 return false; 843 844 LHS = N.getOperand(0); 845 RHS = N.getOperand(1); 846 CC = N.getOperand(4); 847 return true; 848 } 849 850 /// Return true if this is a SetCC-equivalent operation with only one use. 851 /// If this is true, it allows the users to invert the operation for free when 852 /// it is profitable to do so. 853 bool DAGCombiner::isOneUseSetCC(SDValue N) const { 854 SDValue N0, N1, N2; 855 if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) 856 return true; 857 return false; 858 } 859 860 // Returns the SDNode if it is a constant float BuildVector 861 // or constant float. 862 static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) { 863 if (isa<ConstantFPSDNode>(N)) 864 return N.getNode(); 865 if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) 866 return N.getNode(); 867 return nullptr; 868 } 869 870 // Determines if it is a constant integer or a build vector of constant 871 // integers (and undefs). 872 // Do not permit build vector implicit truncation. 
static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
    return !(Const->isOpaque() && NoOpaques);
  if (N.getOpcode() != ISD::BUILD_VECTOR)
    return false;
  unsigned BitWidth = N.getScalarValueSizeInBits();
  for (const SDValue &Op : N->op_values()) {
    if (Op.isUndef())
      continue;
    ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
    // Reject implicitly-truncated elements: every constant must exactly match
    // the vector's scalar bit width.
    if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
        (Const->isOpaque() && NoOpaques))
      return false;
  }
  return true;
}

// Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
// undef's.
static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
  if (V.getOpcode() != ISD::BUILD_VECTOR)
    return false;
  return isConstantOrConstantVector(V, NoOpaques) ||
         ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
}

/// Return true if folding the two constant offsets of a reassociated add
/// could turn a currently-legal addressing mode of one of N0's load/store
/// users into an illegal one.
bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                             const SDLoc &DL,
                                                             SDValue N0,
                                                             SDValue N1) {
  // Currently this only tries to ensure we don't undo the GEP splits done by
  // CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
  // we check if the following transformation would be problematic:
  // (load/store (add, (add, x, offset1), offset2)) ->
  // (load/store (add, x, offset1+offset2)).

  if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
    return false;

  // A single-use inner add feeds only this node, so no memory user of it can
  // be affected by the reassociation.
  if (N0.hasOneUse())
    return false;

  auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(N1);
  if (!C1 || !C2)
    return false;

  const APInt &C1APIntVal = C1->getAPIntValue();
  const APInt &C2APIntVal = C2->getAPIntValue();
  // Offsets wider than 64 bits cannot be represented in AddrMode::BaseOffs.
  if (C1APIntVal.getBitWidth() > 64 || C2APIntVal.getBitWidth() > 64)
    return false;

  const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
  if (CombinedValueIntVal.getBitWidth() > 64)
    return false;
  const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();

  for (SDNode *Node : N0->uses()) {
    auto LoadStore = dyn_cast<MemSDNode>(Node);
    if (LoadStore) {
      // Is x[offset2] already not a legal addressing mode? If so then
      // reassociating the constants breaks nothing (we test offset2 because
      // that's the one we hope to fold into the load or store).
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = C2APIntVal.getSExtValue();
      EVT VT = LoadStore->getMemoryVT();
      unsigned AS = LoadStore->getAddressSpace();
      Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
        continue;

      // Would x[offset1+offset2] still be a legal addressing mode?
      AM.BaseOffs = CombinedValue;
      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
        return true;
    }
  }

  return false;
}

// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
                                               SDValue N0, SDValue N1) {
  EVT VT = N0.getValueType();

  if (N0.getOpcode() != Opc)
    return SDValue();

  // Don't reassociate reductions.
  if (N0->getFlags().hasVectorReduction())
    return SDValue();

  if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
    if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
      // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
      if (SDValue OpNode =
              DAG.FoldConstantArithmetic(Opc, DL, VT, {N0.getOperand(1), N1}))
        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
      return SDValue();
    }
    if (N0.hasOneUse()) {
      // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
      // iff (op x, c1) has one use
      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
      if (!OpNode.getNode())
        return SDValue();
      return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
    }
  }
  return SDValue();
}

// Try to reassociate commutative binops.
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                                    SDValue N1, SDNodeFlags Flags) {
  assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
  // Don't reassociate reductions.
  if (Flags.hasVectorReduction())
    return SDValue();

  // Floating-point reassociation is not allowed without loose FP math.
  if (N0.getValueType().isFloatingPoint() ||
      N1.getValueType().isFloatingPoint())
    if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
      return SDValue();

  // Try both operand orders; the helper only matches when its first operand
  // is itself an Opc node.
  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
    return Combined;
  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
    return Combined;
  return SDValue();
}

/// Replace all of N's values with the NumTo values in To, update the worklist,
/// and delete N if it became dead. Returns SDValue(N, 0).
SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                               bool AddTo) {
  assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
             To[0].getNode()->dump(&DAG);
             dbgs() << " and " << NumTo - 1 << " other values\n");
  for (unsigned i = 0, e = NumTo; i != e; ++i)
    assert((!To[i].getNode() ||
            N->getValueType(i) == To[i].getValueType()) &&
           "Cannot combine value to value of different type!");

  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesWith(N, To);
  if (AddTo) {
    // Push the new nodes and any users onto the worklist
    for (unsigned i = 0, e = NumTo; i != e; ++i) {
      if (To[i].getNode()) {
        AddToWorklist(To[i].getNode());
        AddUsersToWorklist(To[i].getNode());
      }
    }
  }

  // Finally, if the node is now dead, remove it from the graph.  The node
  // may not be dead if the replacement process recursively simplified to
  // something else needing this node.
  if (N->use_empty())
    deleteAndRecombine(N);
  return SDValue(N, 0);
}

/// Apply a TargetLoweringOpt replacement (TLO.Old -> TLO.New) to the DAG and
/// keep the combiner worklist in sync.
void DAGCombiner::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
  // Replace all uses.  If any nodes become isomorphic to other nodes and
  // are deleted, make sure to remove them from our worklist.
  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);

  // Push the new node and any (possibly new) users onto the worklist.
  AddToWorklistWithUsers(TLO.New.getNode());

  // Finally, if the node is now dead, remove it from the graph.  The node
  // may not be dead if the replacement process recursively simplified to
  // something else needing this node.
  if (TLO.Old.getNode()->use_empty())
    deleteAndRecombine(TLO.Old.getNode());
}

/// Check the specified integer node value to see if it can be simplified or if
/// things it uses can be simplified by bit propagation. If so, return true.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                                       const APInt &DemandedElts) {
  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
  KnownBits Known;
  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO))
    return false;

  // Revisit the node.
  AddToWorklist(Op.getNode());

  // Replace the old value with the new one.
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
             dbgs() << '\n');

  CommitTargetLoweringOpt(TLO);
  return true;
}

/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the elements.
/// If so, return true.
bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
                                             const APInt &DemandedElts,
                                             bool AssumeSingleUse) {
  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
  APInt KnownUndef, KnownZero;
  if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
                                      TLO, 0, AssumeSingleUse))
    return false;

  // Revisit the node.
  AddToWorklist(Op.getNode());

  // Replace the old value with the new one.
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
             dbgs() << '\n');

  CommitTargetLoweringOpt(TLO);
  return true;
}

/// Replace Load's value result with a truncate of ExtLoad's value and Load's
/// chain result with ExtLoad's chain, then delete Load.
void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
  SDLoc DL(Load);
  EVT VT = Load->getValueType(0);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));

  LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
             Trunc.getNode()->dump(&DAG); dbgs() << '\n');
  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
  deleteAndRecombine(Load);
  AddToWorklist(Trunc.getNode());
}

/// Produce a value of type PVT equivalent to Op. Unindexed loads become
/// extending loads (Replace is set so the caller re-wires the original load's
/// users); AssertSext/AssertZext/Constant are rebuilt explicitly; anything
/// else gets an ANY_EXTEND if that is legal.
SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
  Replace = false;
  SDLoc DL(Op);
  if (ISD::isUNINDEXEDLoad(Op.getNode())) {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    EVT MemVT = LD->getMemoryVT();
    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
                                                      : LD->getExtensionType();
    Replace = true;
    return DAG.getExtLoad(ExtType, DL, PVT,
                          LD->getChain(), LD->getBasePtr(),
                          MemVT, LD->getMemOperand());
  }

  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  default: break;
  case ISD::AssertSext:
    if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
      return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
    break;
  case ISD::AssertZext:
    if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
      return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
    break;
  case ISD::Constant: {
    unsigned ExtOpc =
        Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(ExtOpc, DL, PVT, Op);
  }
  }

  if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
    return SDValue();
  return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
}

/// Promote Op to PVT and then sign-extend-in-reg from its old type, so the
/// promoted value has well-defined high bits.
SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
  if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
    return SDValue();
  EVT OldVT = Op.getValueType();
  SDLoc DL(Op);
  bool Replace = false;
  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
  if (!NewOp.getNode())
    return SDValue();
  AddToWorklist(NewOp.getNode());

  if (Replace)
    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp,
                     DAG.getValueType(OldVT));
}

/// Promote Op to PVT and then zero-extend-in-reg from its old type.
SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
  EVT OldVT = Op.getValueType();
  SDLoc DL(Op);
  bool Replace = false;
  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
  if (!NewOp.getNode())
    return SDValue();
  AddToWorklist(NewOp.getNode());

  if (Replace)
    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
  return DAG.getZeroExtendInReg(NewOp, DL, OldVT);
}

/// Promote the specified integer binary operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

    bool Replace0 = false;
    SDValue N0 = Op.getOperand(0);
    SDValue NN0 = PromoteOperand(N0, PVT, Replace0);

    bool Replace1 = false;
    SDValue N1 = Op.getOperand(1);
    SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
    SDLoc DL(Op);

    SDValue RV =
        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));

    // We are always replacing N0/N1's use in N and only need
    // additional replacements if there are additional uses.
    Replace0 &= !N0->hasOneUse();
    Replace1 &= (N0 != N1) && !N1->hasOneUse();

    // Combine Op here so it is preserved past replacements.
    CombineTo(Op.getNode(), RV);

    // If operands have a use ordering, make sure we deal with
    // predecessor first.
    if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
      std::swap(N0, N1);
      std::swap(NN0, NN1);
    }

    if (Replace0) {
      AddToWorklist(NN0.getNode());
      ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
    }
    if (Replace1) {
      AddToWorklist(NN1.getNode());
      ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
    }
    return Op;
  }
  return SDValue();
}

/// Promote the specified integer shift operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

    bool Replace = false;
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    // For SRA/SRL the promotion helper performs any load replacement itself,
    // so Replace stays false on those paths.
    if (Opc == ISD::SRA)
      N0 = SExtPromoteOperand(N0, PVT);
    else if (Opc == ISD::SRL)
      N0 = ZExtPromoteOperand(N0, PVT);
    else
      N0 = PromoteOperand(N0, PVT, Replace);

    if (!N0.getNode())
      return SDValue();

    SDLoc DL(Op);
    SDValue RV =
        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));

    if (Replace)
      ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());

    // Deal with Op being deleted.
    if (Op && Op.getOpcode() != ISD::DELETED_NODE)
      return RV;
  }
  return SDValue();
}

/// Promote an extension of an undesirably-typed value by re-emitting the
/// extension directly from the inner operand when the target asks for it.
SDValue DAGCombiner::PromoteExtend(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");
    // fold (aext (aext x)) -> (aext x)
    // fold (aext (zext x)) -> (zext x)
    // fold (aext (sext x)) -> (sext x)
    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
    return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
  }
  return SDValue();
}

/// Replace an undesirably-typed unindexed load with an extending load of the
/// promoted type plus a truncate. Returns true if the replacement was made.
bool DAGCombiner::PromoteLoad(SDValue Op) {
  if (!LegalOperations)
    return false;

  if (!ISD::isUNINDEXEDLoad(Op.getNode()))
    return false;

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return false;

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return false;

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    SDLoc DL(Op);
    SDNode *N = Op.getNode();
    LoadSDNode *LD = cast<LoadSDNode>(N);
    EVT MemVT = LD->getMemoryVT();
    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
                                                      : LD->getExtensionType();
    SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
                                   LD->getChain(), LD->getBasePtr(),
                                   MemVT, LD->getMemOperand());
    SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);

    LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
               Result.getNode()->dump(&DAG); dbgs() << '\n');
    WorklistRemover DeadNodes(*this);
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
    deleteAndRecombine(N);
    AddToWorklist(Result.getNode());
    return true;
  }
  return false;
}

/// Recursively delete a node which has no uses and any operands for
/// which it is the only use.
///
/// Note that this both deletes the nodes and removes them from the worklist.
/// It also adds any nodes who have had a user deleted to the worklist as they
/// may now have only one use and subject to other combines.
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
  if (!N->use_empty())
    return false;

  SmallSetVector<SDNode *, 16> Nodes;
  Nodes.insert(N);
  do {
    N = Nodes.pop_back_val();
    if (!N)
      continue;

    if (N->use_empty()) {
      // Dead node: queue its operands (they may have become dead too) and
      // delete it.
      for (const SDValue &ChildN : N->op_values())
        Nodes.insert(ChildN.getNode());

      removeFromWorklist(N);
      DAG.DeleteNode(N);
    } else {
      // Still-live node lost a user; revisit it for further combines.
      AddToWorklist(N);
    }
  } while (!Nodes.empty());
  return true;
}

//===----------------------------------------------------------------------===//
//  Main DAG Combiner implementation
//===----------------------------------------------------------------------===//

void DAGCombiner::Run(CombineLevel AtLevel) {
  // set the instance variables, so that the various visit routines may use it.
  Level = AtLevel;
  LegalDAG = Level >= AfterLegalizeDAG;
  LegalOperations = Level >= AfterLegalizeVectorOps;
  LegalTypes = Level >= AfterLegalizeTypes;

  WorklistInserter AddNodes(*this);

  // Add all the dag nodes to the worklist.
  for (SDNode &Node : DAG.allnodes())
    AddToWorklist(&Node);

  // Create a dummy node (which is not added to allnodes), that adds a reference
  // to the root node, preventing it from being deleted, and tracking any
  // changes of the root.
  HandleSDNode Dummy(DAG.getRoot());

  // While we have a valid worklist entry node, try to combine it.
  while (SDNode *N = getNextWorklistEntry()) {
    // If N has no uses, it is dead.  Make sure to revisit all N's operands once
    // N is deleted from the DAG, since they too may now be dead or may have a
    // reduced number of uses, allowing other xforms.
    if (recursivelyDeleteUnusedNodes(N))
      continue;

    WorklistRemover DeadNodes(*this);

    // If this combine is running after legalizing the DAG, re-legalize any
    // nodes pulled off the worklist.
    if (LegalDAG) {
      SmallSetVector<SDNode *, 16> UpdatedNodes;
      bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);

      for (SDNode *LN : UpdatedNodes)
        AddToWorklistWithUsers(LN);

      if (!NIsValid)
        continue;
    }

    LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));

    // Add any operands of the new node which have not yet been combined to the
    // worklist as well. Because the worklist uniques things already, this
    // won't repeatedly process the same operand.
    CombinedNodes.insert(N);
    for (const SDValue &ChildN : N->op_values())
      if (!CombinedNodes.count(ChildN.getNode()))
        AddToWorklist(ChildN.getNode());

    SDValue RV = combine(N);

    if (!RV.getNode())
      continue;

    ++NodesCombined;

    // If we get back the same node we passed in, rather than a new node or
    // zero, we know that the node must have defined multiple values and
    // CombineTo was used.  Since CombineTo takes care of the worklist
    // mechanics for us, we have no work to do in this case.
    if (RV.getNode() == N)
      continue;

    assert(N->getOpcode() != ISD::DELETED_NODE &&
           RV.getOpcode() != ISD::DELETED_NODE &&
           "Node was deleted but visit returned new node!");

    LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG));

    if (N->getNumValues() == RV.getNode()->getNumValues())
      DAG.ReplaceAllUsesWith(N, RV.getNode());
    else {
      assert(N->getValueType(0) == RV.getValueType() &&
             N->getNumValues() == 1 && "Type mismatch");
      DAG.ReplaceAllUsesWith(N, &RV);
    }

    // Push the new node and any users onto the worklist
    AddToWorklist(RV.getNode());
    AddUsersToWorklist(RV.getNode());

    // Finally, if the node is now dead, remove it from the graph.  The node
    // may not be dead if the replacement process recursively simplified to
    // something else needing this node.  This will also take care of adding any
    // operands which have lost a user to the worklist.
    recursivelyDeleteUnusedNodes(N);
  }

  // If the root changed (e.g. it was a dead load, update the root).
  DAG.setRoot(Dummy.getValue());
  DAG.RemoveDeadNodes();
}

/// Dispatch on N's opcode to the matching per-opcode visit routine. Returns a
/// null SDValue when there is no routine for this opcode.
SDValue DAGCombiner::visit(SDNode *N) {
  switch (N->getOpcode()) {
  default: break;
  case ISD::TokenFactor:        return visitTokenFactor(N);
  case ISD::MERGE_VALUES:       return visitMERGE_VALUES(N);
  case ISD::ADD:                return visitADD(N);
  case ISD::SUB:                return visitSUB(N);
  case ISD::SADDSAT:
  case ISD::UADDSAT:            return visitADDSAT(N);
  case ISD::SSUBSAT:
  case ISD::USUBSAT:            return visitSUBSAT(N);
  case ISD::ADDC:               return visitADDC(N);
  case ISD::SADDO:
  case ISD::UADDO:              return visitADDO(N);
  case ISD::SUBC:               return visitSUBC(N);
  case ISD::SSUBO:
  case ISD::USUBO:              return visitSUBO(N);
  case ISD::ADDE:               return visitADDE(N);
  case ISD::ADDCARRY:           return visitADDCARRY(N);
  case ISD::SUBE:               return visitSUBE(N);
  case ISD::SUBCARRY:           return visitSUBCARRY(N);
  case ISD::SMULFIX:
  case ISD::SMULFIXSAT:
  case ISD::UMULFIX:
  case ISD::UMULFIXSAT:         return visitMULFIX(N);
  case ISD::MUL:                return visitMUL(N);
  case ISD::SDIV:               return visitSDIV(N);
  case ISD::UDIV:               return visitUDIV(N);
  case ISD::SREM:
  case ISD::UREM:               return visitREM(N);
  case ISD::MULHU:              return visitMULHU(N);
  case ISD::MULHS:              return visitMULHS(N);
  case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
  case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
  case ISD::SMULO:
  case ISD::UMULO:              return visitMULO(N);
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:               return visitIMINMAX(N);
  case ISD::AND:                return visitAND(N);
  case ISD::OR:                 return visitOR(N);
  case ISD::XOR:                return visitXOR(N);
  case ISD::SHL:                return visitSHL(N);
  case ISD::SRA:                return visitSRA(N);
  case ISD::SRL:                return visitSRL(N);
  case ISD::ROTR:
  case ISD::ROTL:               return visitRotate(N);
  case ISD::FSHL:
  case ISD::FSHR:               return visitFunnelShift(N);
  case ISD::ABS:                return visitABS(N);
  case ISD::BSWAP:              return visitBSWAP(N);
  case ISD::BITREVERSE:         return visitBITREVERSE(N);
  case ISD::CTLZ:               return visitCTLZ(N);
  case ISD::CTLZ_ZERO_UNDEF:    return visitCTLZ_ZERO_UNDEF(N);
  case ISD::CTTZ:               return visitCTTZ(N);
  case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
  case ISD::CTPOP:              return visitCTPOP(N);
  case ISD::SELECT:             return visitSELECT(N);
  case ISD::VSELECT:            return visitVSELECT(N);
  case ISD::SELECT_CC:          return visitSELECT_CC(N);
  case ISD::SETCC:              return visitSETCC(N);
  case ISD::SETCCCARRY:         return visitSETCCCARRY(N);
  case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);
  case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);
  case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
  case ISD::AssertSext:
  case ISD::AssertZext:         return visitAssertExt(N);
  case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
  case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
  case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
  case ISD::TRUNCATE:           return visitTRUNCATE(N);
  case ISD::BITCAST:            return visitBITCAST(N);
  case ISD::BUILD_PAIR:         return visitBUILD_PAIR(N);
  case ISD::FADD:               return visitFADD(N);
  case ISD::FSUB:               return visitFSUB(N);
  case ISD::FMUL:               return visitFMUL(N);
  case ISD::FMA:                return visitFMA(N);
  case ISD::FDIV:               return visitFDIV(N);
  case ISD::FREM:               return visitFREM(N);
  case ISD::FSQRT:              return visitFSQRT(N);
  case ISD::FCOPYSIGN:          return visitFCOPYSIGN(N);
  case ISD::FPOW:               return visitFPOW(N);
  case ISD::SINT_TO_FP:         return visitSINT_TO_FP(N);
  case ISD::UINT_TO_FP:         return visitUINT_TO_FP(N);
  case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
  case ISD::FP_TO_UINT:         return visitFP_TO_UINT(N);
  case ISD::FP_ROUND:           return visitFP_ROUND(N);
  case ISD::FP_EXTEND:          return visitFP_EXTEND(N);
  case ISD::FNEG:               return visitFNEG(N);
  case ISD::FABS:               return visitFABS(N);
  case ISD::FFLOOR:             return visitFFLOOR(N);
  case ISD::FMINNUM:            return visitFMINNUM(N);
  case ISD::FMAXNUM:            return visitFMAXNUM(N);
  case ISD::FMINIMUM:           return visitFMINIMUM(N);
  case ISD::FMAXIMUM:           return visitFMAXIMUM(N);
  case ISD::FCEIL:              return visitFCEIL(N);
  case ISD::FTRUNC:             return visitFTRUNC(N);
  case ISD::BRCOND:             return visitBRCOND(N);
  case ISD::BR_CC:              return visitBR_CC(N);
  case ISD::LOAD:               return visitLOAD(N);
  case ISD::STORE:              return visitSTORE(N);
  case ISD::INSERT_VECTOR_ELT:  return visitINSERT_VECTOR_ELT(N);
  case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
  case ISD::BUILD_VECTOR:       return visitBUILD_VECTOR(N);
  case ISD::CONCAT_VECTORS:     return visitCONCAT_VECTORS(N);
  case ISD::EXTRACT_SUBVECTOR:  return visitEXTRACT_SUBVECTOR(N);
  case ISD::VECTOR_SHUFFLE:     return visitVECTOR_SHUFFLE(N);
  case ISD::SCALAR_TO_VECTOR:   return visitSCALAR_TO_VECTOR(N);
  case ISD::INSERT_SUBVECTOR:   return visitINSERT_SUBVECTOR(N);
  case ISD::MGATHER:            return visitMGATHER(N);
  case ISD::MLOAD:              return visitMLOAD(N);
  case ISD::MSCATTER:           return visitMSCATTER(N);
  case ISD::MSTORE:             return visitMSTORE(N);
  case ISD::LIFETIME_END:       return visitLIFETIME_END(N);
  case ISD::FP_TO_FP16:         return visitFP_TO_FP16(N);
  case ISD::FP16_TO_FP:         return visitFP16_TO_FP(N);
  case ISD::VECREDUCE_FADD:
  case ISD::VECREDUCE_FMUL:
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_MUL:
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:     return visitVECREDUCE(N);
  }
  return SDValue();
}

/// Try the per-opcode visitors, then target-specific combines, then integer
/// promotion, then commuted-operand CSE. Returns the replacement value, or a
/// null SDValue if nothing changed.
SDValue DAGCombiner::combine(SDNode *N) {
  SDValue RV = visit(N);

  // If nothing happened, try a target-specific DAG combine.
  if (!RV.getNode()) {
    assert(N->getOpcode() != ISD::DELETED_NODE &&
           "Node was deleted but visit returned NULL!");

    if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
        TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {

      // Expose the DAG combiner to the target combiner impls.
      TargetLowering::DAGCombinerInfo
        DagCombineInfo(DAG, Level, false, this);

      RV = TLI.PerformDAGCombine(N, DagCombineInfo);
    }
  }

  // If nothing happened still, try promoting the operation.
  if (!RV.getNode()) {
    switch (N->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
      RV = PromoteIntBinOp(SDValue(N, 0));
      break;
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      RV = PromoteIntShiftOp(SDValue(N, 0));
      break;
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      RV = PromoteExtend(SDValue(N, 0));
      break;
    case ISD::LOAD:
      if (PromoteLoad(SDValue(N, 0)))
        RV = SDValue(N, 0);
      break;
    }
  }

  // If N is a commutative binary node, try to eliminate it if the commuted
  // version is already present in the DAG.
  if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
      N->getNumValues() == 1) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);

    // Constant operands are canonicalized to RHS.
    if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
      SDValue Ops[] = {N1, N0};
      SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
                                            N->getFlags());
      if (CSENode)
        return SDValue(CSENode, 0);
    }
  }

  return RV;
}

/// Given a node, return its input chain if it has one, otherwise return a null
/// sd operand.
static SDValue getInputChainForNode(SDNode *N) {
  if (unsigned NumOps = N->getNumOperands()) {
    // Chains are conventionally the first or last operand; check those two
    // positions before scanning the middle operands.
    if (N->getOperand(0).getValueType() == MVT::Other)
      return N->getOperand(0);
    if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
      return N->getOperand(NumOps-1);
    for (unsigned i = 1; i < NumOps-1; ++i)
      if (N->getOperand(i).getValueType() == MVT::Other)
        return N->getOperand(i);
  }
  return SDValue();
}

/// Simplify a TokenFactor: drop redundant operands, merge nested token
/// factors, and prune operands whose chains are already covered by another
/// operand. Returns a replacement value or null if nothing changed.
SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
  // If N has two operands, where one has an input chain equal to the other,
  // the 'other' chain is redundant.
  if (N->getNumOperands() == 2) {
    if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
      return N->getOperand(0);
    if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
      return N->getOperand(1);
  }

  // Don't simplify token factors if optnone.
  if (OptLevel == CodeGenOpt::None)
    return SDValue();

  // If the sole user is a token factor, we should make sure we have a
  // chance to merge them together. This prevents TF chains from inhibiting
  // optimizations.
  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
    AddToWorklist(*(N->use_begin()));

  SmallVector<SDNode *, 8> TFs;   // List of token factors to visit.
  SmallVector<SDValue, 8> Ops;    // Ops for replacing token factor.
  SmallPtrSet<SDNode*, 16> SeenOps;
  bool Changed = false;           // If we should replace this token factor.

  // Start out with this token factor.
  TFs.push_back(N);

  // Iterate through token factors.  The TFs grows when new token factors are
  // encountered.
  for (unsigned i = 0; i < TFs.size(); ++i) {
    // Limit number of nodes to inline, to avoid quadratic compile times.
    // We have to add the outstanding Token Factors to Ops, otherwise we might
    // drop Ops from the resulting Token Factors.
    if (Ops.size() > TokenFactorInlineLimit) {
      for (unsigned j = i; j < TFs.size(); j++)
        Ops.emplace_back(TFs[j], 0);
      // Drop unprocessed Token Factors from TFs, so we do not add them to the
      // combiner worklist later.
      TFs.resize(i);
      break;
    }

    SDNode *TF = TFs[i];
    // Check each of the operands.
    for (const SDValue &Op : TF->op_values()) {
      switch (Op.getOpcode()) {
      case ISD::EntryToken:
        // Entry tokens don't need to be added to the list. They are
        // redundant.
        Changed = true;
        break;

      case ISD::TokenFactor:
        if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
          // Queue up for processing.
          TFs.push_back(Op.getNode());
          Changed = true;
          break;
        }
        LLVM_FALLTHROUGH;

      default:
        // Only add if it isn't already in the list.
        if (SeenOps.insert(Op.getNode()).second)
          Ops.push_back(Op);
        else
          Changed = true;
        break;
      }
    }
  }

  // Re-visit inlined Token Factors, to clean them up in case they have been
  // removed. Skip the first Token Factor, as this is the current node.
  for (unsigned i = 1, e = TFs.size(); i < e; i++)
    AddToWorklist(TFs[i]);

  // Remove Nodes that are chained to another node in the list. Do so
  // by walking up chains breadth-first stopping when we've seen
  // another operand. In general we must climb to the EntryNode, but we can exit
  // early if we find all remaining work is associated with just one operand as
  // no further pruning is possible.

  // List of nodes to search through and original Ops from which they originate.
  SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
  SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
  SmallPtrSet<SDNode *, 16> SeenChains;
  bool DidPruneOps = false;

  unsigned NumLeftToConsider = 0;
  for (const SDValue &Op : Ops) {
    Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
    OpWorkCount.push_back(1);
  }

  auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
    // If this is an Op, we can remove the op from the list. Re-mark any
    // search associated with it as from the current OpNumber.
    if (SeenOps.count(Op) != 0) {
      Changed = true;
      DidPruneOps = true;
      unsigned OrigOpNumber = 0;
      while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
        OrigOpNumber++;
      assert((OrigOpNumber != Ops.size()) &&
             "expected to find TokenFactor Operand");
      // Re-mark worklist from OrigOpNumber to OpNumber
      for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
        if (Worklist[i].second == OrigOpNumber) {
          Worklist[i].second = OpNumber;
        }
      }
      OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
      OpWorkCount[OrigOpNumber] = 0;
      NumLeftToConsider--;
    }
    // Add if it's a new chain
    if (SeenChains.insert(Op).second) {
      OpWorkCount[OpNumber]++;
      Worklist.push_back(std::make_pair(Op, OpNumber));
    }
  };

  // Bound the search at 1024 worklist entries to keep compile time in check.
  for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
    // We need to consider at least 2 Ops for pruning to be possible.
    if (NumLeftToConsider <= 1)
      break;
    auto CurNode = Worklist[i].first;
    auto CurOpNumber = Worklist[i].second;
    assert((OpWorkCount[CurOpNumber] > 0) &&
           "Node should not appear in worklist");
    switch (CurNode->getOpcode()) {
    case ISD::EntryToken:
      // Hitting EntryToken is the only way for the search to terminate without
      // hitting another operand's search. Prevent us from marking this operand
      // considered.
      NumLeftToConsider++;
      break;
    case ISD::TokenFactor:
      for (const SDValue &Op : CurNode->op_values())
        AddToWorklist(i, Op.getNode(), CurOpNumber);
      break;
    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END:
    case ISD::CopyFromReg:
    case ISD::CopyToReg:
      // These have a single chain operand at position 0; climb through it.
      AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
      break;
    default:
      // Memory nodes expose their chain explicitly.
      if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
        AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
      break;
    }
    OpWorkCount[CurOpNumber]--;
    if (OpWorkCount[CurOpNumber] == 0)
      NumLeftToConsider--;
  }

  // If we've changed things around then replace token factor.
  if (Changed) {
    SDValue Result;
    if (Ops.empty()) {
      // The entry token is the only possible outcome.
      Result = DAG.getEntryNode();
    } else {
      if (DidPruneOps) {
        SmallVector<SDValue, 8> PrunedOps;
        // Keep only the ops whose chains were not reached from another op.
        for (const SDValue &Op : Ops) {
          if (SeenChains.count(Op.getNode()) == 0)
            PrunedOps.push_back(Op);
        }
        Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
      } else {
        Result = DAG.getTokenFactor(SDLoc(N), Ops);
      }
    }
    return Result;
  }
  return SDValue();
}

/// MERGE_VALUES can always be eliminated.
SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
  WorklistRemover DeadNodes(*this);
  // Replacing results may cause a different MERGE_VALUES to suddenly
  // be CSE'd with N, and carry its uses with it. Iterate until no
  // uses remain, to ensure that the node can be safely deleted.
  // First add the users of this node to the work list so that they
  // can be tried again once they have new operands.
  AddUsersToWorklist(N);
  do {
    // Do as a single replacement to avoid rewalking use lists.
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
      Ops.push_back(N->getOperand(i));
    DAG.ReplaceAllUsesWith(N, Ops.data());
  } while (!N->use_empty());
  deleteAndRecombine(N);
  return SDValue(N, 0);   // Return N so it doesn't get rechecked!
}

/// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a
/// ConstantSDNode pointer else nullptr.
static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
  return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
}

/// Fold "binop (select Cond, CT, CF), CBO" into a select of folded constants,
/// eliminating the binary operator when the select has no other uses.
SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
  assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
         "Unexpected binary operator");

  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  // TODO: Handle ISD::SELECT_CC.
  unsigned SelOpNo = 0;
  SDValue Sel = BO->getOperand(0);
  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
    SelOpNo = 1;
    Sel = BO->getOperand(1);
  }

  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
    return SDValue();

  // Both select arms must be (int or FP) constants.
  SDValue CT = Sel.getOperand(1);
  if (!isConstantOrConstantVector(CT, true) &&
      !isConstantFPBuildVectorOrConstantFP(CT))
    return SDValue();

  SDValue CF = Sel.getOperand(2);
  if (!isConstantOrConstantVector(CF, true) &&
      !isConstantFPBuildVectorOrConstantFP(CF))
    return SDValue();

  // Bail out if any constants are opaque because we can't constant fold those.
  // The exception is "and" and "or" with either 0 or -1 in which case we can
  // propagate non constant operands into select. I.e.:
  // and (select Cond, 0, -1), X --> select Cond, 0, X
  // or X, (select Cond, -1, 0) --> select Cond, -1, X
  auto BinOpcode = BO->getOpcode();
  bool CanFoldNonConst =
      (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
      (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
      (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));

  // CBO is the non-select operand of the binop (SelOpNo ^ 1 flips the index).
  SDValue CBO = BO->getOperand(SelOpNo ^ 1);
  if (!CanFoldNonConst &&
      !isConstantOrConstantVector(CBO, true) &&
      !isConstantFPBuildVectorOrConstantFP(CBO))
    return SDValue();

  EVT VT = Sel.getValueType();

  // In case of shift value and shift amount may have different VT. For instance
  // on x86 shift amount is i8 regardless of LHS type. Bail out if we have
  // swapped operands and value types do not match. NB: x86 is fine if operands
  // are not swapped with shift amount VT being not bigger than shifted value.
  // TODO: that is possible to check for a shift operation, correct VTs and
  // still perform optimization on x86 if needed.
  if (SelOpNo && VT != CBO.getValueType())
    return SDValue();

  // We have a select-of-constants followed by a binary operator with a
  // constant. Eliminate the binop by pulling the constant math into the select.
  // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
  SDLoc DL(Sel);
  SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
                          : DAG.getNode(BinOpcode, DL, VT, CT, CBO);
  if (!CanFoldNonConst && !NewCT.isUndef() &&
      !isConstantOrConstantVector(NewCT, true) &&
      !isConstantFPBuildVectorOrConstantFP(NewCT))
    return SDValue();

  SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
                          : DAG.getNode(BinOpcode, DL, VT, CF, CBO);
  if (!CanFoldNonConst && !NewCF.isUndef() &&
      !isConstantOrConstantVector(NewCF, true) &&
      !isConstantFPBuildVectorOrConstantFP(NewCF))
    return SDValue();

  SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
  // Preserve the original binop's fast-math/nowrap flags on the new select.
  SelectOp->setFlags(BO->getFlags());
  return SelectOp;
}

/// Fold add/sub of a constant with a zero-extended inverted low bit, e.g.
/// "add (zext (seteq (and X, 1), 0)), C", into sub/add with an adjusted
/// constant (see the rewrite comment below).
static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Expecting add or sub");

  // Match a constant operand and a zext operand for the math instruction:
  // add Z, C
  // sub C, Z
  bool IsAdd = N->getOpcode() == ISD::ADD;
  SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
  SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
  auto *CN = dyn_cast<ConstantSDNode>(C);
  if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // Match the zext operand as a setcc of a boolean.
  if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
      Z.getOperand(0).getValueType() != MVT::i1)
    return SDValue();

  // Match the compare as: setcc (X & 1), 0, eq.
  SDValue SetCC = Z.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
      SetCC.getOperand(0).getOpcode() != ISD::AND ||
      !isOneConstant(SetCC.getOperand(0).getOperand(1)))
    return SDValue();

  // We are adding/subtracting a constant and an inverted low bit. Turn that
  // into a subtract/add of the low bit with incremented/decremented constant:
  // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
  // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
  EVT VT = C.getValueType();
  SDLoc DL(N);
  SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
  SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
                       DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
  return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
}

/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
/// a shift and add with a different constant.
static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Expecting add or sub");

  // We need a constant operand for the add/sub, and the other operand is a
  // logical shift right: add (srl), C or sub C, (srl).
  // TODO - support non-uniform vector amounts.
  bool IsAdd = N->getOpcode() == ISD::ADD;
  SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
  SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
  ConstantSDNode *C = isConstOrConstSplat(ConstantOp);
  if (!C || ShiftOp.getOpcode() != ISD::SRL)
    return SDValue();

  // The shift must be of a 'not' value.
  SDValue Not = ShiftOp.getOperand(0);
  if (!Not.hasOneUse() || !isBitwiseNot(Not))
    return SDValue();

  // The shift must be moving the sign bit to the least-significant-bit.
  EVT VT = ShiftOp.getValueType();
  SDValue ShAmt = ShiftOp.getOperand(1);
  ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
  if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
    return SDValue();

  // Eliminate the 'not' by adjusting the shift and add/sub constant:
  // add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
  // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
  SDLoc DL(N);
  auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;
  SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt);
  APInt NewC = IsAdd ? C->getAPIntValue() + 1 : C->getAPIntValue() - 1;
  return DAG.getNode(ISD::ADD, DL, VT, NewShift, DAG.getConstant(NewC, DL, VT));
}

/// Try to fold a node that behaves like an ADD (note that N isn't necessarily
/// an ISD::ADD here, it could for example be an ISD::OR if we know that there
/// are no common bits set in the operands).
SDValue DAGCombiner::visitADDLike(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (add x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
    if (ISD::isBuildVectorAllZeros(N0.getNode()))
      return N1;
  }

  // fold (add x, undef) -> undef
  if (N0.isUndef())
    return N0;

  if (N1.isUndef())
    return N1;

  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
    // canonicalize constant to RHS
    if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
      return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
    // fold (add c1, c2) -> c1+c2
    return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1});
  }

  // fold (add x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
    // fold ((A-c1)+c2) -> (A+(c2-c1))
    if (N0.getOpcode() == ISD::SUB &&
        isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
      SDValue Sub =
          DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)});
      assert(Sub && "Constant folding failed");
      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
    }

    // fold ((c1-A)+c2) -> (c1+c2)-A
    if (N0.getOpcode() == ISD::SUB &&
        isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
      SDValue Add =
          DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)});
      assert(Add && "Constant folding failed");
      return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
    }

    // add (sext i1 X), 1 -> zext (not i1 X)
    // We don't transform this pattern:
    //   add (zext i1 X), -1 -> sext (not i1 X)
    // because most (?) targets generate better code for the zext form.
    if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
        isOneOrOneSplat(N1)) {
      SDValue X = N0.getOperand(0);
      if ((!LegalOperations ||
           (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
            TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
          X.getScalarValueSizeInBits() == 1) {
        SDValue Not = DAG.getNOT(DL, X, X.getValueType());
        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
      }
    }

    // Undo the add -> or combine to merge constant offsets from a frame index.
    if (N0.getOpcode() == ISD::OR &&
        isa<FrameIndexSDNode>(N0.getOperand(0)) &&
        isa<ConstantSDNode>(N0.getOperand(1)) &&
        DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
      // (fi | c1) + c2 --> fi + (c1 + c2), valid because the 'or' acted as an
      // add (no common bits between fi and c1).
      SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
    }
  }

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // reassociate add
  if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) {
    if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
      return RADD;
  }
  // fold ((0-A) + B) -> B-A
  if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
    return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));

  // fold (A + (0-B)) -> A-B
  if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
    return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));

  // fold (A+(B-A)) -> B
  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
    return N1.getOperand(0);

  // fold ((B-A)+A) -> B
  if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
    return N0.getOperand(0);

  // fold ((A-B)+(C-A)) -> (C-B)
  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
      N0.getOperand(0) == N1.getOperand(1))
    return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
                       N0.getOperand(1));

  // fold ((A-B)+(B-C)) -> (A-C)
  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
      N0.getOperand(1) == N1.getOperand(0))
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
                       N1.getOperand(1));

  // fold (A+(B-(A+C))) to (B-C)
  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
      N0 == N1.getOperand(1).getOperand(0))
    return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
                       N1.getOperand(1).getOperand(1));

  // fold (A+(B-(C+A))) to (B-C)
  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
      N0 == N1.getOperand(1).getOperand(1))
    return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
                       N1.getOperand(1).getOperand(0));

  // fold (A+((B-A)+or-C)) to (B+or-C)
  if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
      N1.getOperand(0).getOpcode() == ISD::SUB &&
      N0 == N1.getOperand(0).getOperand(1))
    return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
                       N1.getOperand(1));

  // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
    SDValue N00 = N0.getOperand(0);
    SDValue N01 = N0.getOperand(1);
    SDValue N10 = N1.getOperand(0);
    SDValue N11 = N1.getOperand(1);

    if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
      return DAG.getNode(ISD::SUB, DL, VT,
                         DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
                         DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
  }

  // fold (add (umax X, C), -C) --> (usubsat X, C)
  if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) {
    // Match element-wise: the add constant must be the negation of the umax
    // constant (undef lanes allowed in both).
    auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
      return (!Max && !Op) ||
             (Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue()));
    };
    if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT,
                                  /*AllowUndefs*/ true))
      return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0),
                         N0.getOperand(1));
  }

  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  if (isOneOrOneSplat(N1)) {
    // fold (add (xor a, -1), 1) -> (sub 0, a)
    if (isBitwiseNot(N0))
      return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                         N0.getOperand(0));

    // fold (add (add (xor a, -1), b), 1) -> (sub b, a)
    if (N0.getOpcode() == ISD::ADD ||
        N0.getOpcode() == ISD::UADDO ||
        N0.getOpcode() == ISD::SADDO) {
      SDValue A, Xor;

      if (isBitwiseNot(N0.getOperand(0))) {
        A = N0.getOperand(1);
        Xor = N0.getOperand(0);
      } else if (isBitwiseNot(N0.getOperand(1))) {
        A = N0.getOperand(0);
        Xor = N0.getOperand(1);
      }

      if (Xor)
        return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0));
    }

    // Look for:
    //   add (add x, y), 1
    // And if the target does not like this form then turn into:
    //   sub y, (xor x, -1)
    if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
        N0.getOpcode() == ISD::ADD) {
      SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
                                DAG.getAllOnesConstant(DL, VT));
      return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
    }
  }

  // (x - y) + -1  ->  add (xor y, -1), x
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isAllOnesOrAllOnesSplat(N1)) {
    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
    return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
  }

  // Try the commutative folds with the operands in both orders.
  if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
    return Combined;

  if (SDValue Combined = visitADDLikeCommutative(N1, N0, N))
    return Combined;

  return SDValue();
}

/// Combine an ISD::ADD: run the generic add-like folds, then the
/// masked-bool and sign-bit specials, then try turning it into an OR.
SDValue DAGCombiner::visitADD(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue Combined = visitADDLike(N))
    return Combined;

  if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
    return V;

  if (SDValue V = foldAddSubOfSignBit(N, DAG))
    return V;

  // fold (a+b) -> (a|b) iff a and b share no bits.
  if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
      DAG.haveNoCommonBitsSet(N0, N1))
    return DAG.getNode(ISD::OR, DL, VT, N0, N1);

  return SDValue();
}

/// Combine saturating adds (ISD::SADDSAT / ISD::UADDSAT).
SDValue DAGCombiner::visitADDSAT(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold vector ops
  if (VT.isVector()) {
    // TODO SimplifyVBinOp

    // fold (add_sat x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
    if (ISD::isBuildVectorAllZeros(N0.getNode()))
      return N1;
  }

  // fold (add_sat x, undef) -> -1
  if (N0.isUndef() || N1.isUndef())
    return DAG.getAllOnesConstant(DL, VT);

  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
    // canonicalize constant to RHS
    if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
      return DAG.getNode(Opcode, DL, VT, N1, N0);
    // fold (add_sat c1, c2) -> c3
    return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1});
  }

  // fold (add_sat x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  // If it cannot overflow, transform into an add.
  if (Opcode == ISD::UADDSAT)
    if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
      return DAG.getNode(ISD::ADD, DL, VT, N0, N1);

  return SDValue();
}

/// If \p V (possibly wrapped in TRUNCATE/ZERO_EXTEND/AND-with-1) is the carry
/// result of a legal carry-producing node, return that carry value; otherwise
/// return a null SDValue.
static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
  bool Masked = false;

  // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
  while (true) {
    if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
      V = V.getOperand(0);
      continue;
    }

    if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
      Masked = true;
      V = V.getOperand(0);
      continue;
    }

    break;
  }

  // If this is not a carry, return. (Carry is result #1 of these nodes.)
  if (V.getResNo() != 1)
    return SDValue();

  if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
      V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
    return SDValue();

  EVT VT = V.getNode()->getValueType(0);
  if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT))
    return SDValue();

  // If the result is masked, then no matter what kind of bool it is we can
  // return. If it isn't, then we need to make sure the bool type is either 0 or
  // 1 and not other values.
  if (Masked ||
      TLI.getBooleanContents(V.getValueType()) ==
          TargetLoweringBase::ZeroOrOneBooleanContent)
    return V;

  return SDValue();
}

/// Given the operands of an add/sub operation, see if the 2nd operand is a
/// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert
/// the opcode and bypass the mask operation.
static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
                                 SelectionDAG &DAG, const SDLoc &DL) {
  if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1)))
    return SDValue();

  // All sign bits set means the masked source is known 0 or -1.
  EVT VT = N0.getValueType();
  if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits())
    return SDValue();

  // add N0, (and (AssertSext X, i1), 1) --> sub N0, X
  // sub N0, (and (AssertSext X, i1), 1) --> add N0, X
  return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0));
}

/// Helper for doing combines based on N0 and N1 being added to each other.
SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
                                             SDNode *LocReference) {
  EVT VT = N0.getValueType();
  SDLoc DL(LocReference);

  // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
  if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
      isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
    return DAG.getNode(ISD::SUB, DL, VT, N0,
                       DAG.getNode(ISD::SHL, DL, VT,
                                   N1.getOperand(0).getOperand(1),
                                   N1.getOperand(1)));

  if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
    return V;

  // Look for:
  //   add (add x, 1), y
  // And if the target does not like this form then turn into:
  //   sub y, (xor x, -1)
  if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
      N0.getOpcode() == ISD::ADD && isOneOrOneSplat(N0.getOperand(1))) {
    SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
                              DAG.getAllOnesConstant(DL, VT));
    return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
  }

  // Hoist one-use subtraction by non-opaque constant:
  //   (x - C) + y  ->  (x + y) - C
  // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
  }
  // Hoist one-use subtraction from non-opaque constant:
  //   (C - x) + y  ->  (y - x) + C
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
    return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
  }

  // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
  // rather than 'add 0/-1' (the zext should get folded).
  // add (sext i1 Y), X --> sub X, (zext i1 Y)
  if (N0.getOpcode() == ISD::SIGN_EXTEND &&
      N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
      TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
  }

  // add X, (sextinreg Y i1) -> sub X, (and Y 1)
  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
    if (TN->getVT() == MVT::i1) {
      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
                                 DAG.getConstant(1, DL, VT));
      return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
    }
  }

  // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
      N1.getResNo() == 0)
    return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
                       N0, N1.getOperand(0), N1.getOperand(2));

  // (add X, Carry) -> (addcarry X, 0, Carry)
  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
    if (SDValue Carry = getAsCarry(TLI, N1))
      return DAG.getNode(ISD::ADDCARRY, DL,
                         DAG.getVTList(VT, Carry.getValueType()), N0,
                         DAG.getConstant(0, DL, VT), Carry);

  return SDValue();
}

/// Combine ISD::ADDC (add producing a glue carry result).
SDValue DAGCombiner::visitADDC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // If the flag result is dead, turn this into an ADD.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // canonicalize constant to RHS.
  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N0C && !N1C)
    return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);

  // fold (addc x, 0) -> x + no carry out
  if (isNullConstant(N1))
    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
                                        DL, MVT::Glue));

  // If it cannot overflow, transform into an add.
  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  return SDValue();
}

/// Unconditionally invert boolean \p V by XOR'ing it with the "true" value
/// dictated by the target's boolean-contents for its type.
static SDValue flipBoolean(SDValue V, const SDLoc &DL,
                           SelectionDAG &DAG, const TargetLowering &TLI) {
  EVT VT = V.getValueType();

  SDValue Cst;
  switch (TLI.getBooleanContents(VT)) {
  case TargetLowering::ZeroOrOneBooleanContent:
  case TargetLowering::UndefinedBooleanContent:
    Cst = DAG.getConstant(1, DL, VT);
    break;
  case TargetLowering::ZeroOrNegativeOneBooleanContent:
    Cst = DAG.getAllOnesConstant(DL, VT);
    break;
  }

  return DAG.getNode(ISD::XOR, DL, VT, V, Cst);
}

/**
 * Flips a boolean if it is cheaper to compute. If the Force parameter is set,
 * then the flip also occurs if computing the inverse is the same cost.
2565 * This function returns an empty SDValue in case it cannot flip the boolean 2566 * without increasing the cost of the computation. If you want to flip a boolean 2567 * no matter what, use flipBoolean. 2568 */ 2569 static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG, 2570 const TargetLowering &TLI, 2571 bool Force) { 2572 if (Force && isa<ConstantSDNode>(V)) 2573 return flipBoolean(V, SDLoc(V), DAG, TLI); 2574 2575 if (V.getOpcode() != ISD::XOR) 2576 return SDValue(); 2577 2578 ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false); 2579 if (!Const) 2580 return SDValue(); 2581 2582 EVT VT = V.getValueType(); 2583 2584 bool IsFlip = false; 2585 switch(TLI.getBooleanContents(VT)) { 2586 case TargetLowering::ZeroOrOneBooleanContent: 2587 IsFlip = Const->isOne(); 2588 break; 2589 case TargetLowering::ZeroOrNegativeOneBooleanContent: 2590 IsFlip = Const->isAllOnesValue(); 2591 break; 2592 case TargetLowering::UndefinedBooleanContent: 2593 IsFlip = (Const->getAPIntValue() & 0x01) == 1; 2594 break; 2595 } 2596 2597 if (IsFlip) 2598 return V.getOperand(0); 2599 if (Force) 2600 return flipBoolean(V, SDLoc(V), DAG, TLI); 2601 return SDValue(); 2602 } 2603 2604 SDValue DAGCombiner::visitADDO(SDNode *N) { 2605 SDValue N0 = N->getOperand(0); 2606 SDValue N1 = N->getOperand(1); 2607 EVT VT = N0.getValueType(); 2608 bool IsSigned = (ISD::SADDO == N->getOpcode()); 2609 2610 EVT CarryVT = N->getValueType(1); 2611 SDLoc DL(N); 2612 2613 // If the flag result is dead, turn this into an ADD. 2614 if (!N->hasAnyUseOfValue(1)) 2615 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), 2616 DAG.getUNDEF(CarryVT)); 2617 2618 // canonicalize constant to RHS. 
2619 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 2620 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 2621 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0); 2622 2623 // fold (addo x, 0) -> x + no carry out 2624 if (isNullOrNullSplat(N1)) 2625 return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); 2626 2627 if (!IsSigned) { 2628 // If it cannot overflow, transform into an add. 2629 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) 2630 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), 2631 DAG.getConstant(0, DL, CarryVT)); 2632 2633 // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry. 2634 if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) { 2635 SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(), 2636 DAG.getConstant(0, DL, VT), N0.getOperand(0)); 2637 return CombineTo(N, Sub, 2638 flipBoolean(Sub.getValue(1), DL, DAG, TLI)); 2639 } 2640 2641 if (SDValue Combined = visitUADDOLike(N0, N1, N)) 2642 return Combined; 2643 2644 if (SDValue Combined = visitUADDOLike(N1, N0, N)) 2645 return Combined; 2646 } 2647 2648 return SDValue(); 2649 } 2650 2651 SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { 2652 EVT VT = N0.getValueType(); 2653 if (VT.isVector()) 2654 return SDValue(); 2655 2656 // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) 2657 // If Y + 1 cannot overflow. 
2658 if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { 2659 SDValue Y = N1.getOperand(0); 2660 SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType()); 2661 if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never) 2662 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y, 2663 N1.getOperand(2)); 2664 } 2665 2666 // (uaddo X, Carry) -> (addcarry X, 0, Carry) 2667 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) 2668 if (SDValue Carry = getAsCarry(TLI, N1)) 2669 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, 2670 DAG.getConstant(0, SDLoc(N), VT), Carry); 2671 2672 return SDValue(); 2673 } 2674 2675 SDValue DAGCombiner::visitADDE(SDNode *N) { 2676 SDValue N0 = N->getOperand(0); 2677 SDValue N1 = N->getOperand(1); 2678 SDValue CarryIn = N->getOperand(2); 2679 2680 // canonicalize constant to RHS 2681 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); 2682 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 2683 if (N0C && !N1C) 2684 return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(), 2685 N1, N0, CarryIn); 2686 2687 // fold (adde x, y, false) -> (addc x, y) 2688 if (CarryIn.getOpcode() == ISD::CARRY_FALSE) 2689 return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1); 2690 2691 return SDValue(); 2692 } 2693 2694 SDValue DAGCombiner::visitADDCARRY(SDNode *N) { 2695 SDValue N0 = N->getOperand(0); 2696 SDValue N1 = N->getOperand(1); 2697 SDValue CarryIn = N->getOperand(2); 2698 SDLoc DL(N); 2699 2700 // canonicalize constant to RHS 2701 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); 2702 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 2703 if (N0C && !N1C) 2704 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); 2705 2706 // fold (addcarry x, y, false) -> (uaddo x, y) 2707 if (isNullConstant(CarryIn)) { 2708 if (!LegalOperations || 2709 TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0))) 2710 return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); 
2711 } 2712 2713 // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry. 2714 if (isNullConstant(N0) && isNullConstant(N1)) { 2715 EVT VT = N0.getValueType(); 2716 EVT CarryVT = CarryIn.getValueType(); 2717 SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT); 2718 AddToWorklist(CarryExt.getNode()); 2719 return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt, 2720 DAG.getConstant(1, DL, VT)), 2721 DAG.getConstant(0, DL, CarryVT)); 2722 } 2723 2724 if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N)) 2725 return Combined; 2726 2727 if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N)) 2728 return Combined; 2729 2730 return SDValue(); 2731 } 2732 2733 /** 2734 * If we are facing some sort of diamond carry propapagtion pattern try to 2735 * break it up to generate something like: 2736 * (addcarry X, 0, (addcarry A, B, Z):Carry) 2737 * 2738 * The end result is usually an increase in operation required, but because the 2739 * carry is now linearized, other tranforms can kick in and optimize the DAG. 2740 * 2741 * Patterns typically look something like 2742 * (uaddo A, B) 2743 * / \ 2744 * Carry Sum 2745 * | \ 2746 * | (addcarry *, 0, Z) 2747 * | / 2748 * \ Carry 2749 * | / 2750 * (addcarry X, *, *) 2751 * 2752 * But numerous variation exist. Our goal is to identify A, B, X and Z and 2753 * produce a combine with a single path for carry propagation. 2754 */ 2755 static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, 2756 SDValue X, SDValue Carry0, SDValue Carry1, 2757 SDNode *N) { 2758 if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1) 2759 return SDValue(); 2760 if (Carry1.getOpcode() != ISD::UADDO) 2761 return SDValue(); 2762 2763 SDValue Z; 2764 2765 /** 2766 * First look for a suitable Z. 
It will present itself in the form of 2767 * (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true 2768 */ 2769 if (Carry0.getOpcode() == ISD::ADDCARRY && 2770 isNullConstant(Carry0.getOperand(1))) { 2771 Z = Carry0.getOperand(2); 2772 } else if (Carry0.getOpcode() == ISD::UADDO && 2773 isOneConstant(Carry0.getOperand(1))) { 2774 EVT VT = Combiner.getSetCCResultType(Carry0.getValueType()); 2775 Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT); 2776 } else { 2777 // We couldn't find a suitable Z. 2778 return SDValue(); 2779 } 2780 2781 2782 auto cancelDiamond = [&](SDValue A,SDValue B) { 2783 SDLoc DL(N); 2784 SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z); 2785 Combiner.AddToWorklist(NewY.getNode()); 2786 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X, 2787 DAG.getConstant(0, DL, X.getValueType()), 2788 NewY.getValue(1)); 2789 }; 2790 2791 /** 2792 * (uaddo A, B) 2793 * | 2794 * Sum 2795 * | 2796 * (addcarry *, 0, Z) 2797 */ 2798 if (Carry0.getOperand(0) == Carry1.getValue(0)) { 2799 return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1)); 2800 } 2801 2802 /** 2803 * (addcarry A, 0, Z) 2804 * | 2805 * Sum 2806 * | 2807 * (uaddo *, B) 2808 */ 2809 if (Carry1.getOperand(0) == Carry0.getValue(0)) { 2810 return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1)); 2811 } 2812 2813 if (Carry1.getOperand(1) == Carry0.getValue(0)) { 2814 return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0)); 2815 } 2816 2817 return SDValue(); 2818 } 2819 2820 // If we are facing some sort of diamond carry/borrow in/out pattern try to 2821 // match patterns like: 2822 // 2823 // (uaddo A, B) CarryIn 2824 // | \ | 2825 // | \ | 2826 // PartialSum PartialCarryOutX / 2827 // | | / 2828 // | ____|____________/ 2829 // | / | 2830 // (uaddo *, *) \________ 2831 // | \ \ 2832 // | \ | 2833 // | PartialCarryOutY | 2834 // | \ | 2835 // | \ / 2836 // AddCarrySum | ______/ 2837 // | / 2838 // CarryOut = (or *, *) 2839 // 
2840 // And generate ADDCARRY (or SUBCARRY) with two result values: 2841 // 2842 // {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn) 2843 // 2844 // Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with 2845 // a single path for carry/borrow out propagation: 2846 static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, 2847 const TargetLowering &TLI, SDValue Carry0, 2848 SDValue Carry1, SDNode *N) { 2849 if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1) 2850 return SDValue(); 2851 unsigned Opcode = Carry0.getOpcode(); 2852 if (Opcode != Carry1.getOpcode()) 2853 return SDValue(); 2854 if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) 2855 return SDValue(); 2856 2857 // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the 2858 // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in 2859 // the above ASCII art.) 2860 if (Carry1.getOperand(0) != Carry0.getValue(0) && 2861 Carry1.getOperand(1) != Carry0.getValue(0)) 2862 std::swap(Carry0, Carry1); 2863 if (Carry1.getOperand(0) != Carry0.getValue(0) && 2864 Carry1.getOperand(1) != Carry0.getValue(0)) 2865 return SDValue(); 2866 2867 // The carry in value must be on the righthand side for subtraction. 2868 unsigned CarryInOperandNum = 2869 Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0; 2870 if (Opcode == ISD::USUBO && CarryInOperandNum != 1) 2871 return SDValue(); 2872 SDValue CarryIn = Carry1.getOperand(CarryInOperandNum); 2873 2874 unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY; 2875 if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType())) 2876 return SDValue(); 2877 2878 // Verify that the carry/borrow in is plausibly a carry/borrow bit. 2879 // TODO: make getAsCarry() aware of how partial carries are merged. 
2880 if (CarryIn.getOpcode() != ISD::ZERO_EXTEND) 2881 return SDValue(); 2882 CarryIn = CarryIn.getOperand(0); 2883 if (CarryIn.getValueType() != MVT::i1) 2884 return SDValue(); 2885 2886 SDLoc DL(N); 2887 SDValue Merged = 2888 DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0), 2889 Carry0.getOperand(1), CarryIn); 2890 2891 // Please note that because we have proven that the result of the UADDO/USUBO 2892 // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can 2893 // therefore prove that if the first UADDO/USUBO overflows, the second 2894 // UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the 2895 // maximum value. 2896 // 2897 // 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry 2898 // 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow) 2899 // 2900 // This is important because it means that OR and XOR can be used to merge 2901 // carry flags; and that AND can return a constant zero. 2902 // 2903 // TODO: match other operations that can merge flags (ADD, etc) 2904 DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0)); 2905 if (N->getOpcode() == ISD::AND) 2906 return DAG.getConstant(0, DL, MVT::i1); 2907 return Merged.getValue(1); 2908 } 2909 2910 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, 2911 SDNode *N) { 2912 // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry. 2913 if (isBitwiseNot(N0)) 2914 if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) { 2915 SDLoc DL(N); 2916 SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1, 2917 N0.getOperand(0), NotC); 2918 return CombineTo(N, Sub, 2919 flipBoolean(Sub.getValue(1), DL, DAG, TLI)); 2920 } 2921 2922 // Iff the flag result is dead: 2923 // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry) 2924 // Don't do this if the Carry comes from the uaddo. 
It won't remove the uaddo 2925 // or the dependency between the instructions. 2926 if ((N0.getOpcode() == ISD::ADD || 2927 (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 && 2928 N0.getValue(1) != CarryIn)) && 2929 isNullConstant(N1) && !N->hasAnyUseOfValue(1)) 2930 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), 2931 N0.getOperand(0), N0.getOperand(1), CarryIn); 2932 2933 /** 2934 * When one of the addcarry argument is itself a carry, we may be facing 2935 * a diamond carry propagation. In which case we try to transform the DAG 2936 * to ensure linear carry propagation if that is possible. 2937 */ 2938 if (auto Y = getAsCarry(TLI, N1)) { 2939 // Because both are carries, Y and Z can be swapped. 2940 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N)) 2941 return R; 2942 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N)) 2943 return R; 2944 } 2945 2946 return SDValue(); 2947 } 2948 2949 // Since it may not be valid to emit a fold to zero for vector initializers 2950 // check if we can before folding. 2951 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT, 2952 SelectionDAG &DAG, bool LegalOperations) { 2953 if (!VT.isVector()) 2954 return DAG.getConstant(0, DL, VT); 2955 if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) 2956 return DAG.getConstant(0, DL, VT); 2957 return SDValue(); 2958 } 2959 2960 SDValue DAGCombiner::visitSUB(SDNode *N) { 2961 SDValue N0 = N->getOperand(0); 2962 SDValue N1 = N->getOperand(1); 2963 EVT VT = N0.getValueType(); 2964 SDLoc DL(N); 2965 2966 // fold vector ops 2967 if (VT.isVector()) { 2968 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 2969 return FoldedVOp; 2970 2971 // fold (sub x, 0) -> x, vector edition 2972 if (ISD::isBuildVectorAllZeros(N1.getNode())) 2973 return N0; 2974 } 2975 2976 // fold (sub x, x) -> 0 2977 // FIXME: Refactor this and xor and other similar operations together. 
2978 if (N0 == N1) 2979 return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); 2980 2981 // fold (sub c1, c2) -> c3 2982 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1})) 2983 return C; 2984 2985 if (SDValue NewSel = foldBinOpIntoSelect(N)) 2986 return NewSel; 2987 2988 ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); 2989 2990 // fold (sub x, c) -> (add x, -c) 2991 if (N1C) { 2992 return DAG.getNode(ISD::ADD, DL, VT, N0, 2993 DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); 2994 } 2995 2996 if (isNullOrNullSplat(N0)) { 2997 unsigned BitWidth = VT.getScalarSizeInBits(); 2998 // Right-shifting everything out but the sign bit followed by negation is 2999 // the same as flipping arithmetic/logical shift type without the negation: 3000 // -(X >>u 31) -> (X >>s 31) 3001 // -(X >>s 31) -> (X >>u 31) 3002 if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) { 3003 ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1)); 3004 if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) { 3005 auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA; 3006 if (!LegalOperations || TLI.isOperationLegal(NewSh, VT)) 3007 return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1)); 3008 } 3009 } 3010 3011 // 0 - X --> 0 if the sub is NUW. 3012 if (N->getFlags().hasNoUnsignedWrap()) 3013 return N0; 3014 3015 if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) { 3016 // N1 is either 0 or the minimum signed value. If the sub is NSW, then 3017 // N1 must be 0 because negating the minimum signed value is undefined. 3018 if (N->getFlags().hasNoSignedWrap()) 3019 return N0; 3020 3021 // 0 - X --> X if X is 0 or the minimum signed value. 3022 return N1; 3023 } 3024 } 3025 3026 // Canonicalize (sub -1, x) -> ~x, i.e. 
(xor x, -1) 3027 if (isAllOnesOrAllOnesSplat(N0)) 3028 return DAG.getNode(ISD::XOR, DL, VT, N1, N0); 3029 3030 // fold (A - (0-B)) -> A+B 3031 if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0))) 3032 return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1)); 3033 3034 // fold A-(A-B) -> B 3035 if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0)) 3036 return N1.getOperand(1); 3037 3038 // fold (A+B)-A -> B 3039 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1) 3040 return N0.getOperand(1); 3041 3042 // fold (A+B)-B -> A 3043 if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1) 3044 return N0.getOperand(0); 3045 3046 // fold (A+C1)-C2 -> A+(C1-C2) 3047 if (N0.getOpcode() == ISD::ADD && 3048 isConstantOrConstantVector(N1, /* NoOpaques */ true) && 3049 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { 3050 SDValue NewC = 3051 DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1}); 3052 assert(NewC && "Constant folding failed"); 3053 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC); 3054 } 3055 3056 // fold C2-(A+C1) -> (C2-C1)-A 3057 if (N1.getOpcode() == ISD::ADD) { 3058 SDValue N11 = N1.getOperand(1); 3059 if (isConstantOrConstantVector(N0, /* NoOpaques */ true) && 3060 isConstantOrConstantVector(N11, /* NoOpaques */ true)) { 3061 SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11}); 3062 assert(NewC && "Constant folding failed"); 3063 return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0)); 3064 } 3065 } 3066 3067 // fold (A-C1)-C2 -> A-(C1+C2) 3068 if (N0.getOpcode() == ISD::SUB && 3069 isConstantOrConstantVector(N1, /* NoOpaques */ true) && 3070 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { 3071 SDValue NewC = 3072 DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1}); 3073 assert(NewC && "Constant folding failed"); 3074 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC); 3075 } 3076 3077 // fold (c1-A)-c2 -> 
(c1-c2)-A 3078 if (N0.getOpcode() == ISD::SUB && 3079 isConstantOrConstantVector(N1, /* NoOpaques */ true) && 3080 isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) { 3081 SDValue NewC = 3082 DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1}); 3083 assert(NewC && "Constant folding failed"); 3084 return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1)); 3085 } 3086 3087 // fold ((A+(B+or-C))-B) -> A+or-C 3088 if (N0.getOpcode() == ISD::ADD && 3089 (N0.getOperand(1).getOpcode() == ISD::SUB || 3090 N0.getOperand(1).getOpcode() == ISD::ADD) && 3091 N0.getOperand(1).getOperand(0) == N1) 3092 return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0), 3093 N0.getOperand(1).getOperand(1)); 3094 3095 // fold ((A+(C+B))-B) -> A+C 3096 if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD && 3097 N0.getOperand(1).getOperand(1) == N1) 3098 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), 3099 N0.getOperand(1).getOperand(0)); 3100 3101 // fold ((A-(B-C))-C) -> A-B 3102 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB && 3103 N0.getOperand(1).getOperand(1) == N1) 3104 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), 3105 N0.getOperand(1).getOperand(0)); 3106 3107 // fold (A-(B-C)) -> A+(C-B) 3108 if (N1.getOpcode() == ISD::SUB && N1.hasOneUse()) 3109 return DAG.getNode(ISD::ADD, DL, VT, N0, 3110 DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1), 3111 N1.getOperand(0))); 3112 3113 // A - (A & B) -> A & (~B) 3114 if (N1.getOpcode() == ISD::AND) { 3115 SDValue A = N1.getOperand(0); 3116 SDValue B = N1.getOperand(1); 3117 if (A != N0) 3118 std::swap(A, B); 3119 if (A == N0 && 3120 (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) { 3121 SDValue InvB = 3122 DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT)); 3123 return DAG.getNode(ISD::AND, DL, VT, A, InvB); 3124 } 3125 } 3126 3127 // fold (X - (-Y * Z)) -> (X + (Y * Z)) 3128 if 
(N1.getOpcode() == ISD::MUL && N1.hasOneUse()) { 3129 if (N1.getOperand(0).getOpcode() == ISD::SUB && 3130 isNullOrNullSplat(N1.getOperand(0).getOperand(0))) { 3131 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, 3132 N1.getOperand(0).getOperand(1), 3133 N1.getOperand(1)); 3134 return DAG.getNode(ISD::ADD, DL, VT, N0, Mul); 3135 } 3136 if (N1.getOperand(1).getOpcode() == ISD::SUB && 3137 isNullOrNullSplat(N1.getOperand(1).getOperand(0))) { 3138 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, 3139 N1.getOperand(0), 3140 N1.getOperand(1).getOperand(1)); 3141 return DAG.getNode(ISD::ADD, DL, VT, N0, Mul); 3142 } 3143 } 3144 3145 // If either operand of a sub is undef, the result is undef 3146 if (N0.isUndef()) 3147 return N0; 3148 if (N1.isUndef()) 3149 return N1; 3150 3151 if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG)) 3152 return V; 3153 3154 if (SDValue V = foldAddSubOfSignBit(N, DAG)) 3155 return V; 3156 3157 if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N))) 3158 return V; 3159 3160 // (x - y) - 1 -> add (xor y, -1), x 3161 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) { 3162 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), 3163 DAG.getAllOnesConstant(DL, VT)); 3164 return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0)); 3165 } 3166 3167 // Look for: 3168 // sub y, (xor x, -1) 3169 // And if the target does not like this form then turn into: 3170 // add (add x, y), 1 3171 if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) { 3172 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0)); 3173 return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT)); 3174 } 3175 3176 // Hoist one-use addition by non-opaque constant: 3177 // (x + C) - y -> (x - y) + C 3178 if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD && 3179 isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { 3180 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); 3181 return 
DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1)); 3182 } 3183 // y - (x + C) -> (y - x) - C 3184 if (N1.hasOneUse() && N1.getOpcode() == ISD::ADD && 3185 isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) { 3186 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0)); 3187 return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1)); 3188 } 3189 // (x - C) - y -> (x - y) - C 3190 // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. 3191 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 3192 isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { 3193 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); 3194 return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1)); 3195 } 3196 // (C - x) - y -> C - (x + y) 3197 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 3198 isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) { 3199 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1); 3200 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add); 3201 } 3202 3203 // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1' 3204 // rather than 'sub 0/1' (the sext should get folded). 
3205 // sub X, (zext i1 Y) --> add X, (sext i1 Y) 3206 if (N1.getOpcode() == ISD::ZERO_EXTEND && 3207 N1.getOperand(0).getScalarValueSizeInBits() == 1 && 3208 TLI.getBooleanContents(VT) == 3209 TargetLowering::ZeroOrNegativeOneBooleanContent) { 3210 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0)); 3211 return DAG.getNode(ISD::ADD, DL, VT, N0, SExt); 3212 } 3213 3214 // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X) 3215 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { 3216 if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) { 3217 SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1); 3218 SDValue S0 = N1.getOperand(0); 3219 if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) { 3220 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 3221 if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) 3222 if (C->getAPIntValue() == (OpSizeInBits - 1)) 3223 return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0); 3224 } 3225 } 3226 } 3227 3228 // If the relocation model supports it, consider symbol offsets. 
3229 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0)) 3230 if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) { 3231 // fold (sub Sym, c) -> Sym-c 3232 if (N1C && GA->getOpcode() == ISD::GlobalAddress) 3233 return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT, 3234 GA->getOffset() - 3235 (uint64_t)N1C->getSExtValue()); 3236 // fold (sub Sym+c1, Sym+c2) -> c1-c2 3237 if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1)) 3238 if (GA->getGlobal() == GB->getGlobal()) 3239 return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(), 3240 DL, VT); 3241 } 3242 3243 // sub X, (sextinreg Y i1) -> add X, (and Y 1) 3244 if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { 3245 VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1)); 3246 if (TN->getVT() == MVT::i1) { 3247 SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), 3248 DAG.getConstant(1, DL, VT)); 3249 return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt); 3250 } 3251 } 3252 3253 // Prefer an add for more folding potential and possibly better codegen: 3254 // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1) 3255 if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) { 3256 SDValue ShAmt = N1.getOperand(1); 3257 ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt); 3258 if (ShAmtC && 3259 ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) { 3260 SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt); 3261 return DAG.getNode(ISD::ADD, DL, VT, N0, SRA); 3262 } 3263 } 3264 3265 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) { 3266 // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry) 3267 if (SDValue Carry = getAsCarry(TLI, N0)) { 3268 SDValue X = N1; 3269 SDValue Zero = DAG.getConstant(0, DL, VT); 3270 SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X); 3271 return DAG.getNode(ISD::ADDCARRY, DL, 3272 DAG.getVTList(VT, Carry.getValueType()), NegX, Zero, 3273 Carry); 3274 } 3275 } 3276 3277 return SDValue(); 3278 } 3279 3280 
// Combine SSUBSAT/USUBSAT (saturating subtract) nodes.
SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  SDLoc DL(N);

  // Vector-specific folds.
  if (VT.isVector()) {
    // TODO SimplifyVBinOp

    // (sub_sat x, 0) -> x, vector edition.
    if (ISD::isBuildVectorAllZeros(RHS.getNode()))
      return LHS;
  }

  // (sub_sat x, undef) -> 0
  if (LHS.isUndef() || RHS.isUndef())
    return DAG.getConstant(0, DL, VT);

  // (sub_sat x, x) -> 0
  if (LHS == RHS)
    return DAG.getConstant(0, DL, VT);

  // (sub_sat c1, c2) -> c3
  if (SDValue Folded =
          DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {LHS, RHS}))
    return Folded;

  // (sub_sat x, 0) -> x, scalar edition.
  if (isNullConstant(RHS))
    return LHS;

  return SDValue();
}

// Combine ISD::SUBC (subtract producing a glue borrow-out in result 1).
SDValue DAGCombiner::visitSUBC(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  SDLoc DL(N);

  // Nobody consumes the borrow flag? Replace with a plain SUB.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, LHS, RHS),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // (subc x, x) -> 0 + no borrow
  if (LHS == RHS)
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // (subc x, 0) -> x + no borrow
  if (isNullConstant(RHS))
    return CombineTo(N, LHS, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // Canonicalize (subc -1, x) -> ~x, i.e. (xor x, -1) + no borrow.
  if (isAllOnesConstant(LHS))
    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, RHS, LHS),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  return SDValue();
}

// Combine SSUBO/USUBO (subtract with boolean overflow result in result 1).
SDValue DAGCombiner::visitSUBO(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  bool IsSigned = (ISD::SSUBO == N->getOpcode());

  EVT CarryVT = N->getValueType(1);
  SDLoc DL(N);

  // Nobody consumes the overflow flag? Replace with a plain SUB.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, LHS, RHS),
                     DAG.getUNDEF(CarryVT));

  // (subo x, x) -> 0 + no borrow
  if (LHS == RHS)
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getConstant(0, DL, CarryVT));

  ConstantSDNode *RHSC = getAsNonOpaqueConstant(RHS);

  // (ssubo x, c) -> (saddo x, -c); invalid when c is the minimum signed
  // value, since -c would overflow.
  if (IsSigned && RHSC && !RHSC->getAPIntValue().isMinSignedValue())
    return DAG.getNode(ISD::SADDO, DL, N->getVTList(), LHS,
                       DAG.getConstant(-RHSC->getAPIntValue(), DL, VT));

  // (subo x, 0) -> x + no borrow
  if (isNullOrNullSplat(RHS))
    return CombineTo(N, LHS, DAG.getConstant(0, DL, CarryVT));

  // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow.
  if (!IsSigned && isAllOnesOrAllOnesSplat(LHS))
    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, RHS, LHS),
                     DAG.getConstant(0, DL, CarryVT));

  return SDValue();
}

// Combine ISD::SUBE (subtract with glue borrow-in and borrow-out).
SDValue DAGCombiner::visitSUBE(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // (sube x, y, false) -> (subc x, y)
  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
    return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), LHS, RHS);

  return SDValue();
}

// Combine ISD::SUBCARRY (subtract with boolean borrow-in and borrow-out).
SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // (subcarry x, y, false) -> (usubo x, y), when USUBO is available.
  if (isNullConstant(CarryIn) &&
      (!LegalOperations ||
       TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0))))
    return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), LHS, RHS);

  return SDValue();
}

// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
// UMULFIXSAT here.
/// Combine a fixed-point multiply (SMULFIX/SMULFIXSAT/UMULFIX/UMULFIXSAT).
/// Operand 2 is the scale; only scale-independent folds are done here.
SDValue DAGCombiner::visitMULFIX(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Scale = N->getOperand(2);
  EVT VT = N0.getValueType();

  // fold (mulfix x, undef, scale) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  // Canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);

  // fold (mulfix x, 0, scale) -> 0
  if (isNullConstant(N1))
    return DAG.getConstant(0, SDLoc(N), VT);

  return SDValue();
}

/// Combine an integer multiply. Folds are ordered from cheapest/strongest
/// (constant folding, identities) to strength reductions (shifts, shift+add)
/// and finally reassociation; the first matching fold wins.
SDValue DAGCombiner::visitMUL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // fold (mul x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  bool N1IsConst = false;
  bool N1IsOpaqueConst = false;
  APInt ConstValue1;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // For vectors, only a splat constant RHS participates in the
    // ConstValue1-based folds below.
    N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
    assert((!N1IsConst ||
            ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
           "Splat APInt should be element width");
  } else {
    N1IsConst = isa<ConstantSDNode>(N1);
    if (N1IsConst) {
      ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
      N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
    }
  }

  // fold (mul c1, c2) -> c1*c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);

  // fold (mul x, 0) -> 0
  if (N1IsConst && ConstValue1.isNullValue())
    return N1;

  // fold (mul x, 1) -> x
  if (N1IsConst && ConstValue1.isOneValue())
    return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (mul x, -1) -> 0-x
  if (N1IsConst && ConstValue1.isAllOnesValue()) {
    SDLoc DL(N);
    return DAG.getNode(ISD::SUB, DL, VT,
                       DAG.getConstant(0, DL, VT), N0);
  }

  // fold (mul x, (1 << c)) -> x << c
  // Applies to non-uniform power-of-2 vectors too, but only before vector
  // legalization so the shift can still be lowered.
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1) &&
      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
    SDLoc DL(N);
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
    return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
  }

  // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
  if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
    unsigned Log2Val = (-ConstValue1).logBase2();
    SDLoc DL(N);
    // FIXME: If the input is something that is easily negated (e.g. a
    // single-use add), we should put the negate there.
    return DAG.getNode(ISD::SUB, DL, VT,
                       DAG.getConstant(0, DL, VT),
                       DAG.getNode(ISD::SHL, DL, VT, N0,
                                   DAG.getConstant(Log2Val, DL,
                                       getShiftAmountTy(N0.getValueType()))));
  }

  // Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub.
  // mul x, (2^N + 1) --> add (shl x, N), x
  // mul x, (2^N - 1) --> sub (shl x, N), x
  // Examples: x * 33 --> (x << 5) + x
  //           x * 15 --> (x << 4) - x
  //           x * -33 --> -((x << 5) + x)
  //           x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
  if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
    // TODO: We could handle more general decomposition of any constant by
    //       having the target set a limit on number of ops and making a
    //       callback to determine that sequence (similar to sqrt expansion).
    unsigned MathOp = ISD::DELETED_NODE;
    // Work with the magnitude; a negative constant negates the result at the
    // end.
    APInt MulC = ConstValue1.abs();
    if ((MulC - 1).isPowerOf2())
      MathOp = ISD::ADD;
    else if ((MulC + 1).isPowerOf2())
      MathOp = ISD::SUB;

    if (MathOp != ISD::DELETED_NODE) {
      unsigned ShAmt =
          MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
      assert(ShAmt < VT.getScalarSizeInBits() &&
             "multiply-by-constant generated out of bounds shift");
      SDLoc DL(N);
      SDValue Shl =
          DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
      SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0);
      if (ConstValue1.isNegative())
        R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
      return R;
    }
  }

  // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
  if (N0.getOpcode() == ISD::SHL &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
    SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
    // Only commit if the shift of constants folded to a constant.
    if (isConstantOrConstantVector(C3))
      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
  }

  // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
  // use.
  {
    SDValue Sh(nullptr, 0), Y(nullptr, 0);

    // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
    if (N0.getOpcode() == ISD::SHL &&
        isConstantOrConstantVector(N0.getOperand(1)) &&
        N0.getNode()->hasOneUse()) {
      Sh = N0; Y = N1;
    } else if (N1.getOpcode() == ISD::SHL &&
               isConstantOrConstantVector(N1.getOperand(1)) &&
               N1.getNode()->hasOneUse()) {
      Sh = N1; Y = N0;
    }

    if (Sh.getNode()) {
      SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
    }
  }

  // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
      N0.getOpcode() == ISD::ADD &&
      DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
      isMulAddWithConstProfitable(N, N0, N1))
    return DAG.getNode(ISD::ADD, SDLoc(N), VT,
                       DAG.getNode(ISD::MUL, SDLoc(N0), VT,
                                   N0.getOperand(0), N1),
                       DAG.getNode(ISD::MUL, SDLoc(N1), VT,
                                   N0.getOperand(1), N1));

  // reassociate mul
  if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
    return RMUL;

  return SDValue();
}

/// Return true if divmod libcall is available.
static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                     const TargetLowering &TLI) {
  RTLIB::Libcall LC;
  EVT NodeType = Node->getValueType(0);
  if (!NodeType.isSimple())
    return false;
  switch (NodeType.getSimpleVT().SimpleTy) {
  default: return false; // No libcall for vector types.
  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
  }

  return TLI.getLibcallName(LC) != nullptr;
}
/// Issue divrem if both quotient and remainder are needed.
/// Given a DIV/REM node, look for a sibling node computing the matching
/// REM/DIV of the same operands and combine both into a single DIVREM.
SDValue DAGCombiner::useDivRem(SDNode *Node) {
  if (Node->use_empty())
    return SDValue(); // This is a dead node, leave it alone.

  unsigned Opcode = Node->getOpcode();
  bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
  unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;

  // DivMod lib calls can still work on non-legal types if using lib-calls.
  EVT VT = Node->getValueType(0);
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
    return SDValue();

  // If DIVREM is going to get expanded into a libcall,
  // but there is no libcall available, then don't combine.
  if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
      !isDivRemLibcallAvailable(Node, isSigned, TLI))
    return SDValue();

  // If div is legal, it's better to do the normal expansion
  unsigned OtherOpcode = 0;
  if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
    OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
    if (TLI.isOperationLegalOrCustom(Opcode, VT))
      return SDValue();
  } else {
    OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
      return SDValue();
  }

  // Scan all users of the dividend for div/rem nodes with the same operands
  // and replace each with the appropriate result of one shared DIVREM.
  SDValue Op0 = Node->getOperand(0);
  SDValue Op1 = Node->getOperand(1);
  SDValue combined;
  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
         UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    // Skip the node itself and users already deleted/dead.
    if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
        User->use_empty())
      continue;
    // Convert the other matching node(s), too;
    // otherwise, the DIVREM may get target-legalized into something
    // target-specific that we won't be able to recognize.
    unsigned UserOpc = User->getOpcode();
    if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
        User->getOperand(0) == Op0 &&
        User->getOperand(1) == Op1) {
      if (!combined) {
        if (UserOpc == OtherOpcode) {
          SDVTList VTs = DAG.getVTList(VT, VT);
          combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
        } else if (UserOpc == DivRemOpc) {
          // An existing DIVREM already computes both results; reuse it.
          combined = SDValue(User, 0);
        } else {
          assert(UserOpc == Opcode);
          continue;
        }
      }
      // Result 0 of DIVREM is the quotient, result 1 is the remainder.
      if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
        CombineTo(User, combined);
      else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
        CombineTo(User, combined.getValue(1));
    }
  }
  return combined;
}

/// Common trivial folds for SDIV/UDIV/SREM/UREM: undef and degenerate
/// operands that need no target knowledge.
static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  unsigned Opc = N->getOpcode();
  bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // X / undef -> undef
  // X % undef -> undef
  // X / 0 -> undef
  // X % 0 -> undef
  // NOTE: This includes vectors where any divisor element is zero/undef.
  if (DAG.isUndef(Opc, {N0, N1}))
    return DAG.getUNDEF(VT);

  // undef / X -> 0
  // undef % X -> 0
  if (N0.isUndef())
    return DAG.getConstant(0, DL, VT);

  // 0 / X -> 0
  // 0 % X -> 0
  ConstantSDNode *N0C = isConstOrConstSplat(N0);
  if (N0C && N0C->isNullValue())
    return N0;

  // X / X -> 1
  // X % X -> 0
  if (N0 == N1)
    return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);

  // X / 1 -> X
  // X % 1 -> 0
  // If this is a boolean op (single-bit element type), we can't have
  // division-by-zero or remainder-by-zero, so assume the divisor is 1.
  // TODO: Similarly, if we're zero-extending a boolean divisor, then assume
  // it's a 1.
  if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
    return IsDiv ? N0 : DAG.getConstant(0, DL, VT);

  return SDValue();
}
/// Combine a signed integer divide.
SDValue DAGCombiner::visitSDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  SDLoc DL(N);

  // fold (sdiv c1, c2) -> c1/c2
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
    return C;

  // fold (sdiv X, -1) -> 0-X
  if (N1C && N1C->isAllOnesValue())
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);

  // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
  // Only X == MIN_SIGNED divides to 1; everything else truncates to 0.
  if (N1C && N1C->getAPIntValue().isMinSignedValue())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(1, DL, VT),
                         DAG.getConstant(0, DL, VT));

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // If we know the sign bits of both operands are zero, strength reduce to a
  // udiv instead.  Handles (X&15) /s 4 -> X&15 >> 2
  if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);

  if (SDValue V = visitSDIVLike(N0, N1, N)) {
    // If the corresponding remainder node exists, update its users with
    // (Dividend - (Quotient * Divisor).
    if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(),
                                              { N0, N1 })) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(Mul.getNode());
      AddToWorklist(Sub.getNode());
      CombineTo(RemNode, Sub);
    }
    return V;
  }

  // sdiv, srem -> sdivrem
  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
  // true.  Otherwise, we break the simplification logic in visitREM().
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue DivRem = useDivRem(N))
      return DivRem;

  return SDValue();
}

/// Strength-reduce an SDIV whose divisor is a (possibly negative) power of
/// two into shifts, or delegate to the target's multiplicative expansion.
SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);
  unsigned BitWidth = VT.getScalarSizeInBits();

  // Helper for determining whether a value is a power-2 constant scalar or a
  // vector of such elements.
  auto IsPowerOfTwo = [](ConstantSDNode *C) {
    if (C->isNullValue() || C->isOpaque())
      return false;
    if (C->getAPIntValue().isPowerOf2())
      return true;
    if ((-C->getAPIntValue()).isPowerOf2())
      return true;
    return false;
  };

  // fold (sdiv X, pow2) -> simple ops after legalize
  // FIXME: We check for the exact bit here because the generic lowering gives
  // better results in that case. The target-specific lowering should learn how
  // to handle exact sdivs efficiently.
  if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) {
    // Target-specific implementation of sdiv x, pow2.
    if (SDValue Res = BuildSDIVPow2(N))
      return Res;

    // Create constants that are functions of the shift amount value.
    EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
    SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
    SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
    C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
    SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
    // The sequence below only works when Inexact folded to a constant
    // (i.e. N1 really was a constant power of two per element).
    if (!isConstantOrConstantVector(Inexact))
      return SDValue();

    // Splat the sign bit into the register
    SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
                               DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
    AddToWorklist(Sign.getNode());

    // Add (N0 < 0) ? abs2 - 1 : 0;
    // This rounds the shift toward zero for negative dividends.
    SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
    AddToWorklist(Srl.getNode());
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
    AddToWorklist(Add.getNode());
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
    AddToWorklist(Sra.getNode());

    // Special case: (sdiv X, 1) -> X
    // Special Case: (sdiv X, -1) -> 0-X
    SDValue One = DAG.getConstant(1, DL, VT);
    SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
    SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
    SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
    SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
    Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);

    // If dividing by a positive value, we're done.  Otherwise, the result must
    // be negated.
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);

    // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
    SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
    SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
    return Res;
  }

  // If integer divide is expensive and we satisfy the requirements, emit an
  // alternate sequence.  Targets may check function attributes for size/speed
  // trade-offs.
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isConstantOrConstantVector(N1) &&
      !TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue Op = BuildSDIV(N))
      return Op;

  return SDValue();
}
/// Combine an unsigned integer divide.
SDValue DAGCombiner::visitUDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  SDLoc DL(N);

  // fold (udiv c1, c2) -> c1/c2
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
    return C;

  // fold (udiv X, -1) -> select(X == -1, 1, 0)
  // Unsigned -1 is the max value; only X == -1 divides to 1.
  if (N1C && N1C->getAPIntValue().isAllOnesValue())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(1, DL, VT),
                         DAG.getConstant(0, DL, VT));

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (SDValue V = visitUDIVLike(N0, N1, N)) {
    // If the corresponding remainder node exists, update its users with
    // (Dividend - (Quotient * Divisor).
    if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(),
                                              { N0, N1 })) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(Mul.getNode());
      AddToWorklist(Sub.getNode());
      CombineTo(RemNode, Sub);
    }
    return V;
  }

  // udiv, urem -> udivrem
  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
  // true.  Otherwise, we break the simplification logic in visitREM().
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue DivRem = useDivRem(N))
      return DivRem;

  return SDValue();
}

/// Strength-reduce a UDIV by a known power of two (possibly shifted) into a
/// right shift, or delegate to the target's multiplicative expansion.
SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // fold (udiv x, (1 << c)) -> x >>u c
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1)) {
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    AddToWorklist(LogBase2.getNode());

    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
    AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
  }

  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
  if (N1.getOpcode() == ISD::SHL) {
    SDValue N10 = N1.getOperand(0);
    if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
        DAG.isKnownToBeAPowerOfTwo(N10)) {
      SDValue LogBase2 = BuildLogBase2(N10, DL);
      AddToWorklist(LogBase2.getNode());

      EVT ADDVT = N1.getOperand(1).getValueType();
      SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
      AddToWorklist(Trunc.getNode());
      SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
    }
  }

  // fold (udiv x, c) -> alternate
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isConstantOrConstantVector(N1) &&
      !TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue Op = BuildUDIV(N))
      return Op;

  return SDValue();
}
// handles ISD::SREM and ISD::UREM
SDValue DAGCombiner::visitREM(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);

  bool isSigned = (Opcode == ISD::SREM);
  SDLoc DL(N);

  // fold (rem c1, c2) -> c1%c2
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
    return C;

  // fold (urem X, -1) -> select(X == -1, 0, x)
  // Unsigned -1 is the max value, so the remainder is X except when X == -1.
  if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(0, DL, VT), N0);

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (isSigned) {
    // If we know the sign bits of both operands are zero, strength reduce to a
    // urem instead.  Handles (X & 0x0FFFFFFF) %s 16 -> X&15
    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
  } else {
    SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
    if (DAG.isKnownToBeAPowerOfTwo(N1)) {
      // fold (urem x, pow2) -> (and x, pow2-1)
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::AND, DL, VT, N0, Add);
    }
    if (N1.getOpcode() == ISD::SHL &&
        DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
      // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
      // A power of two shifted left is still a power of two (or zero).
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::AND, DL, VT, N0, Add);
    }
  }

  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

  // If X/C can be simplified by the division-by-constant logic, lower
  // X%C to the equivalent of X-X/C*C.
  // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
  // speculative DIV must not cause a DIVREM conversion.  We guard against this
  // by skipping the simplification if isIntDivCheap().  When div is not cheap,
  // combine will not return a DIVREM.  Regardless, checking cheapness here
  // makes sense since the simplification results in fatter code.
  if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
    SDValue OptimizedDiv =
        isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
    if (OptimizedDiv.getNode()) {
      // If the equivalent Div node also exists, update its users.
      unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
      if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
                                                { N0, N1 }))
        CombineTo(DivNode, OptimizedDiv);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(OptimizedDiv.getNode());
      AddToWorklist(Mul.getNode());
      return Sub;
    }
  }

  // sdiv, srem -> sdivrem (and udiv, urem -> udivrem)
  if (SDValue DivRem = useDivRem(N))
    return DivRem.getValue(1);

  return SDValue();
}

/// Combine a signed multiply-high node.
SDValue DAGCombiner::visitMULHS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (VT.isVector()) {
    // fold (mulhs x, 0) -> 0
    // do not return N0/N1, because undef node may exist.
    if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
        ISD::isBuildVectorAllZeros(N1.getNode()))
      return DAG.getConstant(0, DL, VT);
  }

  // fold (mulhs x, 0) -> 0
  if (isNullConstant(N1))
    return N1;
  // fold (mulhs x, 1) -> (sra x, size(x)-1)
  // The high half of x*1 is the sign extension of x.
  if (isOneConstant(N1))
    return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
                       DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
                                       getShiftAmountTy(N0.getValueType())));

  // fold (mulhs x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // If the type twice as wide is legal, transform the mulhs to a wider multiply
  // plus a shift.
  if (VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
      N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(N1.getValueType())));
      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    }
  }

  return SDValue();
}
/// Combine an unsigned multiply-high node.
SDValue DAGCombiner::visitMULHU(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (VT.isVector()) {
    // fold (mulhu x, 0) -> 0
    // do not return N0/N1, because undef node may exist.
    if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
        ISD::isBuildVectorAllZeros(N1.getNode()))
      return DAG.getConstant(0, DL, VT);
  }

  // fold (mulhu x, 0) -> 0
  if (isNullConstant(N1))
    return N1;
  // fold (mulhu x, 1) -> 0
  // The high half of an unsigned x*1 is always zero.
  if (isOneConstant(N1))
    return DAG.getConstant(0, DL, N0.getValueType());
  // fold (mulhu x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
    unsigned NumEltBits = VT.getScalarSizeInBits();
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    SDValue SRLAmt = DAG.getNode(
        ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
  }

  // If the type twice as wide is legal, transform the mulhu to a wider multiply
  // plus a shift.
  if (VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(N1.getValueType())));
      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    }
  }

  return SDValue();
}

/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
/// give the opcodes for the two computations that are being performed. Return
/// true if a simplification was made.
SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                                unsigned HiOp) {
  // If the high half is not needed, just compute the low half.
  bool HiExists = N->hasAnyUseOfValue(1);
  if (!HiExists && (!LegalOperations ||
                    TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
    return CombineTo(N, Res, Res);
  }

  // If the low half is not needed, just compute the high half.
  bool LoExists = N->hasAnyUseOfValue(0);
  if (!LoExists && (!LegalOperations ||
                    TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
    return CombineTo(N, Res, Res);
  }

  // If both halves are used, return as it is.
  if (LoExists && HiExists)
    return SDValue();

  // If the two computed results can be simplified separately, separate them.
  // Build each half speculatively, run the combiner on it, and only commit
  // when the combiner produced something new that is legal to use.
  if (LoExists) {
    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
    AddToWorklist(Lo.getNode());
    SDValue LoOpt = combine(Lo.getNode());
    if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
        (!LegalOperations ||
         TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
      return CombineTo(N, LoOpt, LoOpt);
  }

  if (HiExists) {
    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
    AddToWorklist(Hi.getNode());
    SDValue HiOpt = combine(Hi.getNode());
    if (HiOpt.getNode() && HiOpt != Hi &&
        (!LegalOperations ||
         TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
      return CombineTo(N, HiOpt, HiOpt);
  }

  return SDValue();
}
4185 if (LoExists) { 4186 SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); 4187 AddToWorklist(Lo.getNode()); 4188 SDValue LoOpt = combine(Lo.getNode()); 4189 if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && 4190 (!LegalOperations || 4191 TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType()))) 4192 return CombineTo(N, LoOpt, LoOpt); 4193 } 4194 4195 if (HiExists) { 4196 SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); 4197 AddToWorklist(Hi.getNode()); 4198 SDValue HiOpt = combine(Hi.getNode()); 4199 if (HiOpt.getNode() && HiOpt != Hi && 4200 (!LegalOperations || 4201 TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType()))) 4202 return CombineTo(N, HiOpt, HiOpt); 4203 } 4204 4205 return SDValue(); 4206 } 4207 4208 SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { 4209 if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS)) 4210 return Res; 4211 4212 EVT VT = N->getValueType(0); 4213 SDLoc DL(N); 4214 4215 // If the type is twice as wide is legal, transform the mulhu to a wider 4216 // multiply plus a shift. 4217 if (VT.isSimple() && !VT.isVector()) { 4218 MVT Simple = VT.getSimpleVT(); 4219 unsigned SimpleSize = Simple.getSizeInBits(); 4220 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); 4221 if (TLI.isOperationLegal(ISD::MUL, NewVT)) { 4222 SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0)); 4223 SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1)); 4224 Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); 4225 // Compute the high part as N1. 4226 Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, 4227 DAG.getConstant(SimpleSize, DL, 4228 getShiftAmountTy(Lo.getValueType()))); 4229 Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); 4230 // Compute the low part as N0. 
4231 Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); 4232 return CombineTo(N, Lo, Hi); 4233 } 4234 } 4235 4236 return SDValue(); 4237 } 4238 4239 SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { 4240 if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU)) 4241 return Res; 4242 4243 EVT VT = N->getValueType(0); 4244 SDLoc DL(N); 4245 4246 // (umul_lohi N0, 0) -> (0, 0) 4247 if (isNullConstant(N->getOperand(1))) { 4248 SDValue Zero = DAG.getConstant(0, DL, VT); 4249 return CombineTo(N, Zero, Zero); 4250 } 4251 4252 // (umul_lohi N0, 1) -> (N0, 0) 4253 if (isOneConstant(N->getOperand(1))) { 4254 SDValue Zero = DAG.getConstant(0, DL, VT); 4255 return CombineTo(N, N->getOperand(0), Zero); 4256 } 4257 4258 // If the type is twice as wide is legal, transform the mulhu to a wider 4259 // multiply plus a shift. 4260 if (VT.isSimple() && !VT.isVector()) { 4261 MVT Simple = VT.getSimpleVT(); 4262 unsigned SimpleSize = Simple.getSizeInBits(); 4263 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); 4264 if (TLI.isOperationLegal(ISD::MUL, NewVT)) { 4265 SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0)); 4266 SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1)); 4267 Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); 4268 // Compute the high part as N1. 4269 Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, 4270 DAG.getConstant(SimpleSize, DL, 4271 getShiftAmountTy(Lo.getValueType()))); 4272 Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); 4273 // Compute the low part as N0. 4274 Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); 4275 return CombineTo(N, Lo, Hi); 4276 } 4277 } 4278 4279 return SDValue(); 4280 } 4281 4282 SDValue DAGCombiner::visitMULO(SDNode *N) { 4283 SDValue N0 = N->getOperand(0); 4284 SDValue N1 = N->getOperand(1); 4285 EVT VT = N0.getValueType(); 4286 bool IsSigned = (ISD::SMULO == N->getOpcode()); 4287 4288 EVT CarryVT = N->getValueType(1); 4289 SDLoc DL(N); 4290 4291 // canonicalize constant to RHS. 
4292 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 4293 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 4294 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0); 4295 4296 // fold (mulo x, 0) -> 0 + no carry out 4297 if (isNullOrNullSplat(N1)) 4298 return CombineTo(N, DAG.getConstant(0, DL, VT), 4299 DAG.getConstant(0, DL, CarryVT)); 4300 4301 // (mulo x, 2) -> (addo x, x) 4302 if (ConstantSDNode *C2 = isConstOrConstSplat(N1)) 4303 if (C2->getAPIntValue() == 2) 4304 return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL, 4305 N->getVTList(), N0, N0); 4306 4307 return SDValue(); 4308 } 4309 4310 SDValue DAGCombiner::visitIMINMAX(SDNode *N) { 4311 SDValue N0 = N->getOperand(0); 4312 SDValue N1 = N->getOperand(1); 4313 EVT VT = N0.getValueType(); 4314 unsigned Opcode = N->getOpcode(); 4315 4316 // fold vector ops 4317 if (VT.isVector()) 4318 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 4319 return FoldedVOp; 4320 4321 // fold operation with constant operands. 4322 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1})) 4323 return C; 4324 4325 // canonicalize constant to RHS 4326 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 4327 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 4328 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); 4329 4330 // Is sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX. 4331 // Only do this if the current op isn't legal and the flipped is. 
  if (!TLI.isOperationLegal(Opcode, VT) &&
      (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
      (N1.isUndef() || DAG.SignBitIsZero(N1))) {
    unsigned AltOpcode;
    switch (Opcode) {
    case ISD::SMIN: AltOpcode = ISD::UMIN; break;
    case ISD::SMAX: AltOpcode = ISD::UMAX; break;
    case ISD::UMIN: AltOpcode = ISD::SMIN; break;
    case ISD::UMAX: AltOpcode = ISD::SMAX; break;
    default: llvm_unreachable("Unknown MINMAX opcode");
    }
    if (TLI.isOperationLegal(AltOpcode, VT))
      return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1);
  }

  return SDValue();
}

/// If this is a bitwise logic instruction and both operands have the same
/// opcode, try to sink the other opcode after the logic instruction.
/// I.e. logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y).
SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned LogicOpcode = N->getOpcode();
  unsigned HandOpcode = N0.getOpcode();
  assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
          LogicOpcode == ISD::XOR) && "Expected logic opcode");
  assert(HandOpcode == N1.getOpcode() && "Bad input!");

  // Bail early if none of these transforms apply.
  if (N0.getNumOperands() == 0)
    return SDValue();

  // FIXME: We should check number of uses of the operands to not increase
  // the instruction count for all transforms.

  // Handle size-changing casts.
  SDValue X = N0.getOperand(0);
  SDValue Y = N1.getOperand(0);
  EVT XVT = X.getValueType();
  SDLoc DL(N);
  if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
      HandOpcode == ISD::SIGN_EXTEND) {
    // If both operands have other uses, this transform would create extra
    // instructions without eliminating anything.
    if (!N0.hasOneUse() && !N1.hasOneUse())
      return SDValue();
    // We need matching integer source types.
    if (XVT != Y.getValueType())
      return SDValue();
    // Don't create an illegal op during or after legalization. Don't ever
    // create an unsupported vector op.
    if ((VT.isVector() || LegalOperations) &&
        !TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
      return SDValue();
    // Avoid infinite looping with PromoteIntBinOp.
    // TODO: Should we apply desirable/legal constraints to all opcodes?
    if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
        !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
      return SDValue();
    // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic);
  }

  // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
  if (HandOpcode == ISD::TRUNCATE) {
    // If both operands have other uses, this transform would create extra
    // instructions without eliminating anything.
    if (!N0.hasOneUse() && !N1.hasOneUse())
      return SDValue();
    // We need matching source types.
    if (XVT != Y.getValueType())
      return SDValue();
    // Don't create an illegal op during or after legalization.
    if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
      return SDValue();
    // Be extra careful sinking truncate. If it's free, there's no benefit in
    // widening a binop. Also, don't create a logic op on an illegal type.
    if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
      return SDValue();
    if (!TLI.isTypeLegal(XVT))
      return SDValue();
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic);
  }

  // For binops SHL/SRL/SRA/AND:
  //   logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
  if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
       HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
      N0.getOperand(1) == N1.getOperand(1)) {
    // If either operand has other uses, this transform is not an improvement.
    if (!N0.hasOneUse() || !N1.hasOneUse())
      return SDValue();
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
  }

  // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
  if (HandOpcode == ISD::BSWAP) {
    // If either operand has other uses, this transform is not an improvement.
    if (!N0.hasOneUse() || !N1.hasOneUse())
      return SDValue();
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic);
  }

  // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
  // Only perform this optimization up until type legalization, before
  // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
  // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
  // we don't want to undo this promotion.
  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
  // on scalars.
  if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
      Level <= AfterLegalizeTypes) {
    // Input types must be integer and the same.
    if (XVT.isInteger() && XVT == Y.getValueType() &&
        !(VT.isVector() && TLI.isTypeLegal(VT) &&
          !XVT.isVector() && !TLI.isTypeLegal(XVT))) {
      SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
      return DAG.getNode(HandOpcode, DL, VT, Logic);
    }
  }

  // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
  // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
  // If both shuffles use the same mask, and both shuffle within a single
  // vector, then it is worthwhile to move the swizzle after the operation.
  // The type-legalizer generates this pattern when loading illegal
  // vector types from memory. In many cases this allows additional shuffle
  // optimizations.
  // There are other cases where moving the shuffle after the xor/and/or
  // is profitable even if shuffles don't perform a swizzle.
  // If both shuffles use the same mask, and both shuffles have the same first
  // or second operand, then it might still be profitable to move the shuffle
  // after the xor/and/or operation.
  if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
    auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
    auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
    assert(X.getValueType() == Y.getValueType() &&
           "Inputs to shuffles are not the same type");

    // Check that both shuffles use the same mask. The masks are known to be of
    // the same length because the result vector type is the same.
    // Check also that shuffles have only one use to avoid introducing extra
    // instructions.
    if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
        !SVN0->getMask().equals(SVN1->getMask()))
      return SDValue();

    // Don't try to fold this node if it requires introducing a
    // build vector of all zeros that might be illegal at this stage.
    SDValue ShOp = N0.getOperand(1);
    if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
      ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

    // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
    if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
      SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
                                  N0.getOperand(0), N1.getOperand(0));
      return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
    }

    // Don't try to fold this node if it requires introducing a
    // build vector of all zeros that might be illegal at this stage.
    ShOp = N0.getOperand(0);
    if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
      ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

    // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
    if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
      SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
                                  N1.getOperand(1));
      return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
    }
  }

  return SDValue();
}

/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                                       const SDLoc &DL) {
  SDValue LL, LR, RL, RR, N0CC, N1CC;
  if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
      !isSetCCEquivalent(N1, RL, RR, N1CC))
    return SDValue();

  assert(N0.getValueType() == N1.getValueType() &&
         "Unexpected operand types for bitwise logic op");
  assert(LL.getValueType() == LR.getValueType() &&
         RL.getValueType() == RR.getValueType() &&
         "Unexpected operand types for setcc");

  // If we're here post-legalization or the logic op type is not i1, the logic
  // op type must match a setcc result type. Also, all folds require new
  // operations on the left and right operands, so those types must match.
  EVT VT = N0.getValueType();
  EVT OpVT = LL.getValueType();
  if (LegalOperations || VT.getScalarType() != MVT::i1)
    if (VT != getSetCCResultType(OpVT))
      return SDValue();
  if (OpVT != RL.getValueType())
    return SDValue();

  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
  bool IsInteger = OpVT.isInteger();
  if (LR == RR && CC0 == CC1 && IsInteger) {
    bool IsZero = isNullOrNullSplat(LR);
    bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);

    // All bits clear?
    bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
    // All sign bits clear?
    bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
    // Any bits set?
    bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
    // Any sign bits set?
    bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;

    // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
    // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
    // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
    // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
    if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
      SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
      AddToWorklist(Or.getNode());
      return DAG.getSetCC(DL, VT, Or, LR, CC1);
    }

    // All bits set?
    bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
    // All sign bits set?
    bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
    // Any bits clear?
    bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
    // Any sign bits clear?
    bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;

    // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
    // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
    // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
    // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1)
    if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
      SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
      AddToWorklist(And.getNode());
      return DAG.getSetCC(DL, VT, And, LR, CC1);
    }
  }

  // TODO: What is the 'or' equivalent of this fold?
  // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
  if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
      IsInteger && CC0 == ISD::SETNE &&
      ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
       (isAllOnesConstant(LR) && isNullConstant(RR)))) {
    SDValue One = DAG.getConstant(1, DL, OpVT);
    SDValue Two = DAG.getConstant(2, DL, OpVT);
    SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
    AddToWorklist(Add.getNode());
    return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
  }

  // Try more general transforms if the predicates match and the only user of
  // the compares is the 'and' or 'or'.
  if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
      N0.hasOneUse() && N1.hasOneUse()) {
    // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
    // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
    if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
      SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
      SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
      SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
      SDValue Zero = DAG.getConstant(0, DL, OpVT);
      return DAG.getSetCC(DL, VT, Or, Zero, CC1);
    }

    // Turn compare of constants whose difference is 1 bit into add+and+setcc.
    // TODO - support non-uniform vector amounts.
    if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) {
      // Match a shared variable operand and 2 non-opaque constant operands.
      ConstantSDNode *C0 = isConstOrConstSplat(LR);
      ConstantSDNode *C1 = isConstOrConstSplat(RR);
      if (LL == RL && C0 && C1 && !C0->isOpaque() && !C1->isOpaque()) {
        // Canonicalize larger constant as C0.
        if (C1->getAPIntValue().ugt(C0->getAPIntValue()))
          std::swap(C0, C1);

        // The difference of the constants must be a single bit.
        const APInt &C0Val = C0->getAPIntValue();
        const APInt &C1Val = C1->getAPIntValue();
        if ((C0Val - C1Val).isPowerOf2()) {
          // and/or (setcc X, C0, ne), (setcc X, C1, ne/eq) -->
          // setcc ((add X, -C1), ~(C0 - C1)), 0, ne/eq
          SDValue OffsetC = DAG.getConstant(-C1Val, DL, OpVT);
          SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LL, OffsetC);
          SDValue MaskC = DAG.getConstant(~(C0Val - C1Val), DL, OpVT);
          SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Add, MaskC);
          SDValue Zero = DAG.getConstant(0, DL, OpVT);
          return DAG.getSetCC(DL, VT, And, Zero, CC0);
        }
      }
    }
  }

  // Canonicalize equivalent operands to LL == RL.
  if (LL == RR && LR == RL) {
    // Swapping the operands of one compare makes the operand lists line up;
    // adjust its predicate accordingly.
    CC1 = ISD::getSetCCSwappedOperands(CC1);
    std::swap(RL, RR);
  }

  // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
  // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
  if (LL == RL && LR == RR) {
    ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
                                : ISD::getSetCCOrOperation(CC0, CC1, OpVT);
    if (NewCC != ISD::SETCC_INVALID &&
        (!LegalOperations ||
         (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
          TLI.isOperationLegal(ISD::SETCC, OpVT))))
      return DAG.getSetCC(DL, VT, LL, LR, NewCC);
  }

  return SDValue();
}

/// This contains all DAGCombine rules which reduce two values combined by
/// an And operation to a single value. This makes them reusable in the context
/// of visitSELECT(). Rules involving constants are not included as
/// visitSELECT() already handles those cases.
SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N1.getValueType();
  SDLoc DL(N);

  // fold (and x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
    return V;

  if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
      VT.getSizeInBits() <= 64) {
    if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
      if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
        // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
        // immediate for an add, but it is legal if its top c2 bits are set,
        // transform the ADD so the immediate doesn't need to be materialized
        // in a register.
        APInt ADDC = ADDI->getAPIntValue();
        APInt SRLC = SRLI->getAPIntValue();
        if (ADDC.getMinSignedBits() <= 64 &&
            SRLC.ult(VT.getSizeInBits()) &&
            !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
          APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                             SRLC.getZExtValue());
          if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
            // Setting the top bits is a no-op under the AND's mask, so pick
            // the equivalent immediate that IS legal for an add.
            ADDC |= Mask;
            if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
              SDLoc DL0(N0);
              SDValue NewAdd =
                DAG.getNode(ISD::ADD, DL0, VT,
                            N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
              CombineTo(N0.getNode(), NewAdd);
              // Return N so it doesn't get rechecked!
              return SDValue(N, 0);
            }
          }
        }
      }
    }
  }

  // Reduce bit extract of low half of an integer to the narrower type.
  // (and (srl i64:x, K), KMask) ->
  //   (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
    if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
      if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
        unsigned Size = VT.getSizeInBits();
        const APInt &AndMask = CAnd->getAPIntValue();
        unsigned ShiftBits = CShift->getZExtValue();

        // Bail out, this node will probably disappear anyway.
        if (ShiftBits == 0)
          return SDValue();

        unsigned MaskBits = AndMask.countTrailingOnes();
        EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);

        if (AndMask.isMask() &&
            // Required bits must not span the two halves of the integer and
            // must fit in the half size type.
4728 (ShiftBits + MaskBits <= Size / 2) && 4729 TLI.isNarrowingProfitable(VT, HalfVT) && 4730 TLI.isTypeDesirableForOp(ISD::AND, HalfVT) && 4731 TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) && 4732 TLI.isTruncateFree(VT, HalfVT) && 4733 TLI.isZExtFree(HalfVT, VT)) { 4734 // The isNarrowingProfitable is to avoid regressions on PPC and 4735 // AArch64 which match a few 64-bit bit insert / bit extract patterns 4736 // on downstream users of this. Those patterns could probably be 4737 // extended to handle extensions mixed in. 4738 4739 SDValue SL(N0); 4740 assert(MaskBits <= Size); 4741 4742 // Extracting the highest bit of the low half. 4743 EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout()); 4744 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT, 4745 N0.getOperand(0)); 4746 4747 SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT); 4748 SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT); 4749 SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK); 4750 SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask); 4751 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And); 4752 } 4753 } 4754 } 4755 } 4756 4757 return SDValue(); 4758 } 4759 4760 bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, 4761 EVT LoadResultTy, EVT &ExtVT) { 4762 if (!AndC->getAPIntValue().isMask()) 4763 return false; 4764 4765 unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes(); 4766 4767 ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); 4768 EVT LoadedVT = LoadN->getMemoryVT(); 4769 4770 if (ExtVT == LoadedVT && 4771 (!LegalOperations || 4772 TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { 4773 // ZEXTLOAD will match without needing to change the size of the value being 4774 // loaded. 4775 return true; 4776 } 4777 4778 // Do not change the width of a volatile or atomic loads. 
  if (!LoadN->isSimple())
    return false;

  // Do not generate loads of non-round integer types since these can
  // be expensive (and would be wrong if the type is not byte sized).
  if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
    return false;

  if (LegalOperations &&
      !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
    return false;

  // Give the target a final veto on shrinking this load.
  if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
    return false;

  return true;
}

/// Return true if it is legal to narrow the given load or store to access
/// memory type MemVT at a byte offset of ShAmt/8 bits.
bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
                                    ISD::LoadExtType ExtType, EVT &MemVT,
                                    unsigned ShAmt) {
  if (!LDST)
    return false;
  // Only allow byte offsets.
  if (ShAmt % 8)
    return false;

  // Do not generate loads of non-round integer types since these can
  // be expensive (and would be wrong if the type is not byte sized).
  if (!MemVT.isRound())
    return false;

  // Don't change the width of a volatile or atomic load/store.
  if (!LDST->isSimple())
    return false;

  // Verify that we are actually reducing a load width here.
  if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits())
    return false;

  // Ensure that this isn't going to produce an unsupported memory access.
  if (ShAmt &&
      !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                              LDST->getAddressSpace(), ShAmt / 8,
                              LDST->getMemOperand()->getFlags()))
    return false;

  // It's not possible to generate a constant of extended or untyped type.
  EVT PtrType = LDST->getBasePtr().getValueType();
  if (PtrType == MVT::Untyped || PtrType.isExtended())
    return false;

  if (isa<LoadSDNode>(LDST)) {
    LoadSDNode *Load = cast<LoadSDNode>(LDST);
    // Don't transform one with multiple uses, this would require adding a new
    // load.
    if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
      // Record nodes whose constant operand has bits outside the mask; these
      // constants will be narrowed by the caller before the mask is removed.
      if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
          (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
        NodesWithConsts.insert(N);
      continue;
    }

    if (!Op.hasOneUse())
      return false;

    switch(Op.getOpcode()) {
    case ISD::LOAD: {
      auto *Load = cast<LoadSDNode>(Op);
      EVT ExtVT;
      if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
          isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {

        // ZEXTLOAD is already small enough.
        if (Load->getExtensionType() == ISD::ZEXTLOAD &&
            ExtVT.bitsGE(Load->getMemoryVT()))
          continue;

        // Use LE to convert equal sized loads to zext.
        if (ExtVT.bitsLE(Load->getMemoryVT()))
          Loads.push_back(Load);

        continue;
      }
      return false;
    }
    case ISD::ZERO_EXTEND:
    case ISD::AssertZext: {
      unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
      EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
      EVT VT = Op.getOpcode() == ISD::AssertZext ?
        cast<VTSDNode>(Op.getOperand(1))->getVT() :
        Op.getOperand(0).getValueType();

      // We can accept extending nodes if the mask is wider or an equal
      // width to the original type.
      if (ExtVT.bitsGE(VT))
        continue;
      break;
    }
    case ISD::OR:
    case ISD::XOR:
    case ISD::AND:
      // Recurse through other bitwise logic nodes.
      if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
                             NodeToMask))
        return false;
      continue;
    }

    // Allow one node which will be masked along with any loads found.
    if (NodeToMask)
      return false;

    // Also ensure that the node to be masked only produces one data result.
    NodeToMask = Op.getNode();
    if (NodeToMask->getNumValues() > 1) {
      bool HasValue = false;
      for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
        MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
        if (VT != MVT::Glue && VT != MVT::Other) {
          if (HasValue) {
            NodeToMask = nullptr;
            return false;
          }
          HasValue = true;
        }
      }
      assert(HasValue && "Node to be masked has no data result?");
    }
  }
  return true;
}

/// Try to remove a redundant AND-mask by propagating the mask backwards
/// through a tree of bitwise logic to the loads feeding it, narrowing those
/// loads so their results are already correctly masked.
bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
  auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!Mask)
    return false;

  if (!Mask->getAPIntValue().isMask())
    return false;

  // No need to do anything if the and directly uses a load.
  if (isa<LoadSDNode>(N->getOperand(0)))
    return false;

  SmallVector<LoadSDNode*, 8> Loads;
  SmallPtrSet<SDNode*, 2> NodesWithConsts;
  SDNode *FixupNode = nullptr;
  if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
    if (Loads.size() == 0)
      return false;

    LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
    SDValue MaskOp = N->getOperand(1);

    // If it exists, fixup the single node we allow in the tree that needs
    // masking.
    if (FixupNode) {
      LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
      SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
                                FixupNode->getValueType(0),
                                SDValue(FixupNode, 0), MaskOp);
      DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
      // The RAUW above also rewrote the new AND's own operand; restore it.
      if (And.getOpcode() == ISD::AND)
        DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
    }

    // Narrow any constants that need it.
    for (auto *LogicN : NodesWithConsts) {
      SDValue Op0 = LogicN->getOperand(0);
      SDValue Op1 = LogicN->getOperand(1);

      // Make sure the constant is in Op1.
      if (isa<ConstantSDNode>(Op0))
        std::swap(Op0, Op1);

      SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
                                Op1, MaskOp);

      DAG.UpdateNodeOperands(LogicN, Op0, And);
    }

    // Create narrow loads.
    for (auto *Load : Loads) {
      LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
      SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
                                SDValue(Load, 0), MaskOp);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
      // The RAUW above also rewrote the new AND's own operand; restore it.
      if (And.getOpcode() == ISD::AND)
        And = SDValue(
          DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
      SDValue NewLoad = ReduceLoadWidth(And.getNode());
      assert(NewLoad &&
             "Shouldn't be masking the load if it can't be narrowed");
      CombineTo(Load, NewLoad, NewLoad.getValue(1));
    }
    // The mask is now redundant; replace the AND with its masked operand.
    DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
    return true;
  }
  return false;
}

// Unfold
//    x &  (-1 'logical shift' y)
// To
//    (x 'opposite logical shift' y) 'logical shift' y
// if it is better for performance.
SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
  assert(N->getOpcode() == ISD::AND);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Do we actually prefer shifts over mask?
  if (!TLI.shouldFoldMaskToVariableShiftPair(N0))
    return SDValue();

  // Try to match  (-1 '[outer] logical shift' y)
  unsigned OuterShift;
  unsigned InnerShift; // The opposite direction to the OuterShift.
  SDValue Y;           // Shift amount.
  auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
    if (!M.hasOneUse())
      return false;
    OuterShift = M->getOpcode();
    if (OuterShift == ISD::SHL)
      InnerShift = ISD::SRL;
    else if (OuterShift == ISD::SRL)
      InnerShift = ISD::SHL;
    else
      return false;
    if (!isAllOnesConstant(M->getOperand(0)))
      return false;
    Y = M->getOperand(1);
    return true;
  };

  // The mask may be on either side of the AND.
  SDValue X;
  if (matchMask(N1))
    X = N0;
  else if (matchMask(N0))
    X = N1;
  else
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  //     tmp = x 'opposite logical shift' y
  SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y);
  //     ret = tmp 'logical shift' y
  SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y);

  return T1;
}

/// Try to replace shift/logic that tests if a bit is clear with mask + setcc.
/// For a target with a bit test, this is expected to become test + set and save
/// at least 1 instruction.
static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
  assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");

  // This is probably not worthwhile without a supported type.
  EVT VT = And->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // Look through an optional extension and find a 'not'.
  // TODO: Should we favor test+set even without the 'not' op?
  SDValue Not = And->getOperand(0), And1 = And->getOperand(1);
  if (Not.getOpcode() == ISD::ANY_EXTEND)
    Not = Not.getOperand(0);
  if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1))
    return SDValue();

  // Look though an optional truncation. The source operand may not be the same
  // type as the original 'and', but that is ok because we are masking off
  // everything but the low bit.
  SDValue Srl = Not.getOperand(0);
  if (Srl.getOpcode() == ISD::TRUNCATE)
    Srl = Srl.getOperand(0);

  // Match a shift-right by constant.
  if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() ||
      !isa<ConstantSDNode>(Srl.getOperand(1)))
    return SDValue();

  // We might have looked through casts that make this transform invalid.
  // TODO: If the source type is wider than the result type, do the mask and
  // compare in the source type.
  const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1);
  unsigned VTBitWidth = VT.getSizeInBits();
  if (ShiftAmt.uge(VTBitWidth))
    return SDValue();

  // Turn this into a bit-test pattern using mask op + setcc:
  // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
  SDLoc DL(And);
  SDValue X = DAG.getZExtOrTrunc(Srl.getOperand(0), DL, VT);
  EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Mask = DAG.getConstant(
      APInt::getOneBitSet(VTBitWidth, ShiftAmt.getZExtValue()), DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
  return DAG.getZExtOrTrunc(Setcc, DL, VT);
}

/// Main combine entry point for ISD::AND nodes.
SDValue DAGCombiner::visitAND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N1.getValueType();

  // x & x --> x
  if (N0 == N1)
    return N0;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (and x, 0) -> 0, vector edition
    if (ISD::isBuildVectorAllZeros(N0.getNode()))
      // do not return N0, because undef node may exist in N0
      return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()),
                             SDLoc(N), N0.getValueType());
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      // do not return N1, because undef node may exist in N1
      return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()),
                             SDLoc(N), N1.getValueType());

    // fold (and x, -1) -> x, vector edition
    if (ISD::isBuildVectorAllOnes(N0.getNode()))
      return N1;
    if (ISD::isBuildVectorAllOnes(N1.getNode()))
      return N0;
  }

  // fold (and c1, c2) -> c1&c2
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);

  // fold (and x, -1) -> x
  if (isAllOnesConstant(N1))
    return N0;

  // if (and x, c) is known to be zero, return 0
  unsigned BitWidth = VT.getScalarSizeInBits();
  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                   APInt::getAllOnesValue(BitWidth)))
    return DAG.getConstant(0, SDLoc(N), VT);

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // reassociate and
  if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
    return RAND;

  // Try to convert a constant mask AND into a shuffle clear mask.
  if (VT.isVector())
    if (SDValue Shuffle = XformToShuffleWithZero(N))
      return Shuffle;

  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  // fold (and (or x, C), D) -> D if (C & D) == D
  auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
    return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
  };
  if (N0.getOpcode() == ISD::OR &&
      ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
    return N1;
  // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
    SDValue N0Op0 = N0.getOperand(0);
    APInt Mask = ~N1C->getAPIntValue();
    Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
    if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
      SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                                 N0.getValueType(), N0Op0);

      // Replace uses of the AND with uses of the Zero extend node.
      CombineTo(N, Zext);

      // We actually want to replace all uses of the any_extend with the
      // zero_extend, to avoid duplicating things. This will later cause this
      // AND to be folded.
      CombineTo(N0.getNode(), Zext);
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }

  // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
  // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
  // already be zero by virtue of the width of the base type of the load.
  //
  // the 'X' node here can either be nothing or an extract_vector_elt to catch
  // more cases.
  if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
       N0.getOperand(0).getOpcode() == ISD::LOAD &&
       N0.getOperand(0).getResNo() == 0) ||
      (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
    LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
                                         N0 : N0.getOperand(0) );

    // Get the constant (if applicable) the zero'th operand is being ANDed with.
    // This can be a pure constant or a vector splat, in which case we treat the
    // vector as a scalar and use the splat value.
5247 APInt Constant = APInt::getNullValue(1); 5248 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 5249 Constant = C->getAPIntValue(); 5250 } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) { 5251 APInt SplatValue, SplatUndef; 5252 unsigned SplatBitSize; 5253 bool HasAnyUndefs; 5254 bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef, 5255 SplatBitSize, HasAnyUndefs); 5256 if (IsSplat) { 5257 // Undef bits can contribute to a possible optimisation if set, so 5258 // set them. 5259 SplatValue |= SplatUndef; 5260 5261 // The splat value may be something like "0x00FFFFFF", which means 0 for 5262 // the first vector value and FF for the rest, repeating. We need a mask 5263 // that will apply equally to all members of the vector, so AND all the 5264 // lanes of the constant together. 5265 unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits(); 5266 5267 // If the splat value has been compressed to a bitlength lower 5268 // than the size of the vector lane, we need to re-expand it to 5269 // the lane size. 5270 if (EltBitWidth > SplatBitSize) 5271 for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth); 5272 SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2) 5273 SplatValue |= SplatValue.shl(SplatBitSize); 5274 5275 // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a 5276 // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value. 5277 if ((SplatBitSize % EltBitWidth) == 0) { 5278 Constant = APInt::getAllOnesValue(EltBitWidth); 5279 for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i) 5280 Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth); 5281 } 5282 } 5283 } 5284 5285 // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is 5286 // actually legal and isn't going to get expanded, else this is a false 5287 // optimisation. 
5288 bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, 5289 Load->getValueType(0), 5290 Load->getMemoryVT()); 5291 5292 // Resize the constant to the same size as the original memory access before 5293 // extension. If it is still the AllOnesValue then this AND is completely 5294 // unneeded. 5295 Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits()); 5296 5297 bool B; 5298 switch (Load->getExtensionType()) { 5299 default: B = false; break; 5300 case ISD::EXTLOAD: B = CanZextLoadProfitably; break; 5301 case ISD::ZEXTLOAD: 5302 case ISD::NON_EXTLOAD: B = true; break; 5303 } 5304 5305 if (B && Constant.isAllOnesValue()) { 5306 // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to 5307 // preserve semantics once we get rid of the AND. 5308 SDValue NewLoad(Load, 0); 5309 5310 // Fold the AND away. NewLoad may get replaced immediately. 5311 CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); 5312 5313 if (Load->getExtensionType() == ISD::EXTLOAD) { 5314 NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, 5315 Load->getValueType(0), SDLoc(Load), 5316 Load->getChain(), Load->getBasePtr(), 5317 Load->getOffset(), Load->getMemoryVT(), 5318 Load->getMemOperand()); 5319 // Replace uses of the EXTLOAD with the new ZEXTLOAD. 5320 if (Load->getNumValues() == 3) { 5321 // PRE/POST_INC loads have 3 values. 5322 SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), 5323 NewLoad.getValue(2) }; 5324 CombineTo(Load, To, 3, true); 5325 } else { 5326 CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); 5327 } 5328 } 5329 5330 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
5331 } 5332 } 5333 5334 // fold (and (load x), 255) -> (zextload x, i8) 5335 // fold (and (extload x, i16), 255) -> (zextload x, i8) 5336 // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) 5337 if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD || 5338 (N0.getOpcode() == ISD::ANY_EXTEND && 5339 N0.getOperand(0).getOpcode() == ISD::LOAD))) { 5340 if (SDValue Res = ReduceLoadWidth(N)) { 5341 LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND 5342 ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0); 5343 AddToWorklist(N); 5344 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 0), Res); 5345 return SDValue(N, 0); 5346 } 5347 } 5348 5349 if (LegalTypes) { 5350 // Attempt to propagate the AND back up to the leaves which, if they're 5351 // loads, can be combined to narrow loads and the AND node can be removed. 5352 // Perform after legalization so that extend nodes will already be 5353 // combined into the loads. 5354 if (BackwardsPropagateMask(N)) 5355 return SDValue(N, 0); 5356 } 5357 5358 if (SDValue Combined = visitANDLike(N0, N1, N)) 5359 return Combined; 5360 5361 // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) 5362 if (N0.getOpcode() == N1.getOpcode()) 5363 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) 5364 return V; 5365 5366 // Masking the negated extension of a boolean is just the zero-extended 5367 // boolean: 5368 // and (sub 0, zext(bool X)), 1 --> zext(bool X) 5369 // and (sub 0, sext(bool X)), 1 --> zext(bool X) 5370 // 5371 // Note: the SimplifyDemandedBits fold below can make an information-losing 5372 // transform, and then we have no way to find this better fold. 
5373 if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { 5374 if (isNullOrNullSplat(N0.getOperand(0))) { 5375 SDValue SubRHS = N0.getOperand(1); 5376 if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && 5377 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) 5378 return SubRHS; 5379 if (SubRHS.getOpcode() == ISD::SIGN_EXTEND && 5380 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) 5381 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0)); 5382 } 5383 } 5384 5385 // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) 5386 // fold (and (sra)) -> (and (srl)) when possible. 5387 if (SimplifyDemandedBits(SDValue(N, 0))) 5388 return SDValue(N, 0); 5389 5390 // fold (zext_inreg (extload x)) -> (zextload x) 5391 // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use 5392 if (ISD::isUNINDEXEDLoad(N0.getNode()) && 5393 (ISD::isEXTLoad(N0.getNode()) || 5394 (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) { 5395 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 5396 EVT MemVT = LN0->getMemoryVT(); 5397 // If we zero all the possible extended bits, then we can turn this into 5398 // a zextload if we are running before legalize or the operation is legal. 5399 unsigned ExtBitSize = N1.getScalarValueSizeInBits(); 5400 unsigned MemBitSize = MemVT.getScalarSizeInBits(); 5401 APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize); 5402 if (DAG.MaskedValueIsZero(N1, ExtBits) && 5403 ((!LegalOperations && LN0->isSimple()) || 5404 TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { 5405 SDValue ExtLoad = 5406 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), 5407 LN0->getBasePtr(), MemVT, LN0->getMemOperand()); 5408 AddToWorklist(N); 5409 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); 5410 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
5411 } 5412 } 5413 5414 // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const) 5415 if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) { 5416 if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), 5417 N0.getOperand(1), false)) 5418 return BSwap; 5419 } 5420 5421 if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N)) 5422 return Shifts; 5423 5424 if (TLI.hasBitTest(N0, N1)) 5425 if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) 5426 return V; 5427 5428 return SDValue(); 5429 } 5430 5431 /// Match (a >> 8) | (a << 8) as (bswap a) >> 16. 5432 SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, 5433 bool DemandHighBits) { 5434 if (!LegalOperations) 5435 return SDValue(); 5436 5437 EVT VT = N->getValueType(0); 5438 if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) 5439 return SDValue(); 5440 if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) 5441 return SDValue(); 5442 5443 // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff) 5444 bool LookPassAnd0 = false; 5445 bool LookPassAnd1 = false; 5446 if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) 5447 std::swap(N0, N1); 5448 if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) 5449 std::swap(N0, N1); 5450 if (N0.getOpcode() == ISD::AND) { 5451 if (!N0.getNode()->hasOneUse()) 5452 return SDValue(); 5453 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5454 // Also handle 0xffff since the LHS is guaranteed to have zeros there. 5455 // This is needed for X86. 
    if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
                  N01C->getZExtValue() != 0xFFFF))
      return SDValue();
    N0 = N0.getOperand(0);
    LookPassAnd0 = true;
  }

  if (N1.getOpcode() == ISD::AND) {
    if (!N1.getNode()->hasOneUse())
      return SDValue();
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C || N11C->getZExtValue() != 0xFF)
      return SDValue();
    N1 = N1.getOperand(0);
    LookPassAnd1 = true;
  }

  // Canonicalize so N0 is the SHL side and N1 the SRL side.
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse())
    return SDValue();

  // Both shift amounts must be the constant 8.
  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
  if (!N01C || !N11C)
    return SDValue();
  if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
    return SDValue();

  // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
  SDValue N00 = N0->getOperand(0);
  if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
    if (!N00.getNode()->hasOneUse())
      return SDValue();
    ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
    if (!N001C || N001C->getZExtValue() != 0xFF)
      return SDValue();
    N00 = N00.getOperand(0);
    LookPassAnd0 = true;
  }

  SDValue N10 = N1->getOperand(0);
  if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
    if (!N10.getNode()->hasOneUse())
      return SDValue();
    ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
    // Also allow 0xFFFF since the bits will be shifted out. This is needed
    // for X86.
    if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
                   N101C->getZExtValue() != 0xFFFF))
      return SDValue();
    N10 = N10.getOperand(0);
    LookPassAnd1 = true;
  }

  // Both halves must be shifting the same source value.
  if (N00 != N10)
    return SDValue();

  // Make sure everything beyond the low halfword gets set to zero since the SRL
  // 16 will clear the top bits.
  unsigned OpSizeInBits = VT.getSizeInBits();
  if (DemandHighBits && OpSizeInBits > 16) {
    // If the left-shift isn't masked out then the only way this is a bswap is
    // if all bits beyond the low 8 are 0. In that case the entire pattern
    // reduces to a left shift anyway: leave it for other parts of the combiner.
    if (!LookPassAnd0)
      return SDValue();

    // However, if the right shift isn't masked out then it might be because
    // it's not needed. See if we can spot that too.
    if (!LookPassAnd1 &&
        !DAG.MaskedValueIsZero(
            N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
      return SDValue();
  }

  SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
  if (OpSizeInBits > 16) {
    SDLoc DL(N);
    Res = DAG.getNode(ISD::SRL, DL, VT, Res,
                      DAG.getConstant(OpSizeInBits - 16, DL,
                                      getShiftAmountTy(VT)));
  }
  return Res;
}

/// Return true if the specified node is an element that makes up a 32-bit
/// packed halfword byteswap.
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
/// ((x & 0x00ff0000) << 8) |
/// ((x & 0xff000000) >> 8)
/// On success, records the source node for the matched byte position in
/// \p Parts and returns true; a position may only be claimed once.
static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
  if (!N.getNode()->hasOneUse())
    return false;

  unsigned Opc = N.getOpcode();
  if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
    return false;

  SDValue N0 = N.getOperand(0);
  unsigned Opc0 = N0.getOpcode();
  if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
    return false;

  ConstantSDNode *N1C = nullptr;
  // SHL or SRL: look upstream for AND mask operand
  if (Opc == ISD::AND)
    N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
  else if (Opc0 == ISD::AND)
    N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!N1C)
    return false;

  // Translate the mask constant into the byte position it selects.
  unsigned MaskByteOffset;
  switch (N1C->getZExtValue()) {
  default:
    return false;
  case 0xFF:       MaskByteOffset = 0; break;
  case 0xFF00:     MaskByteOffset = 1; break;
  case 0xFFFF:
    // In case demanded bits didn't clear the bits that will be shifted out.
    // This is needed for X86.
    if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
      MaskByteOffset = 1;
      break;
    }
    return false;
  case 0xFF0000:   MaskByteOffset = 2; break;
  case 0xFF000000: MaskByteOffset = 3; break;
  }

  // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
  if (Opc == ISD::AND) {
    if (MaskByteOffset == 0 || MaskByteOffset == 2) {
      // (x >> 8) & 0xff
      // (x >> 8) & 0xff0000
      if (Opc0 != ISD::SRL)
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C || C->getZExtValue() != 8)
        return false;
    } else {
      // (x << 8) & 0xff00
      // (x << 8) & 0xff000000
      if (Opc0 != ISD::SHL)
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C || C->getZExtValue() != 8)
        return false;
    }
  } else if (Opc == ISD::SHL) {
    // (x & 0xff) << 8
    // (x & 0xff0000) << 8
    if (MaskByteOffset != 0 && MaskByteOffset != 2)
      return false;
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!C || C->getZExtValue() != 8)
      return false;
  } else { // Opc == ISD::SRL
    // (x & 0xff00) >> 8
    // (x & 0xff000000) >> 8
    if (MaskByteOffset != 1 && MaskByteOffset != 3)
      return false;
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!C || C->getZExtValue() != 8)
      return false;
  }

  // Reject if this byte position has already been claimed by another element.
  if (Parts[MaskByteOffset])
    return false;

  Parts[MaskByteOffset] = N0.getOperand(0).getNode();
  return true;
}

// Match 2 elements of a packed halfword bswap.
static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
  if (N.getOpcode() == ISD::OR)
    return isBSwapHWordElement(N.getOperand(0), Parts) &&
           isBSwapHWordElement(N.getOperand(1), Parts);

  // Also accept an already-formed (srl (bswap x), 16), which supplies the
  // low two byte positions at once.
  if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
    ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
    if (!C || C->getAPIntValue() != 16)
      return false;
    Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
    return true;
  }

  return false;
}

/// Match a 32-bit packed halfword bswap.
/// That is
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
/// ((x & 0x00ff0000) << 8) |
/// ((x & 0xff000000) >> 8)
/// => (rotl (bswap x), 16)
SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();
  if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
    return SDValue();

  // Look for either
  // (or (bswaphpair), (bswaphpair))
  // (or (or (bswaphpair), (and)), (and))
  // (or (or (and), (bswaphpair)), (and))
  // Parts[i] receives the node supplying byte i; all four entries must end up
  // pointing at the same source node for the match to succeed.
  SDNode *Parts[4] = {};

  if (isBSwapHWordPair(N0, Parts)) {
    // (or (or (and), (and)), (or (and), (and)))
    if (!isBSwapHWordPair(N1, Parts))
      return SDValue();
  } else if (N0.getOpcode() == ISD::OR) {
    // (or (or (or (and), (and)), (and)), (and))
    if (!isBSwapHWordElement(N1, Parts))
      return SDValue();
    SDValue N00 = N0.getOperand(0);
    SDValue N01 = N0.getOperand(1);
    if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
        !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
      return SDValue();
  } else
    return SDValue();

  // Make sure the parts are all coming from the same node.
  if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
    return SDValue();

  SDLoc DL(N);
  SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
                              SDValue(Parts[0], 0));

  // Result of the bswap should be rotated by 16. If it's not legal, then
  // do (x << 16) | (x >> 16).
  SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
    return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
  if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
    return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
  return DAG.getNode(ISD::OR, DL, VT,
                     DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
                     DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
}

/// This contains all DAGCombine rules which reduce two values combined by
/// an Or operation to a single value \see visitANDLike().
SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N1.getValueType();
  SDLoc DL(N);

  // fold (or x, undef) -> -1
  if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
    return DAG.getAllOnesConstant(DL, VT);

  if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
    return V;

  // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
      // Don't increase # computations.
      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
    // We can only do this xform if we know that bits from X that are set in C2
    // but not in C1 are already zero. Likewise for Y.
    if (const ConstantSDNode *N0O1C =
        getAsNonOpaqueConstant(N0.getOperand(1))) {
      if (const ConstantSDNode *N1O1C =
          getAsNonOpaqueConstant(N1.getOperand(1))) {
        // We can only do this xform if we know that bits from X that are set in
        // C2 but not in C1 are already zero. Likewise for Y.
5734 const APInt &LHSMask = N0O1C->getAPIntValue(); 5735 const APInt &RHSMask = N1O1C->getAPIntValue(); 5736 5737 if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) && 5738 DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { 5739 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, 5740 N0.getOperand(0), N1.getOperand(0)); 5741 return DAG.getNode(ISD::AND, DL, VT, X, 5742 DAG.getConstant(LHSMask | RHSMask, DL, VT)); 5743 } 5744 } 5745 } 5746 } 5747 5748 // (or (and X, M), (and X, N)) -> (and X, (or M, N)) 5749 if (N0.getOpcode() == ISD::AND && 5750 N1.getOpcode() == ISD::AND && 5751 N0.getOperand(0) == N1.getOperand(0) && 5752 // Don't increase # computations. 5753 (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { 5754 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, 5755 N0.getOperand(1), N1.getOperand(1)); 5756 return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); 5757 } 5758 5759 return SDValue(); 5760 } 5761 5762 /// OR combines for which the commuted variant will be tried as well. 
static SDValue visitORCommutative(
    SelectionDAG &DAG, SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::AND) {
    // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
    if (isBitwiseNot(N0.getOperand(1)) && N0.getOperand(1).getOperand(0) == N1)
      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1);

    // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
    if (isBitwiseNot(N0.getOperand(0)) && N0.getOperand(0).getOperand(0) == N1)
      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1);
  }

  return SDValue();
}

/// Main combine entry point for ISD::OR nodes. Applies constant folds,
/// bswap/rotate recognition, and load combining. Returns the replacement
/// value, SDValue(N, 0) when N was updated in place, or an empty SDValue.
SDValue DAGCombiner::visitOR(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N1.getValueType();

  // x | x --> x
  if (N0 == N1)
    return N0;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (or x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N0.getNode()))
      return N1;
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;

    // fold (or x, -1) -> -1, vector edition
    if (ISD::isBuildVectorAllOnes(N0.getNode()))
      // do not return N0, because undef node may exist in N0
      return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
    if (ISD::isBuildVectorAllOnes(N1.getNode()))
      // do not return N1, because undef node may exist in N1
      return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());

    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
    // Do this only if the resulting shuffle is legal.
    if (isa<ShuffleVectorSDNode>(N0) &&
        isa<ShuffleVectorSDNode>(N1) &&
        // Avoid folding a node with illegal type.
        TLI.isTypeLegal(VT)) {
      bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
      bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
      bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
      bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
      // Ensure both shuffles have a zero input.
      if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
        assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
        assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
        const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
        const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
        bool CanFold = true;
        int NumElts = VT.getVectorNumElements();
        // Combined mask built element-by-element below.
        SmallVector<int, 4> Mask(NumElts);

        for (int i = 0; i != NumElts; ++i) {
          int M0 = SV0->getMaskElt(i);
          int M1 = SV1->getMaskElt(i);

          // Determine if either index is pointing to a zero vector.
          bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
          bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));

          // If one element is zero and the otherside is undef, keep undef.
          // This also handles the case that both are undef.
          if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
            Mask[i] = -1;
            continue;
          }

          // Make sure only one of the elements is zero.
          if (M0Zero == M1Zero) {
            CanFold = false;
            break;
          }

          assert((M0 >= 0 || M1 >= 0) && "Undef index!");

          // We have a zero and non-zero element. If the non-zero came from
          // SV0 make the index a LHS index. If it came from SV1, make it
          // a RHS index. We need to mod by NumElts because we don't care
          // which operand it came from in the original shuffles.
          Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
        }

        if (CanFold) {
          SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
          SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);

          SDValue LegalShuffle =
              TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
                                          Mask, DAG);
          if (LegalShuffle)
            return LegalShuffle;
        }
      }
    }
  }

  // fold (or c1, c2) -> c1|c2
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);

  // fold (or x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  // fold (or x, -1) -> -1
  if (isAllOnesConstant(N1))
    return N1;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (or x, c) -> c iff (x & ~c) == 0
  if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
    return N1;

  if (SDValue Combined = visitORLike(N0, N1, N))
    return Combined;

  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
  if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
    return BSwap;
  if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
    return BSwap;

  // reassociate or
  if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
    return ROR;

  // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
  // iff (c1 & c2) != 0 or c1/c2 are undef.
  auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
    return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
  };
  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
      ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
    if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
                                                 {N1, N0.getOperand(1)})) {
      SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
      AddToWorklist(IOR.getNode());
      return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
    }
  }

  // Try the commutative folds with both operand orders.
  if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
    return Combined;
  if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
    return Combined;

  // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
  if (N0.getOpcode() == N1.getOpcode())
    if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
      return V;

  // See if this is some rotate idiom.
  if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
    return Rot;

  if (SDValue Load = MatchLoadCombine(N))
    return Load;

  // Simplify the operands using demanded-bits information.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // If OR can be rewritten into ADD, try combines based on ADD.
  if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
      DAG.haveNoCommonBitsSet(N0, N1))
    if (SDValue Combined = visitADDLike(N))
      return Combined;

  return SDValue();
}

/// Look through an AND with a build-vector/scalar constant mask: returns the
/// unmasked operand and stores the mask into \p Mask, or returns \p Op
/// unchanged (leaving \p Mask untouched) when no such mask is present.
static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
  if (Op.getOpcode() == ISD::AND &&
      DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
    Mask = Op.getOperand(1);
    return Op.getOperand(0);
  }
  return Op;
}

/// Match "(X shl/srl V1) & V2" where V2 may not be present.
/// If \p Op (after stripping a constant AND mask into \p Mask) is a SHL or
/// SRL, set \p Shift to it and return true; otherwise return false.
static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
                            SDValue &Mask) {
  Op = stripConstantMask(DAG, Op, Mask);
  if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
    Shift = Op;
    return true;
  }
  return false;
}

/// Helper function for visitOR to extract the needed side of a rotate idiom
/// from a shl/srl/mul/udiv. This is meant to handle cases where
/// InstCombine merged some outside op with one of the shifts from
/// the rotate pattern.
/// \returns An empty \c SDValue if the needed shift couldn't be extracted.
/// Otherwise, returns an expansion of \p ExtractFrom based on the following
/// patterns:
///
/// (or (add v v) (shrl v bitwidth-1)):
///   expands (add v v) -> (shl v 1)
///
/// (or (mul v c0) (shrl (mul v c1) c2)):
///   expands (mul v c0) -> (shl (mul v c1) c3)
///
/// (or (udiv v c0) (shl (udiv v c1) c2)):
///   expands (udiv v c0) -> (shrl (udiv v c1) c3)
///
/// (or (shl v c0) (shrl (shl v c1) c2)):
///   expands (shl v c0) -> (shl (shl v c1) c3)
///
/// (or (shrl v c0) (shl (shrl v c1) c2)):
///   expands (shrl v c0) -> (shrl (shrl v c1) c3)
///
/// Such that in all cases, c3+c2==bitwidth(op v c1).
static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
                                     SDValue ExtractFrom, SDValue &Mask,
                                     const SDLoc &DL) {
  assert(OppShift && ExtractFrom && "Empty SDValue");
  assert(
      (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
      "Existing shift must be valid as a rotate half");

  ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);

  // Value and Type of the shift.
  SDValue OppShiftLHS = OppShift.getOperand(0);
  EVT ShiftedVT = OppShiftLHS.getValueType();

  // Amount of the existing shift.
  ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));

  // (add v v) -> (shl v 1)
  // The opposing srl must shift out all but the top bit for this to pair with
  // the implied shl-by-1.
  if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
      ExtractFrom.getOpcode() == ISD::ADD &&
      ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
      ExtractFrom.getOperand(0) == OppShiftLHS &&
      OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1)
    return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS,
                       DAG.getShiftAmountConstant(1, ShiftedVT, DL));

  // Preconditions:
  //    (or (op0 v c0) (shiftl/r (op0 v c1) c2))
  //
  // Find opcode of the needed shift to be extracted from (op0 v c0).
  unsigned Opcode = ISD::DELETED_NODE;
  bool IsMulOrDiv = false;
  // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
  // opcode or its arithmetic (mul or udiv) variant.
  auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
    IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
    if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
      return false;
    Opcode = NeededShift;
    return true;
  };
  // op0 must be either the needed shift opcode or the mul/udiv equivalent
  // that the needed shift can be extracted from.
  if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
      (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
    return SDValue();

  // op0 must be the same opcode on both sides, have the same LHS argument,
  // and produce the same value type.
  if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
      OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
      ShiftedVT != ExtractFrom.getValueType())
    return SDValue();

  // Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
  ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
  // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
  ConstantSDNode *ExtractFromCst =
      isConstOrConstSplat(ExtractFrom.getOperand(1));
  // TODO: We should be able to handle non-uniform constant vectors for these values.
  // Check that we have constant values. Zero constants are rejected as well,
  // since a zero mul/udiv/shift amount never forms a rotate half.
  if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
      !OppLHSCst || !OppLHSCst->getAPIntValue() ||
      !ExtractFromCst || !ExtractFromCst->getAPIntValue())
    return SDValue();

  // Compute the shift amount we need to extract to complete the rotate.
  const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
  if (OppShiftCst->getAPIntValue().ugt(VTWidth))
    return SDValue();
  APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
  // Normalize the bitwidth of the two mul/udiv/shift constant operands.
  APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
  APInt OppLHSAmt = OppLHSCst->getAPIntValue();
  zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);

  // Now try extract the needed shift from the ExtractFrom op and see if the
  // result matches up with the existing shift's LHS op.
  if (IsMulOrDiv) {
    // Op to extract from is a mul or udiv by a constant.
    // Check:
    //     c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
    //     c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
    const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
                                                 NeededShiftAmt.getZExtValue());
    APInt ResultAmt;
    APInt Rem;
    APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
    if (Rem != 0 || ResultAmt != OppLHSAmt)
      return SDValue();
  } else {
    // Op to extract from is a shift by a constant.
    // Check:
    //      c2 - (bitwidth(op0 v c0) - c1) == c0
    if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
                                          ExtractFromAmt.getBitWidth()))
      return SDValue();
  }

  // Return the expanded shift op that should allow a rotate to be formed.
  EVT ShiftVT = OppShift.getOperand(1).getValueType();
  EVT ResVT = ExtractFrom.getValueType();
  SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
  return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
}

// Return true if we can prove that, whenever Neg and Pos are both in the
// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
// for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
//
// (or (shift1 X, Neg), (shift2 X, Pos))
//
// reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
// in direction shift1 by Neg. The range [0, EltSize) means that we only need
// to consider shift amounts with defined behavior.
static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
                           SelectionDAG &DAG) {
  // If EltSize is a power of 2 then:
  //
  //  (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
  //  (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
  //
  // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
  // for the stronger condition:
  //
  //     Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1)    [A]
  //
  // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
  // we can just replace Neg with Neg' for the rest of the function.
  //
  // In other cases we check for the even stronger condition:
  //
  //     Neg == EltSize - Pos                                    [B]
  //
  // for all Neg and Pos. Note that the (or ...) then invokes undefined
  // behavior if Pos == 0 (and consequently Neg == EltSize).
  //
  // We could actually use [A] whenever EltSize is a power of 2, but the
  // only extra cases that it would match are those uninteresting ones
  // where Neg and Pos are never in range at the same time. E.g. for
  // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
  // as well as (sub 32, Pos), but:
  //
  //     (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
  //
  // always invokes undefined behavior for 32-bit X.
  //
  // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
  unsigned MaskLoBits = 0;
  if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
    if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
      KnownBits Known = DAG.computeKnownBits(Neg.getOperand(0));
      unsigned Bits = Log2_64(EltSize);
      // The AND mask must cover at least the low log2(EltSize) bits of
      // whatever Neg' can actually produce for the mask to be droppable.
      if (NegC->getAPIntValue().getActiveBits() <= Bits &&
          ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) {
        Neg = Neg.getOperand(0);
        MaskLoBits = Bits;
      }
    }
  }

  // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
  if (Neg.getOpcode() != ISD::SUB)
    return false;
  ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
  if (!NegC)
    return false;
  SDValue NegOp1 = Neg.getOperand(1);

  // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
  // Pos'. The truncation is redundant for the purpose of the equality.
  if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) {
      KnownBits Known = DAG.computeKnownBits(Pos.getOperand(0));
      if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits &&
          ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >=
           MaskLoBits))
        Pos = Pos.getOperand(0);
    }
  }

  // The condition we need is now:
  //
  //     (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
  //
  // If NegOp1 == Pos then we need:
  //
  //     EltSize & Mask == NegC & Mask
  //
  // (because "x & Mask" is a truncation and distributes through subtraction).
  APInt Width;
  if (Pos == NegOp1)
    Width = NegC->getAPIntValue();

  // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
  // Then the condition we want to prove becomes:
  //
  //     (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
  //
  // which, again because "x & Mask" is a truncation, becomes:
  //
  //     NegC & Mask == (EltSize - PosC) & Mask
  //     EltSize & Mask == (NegC + PosC) & Mask
  else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
      Width = PosC->getAPIntValue() + NegC->getAPIntValue();
    else
      return false;
  } else
    return false;

  // Now we just need to check that EltSize & Mask == Width & Mask.
  if (MaskLoBits)
    // EltSize & Mask is 0 since Mask is EltSize - 1.
    return Width.getLoBits(MaskLoBits) == 0;
  return Width == EltSize;
}

// A subroutine of MatchRotate used once we have found an OR of two opposite
// shifts of Shifted. If Neg == <operand size> - Pos then the OR reduces
// to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
// former being preferred if supported.
// InnerPos and InnerNeg are Pos and Neg with outer conversions stripped away.
SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
                                       SDValue Neg, SDValue InnerPos,
                                       SDValue InnerNeg, unsigned PosOpcode,
                                       unsigned NegOpcode, const SDLoc &DL) {
  // fold (or (shl x, (*ext y)),
  //          (srl x, (*ext (sub 32, y)))) ->
  //   (rotl x, y) or (rotr x, (sub 32, y))
  //
  // fold (or (shl x, (*ext (sub 32, y))),
  //          (srl x, (*ext y))) ->
  //   (rotr x, y) or (rotl x, (sub 32, y))
  EVT VT = Shifted.getValueType();
  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
    // Prefer the "positive" direction rotate when the target supports it.
    bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
    return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
                       HasPos ? Pos : Neg);
  }

  return SDValue();
}

// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions,
// generate a rot[lr]. Returns the rotate node (possibly wrapped in AND /
// TRUNCATE) or an empty SDValue if no rotate idiom was matched.
SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
  // Must be a legal type. Expanded 'n promoted things won't work with rotates.
  EVT VT = LHS.getValueType();
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // The target must have at least one rotate flavor.
  bool HasROTL = hasOperation(ISD::ROTL, VT);
  bool HasROTR = hasOperation(ISD::ROTR, VT);
  if (!HasROTL && !HasROTR)
    return SDValue();

  // Check for truncated rotate: recurse on the wide sources and truncate the
  // resulting rotate back down.
  if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
      LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
    assert(LHS.getValueType() == RHS.getValueType());
    if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
      return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
    }
  }

  // Match "(X shl/srl V1) & V2" where V2 may not be present.
  SDValue LHSShift;   // The shift.
  SDValue LHSMask;    // AND value if any.
  matchRotateHalf(DAG, LHS, LHSShift, LHSMask);

  SDValue RHSShift;   // The shift.
  SDValue RHSMask;    // AND value if any.
  matchRotateHalf(DAG, RHS, RHSShift, RHSMask);

  // If neither side matched a rotate half, bail
  if (!LHSShift && !RHSShift)
    return SDValue();

  // InstCombine may have combined a constant shl, srl, mul, or udiv with one
  // side of the rotate, so try to handle that here. In all cases we need to
  // pass the matched shift from the opposite side to compute the opcode and
  // needed shift amount to extract. We still want to do this if both sides
  // matched a rotate half because one half may be a potential overshift that
  // can be broken down (ie if InstCombine merged two shl or srl ops into a
  // single one).

  // Have LHS side of the rotate, try to extract the needed shift from the RHS.
  if (LHSShift)
    if (SDValue NewRHSShift =
            extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
      RHSShift = NewRHSShift;
  // Have RHS side of the rotate, try to extract the needed shift from the LHS.
  if (RHSShift)
    if (SDValue NewLHSShift =
            extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
      LHSShift = NewLHSShift;

  // If a side is still missing, nothing else we can do.
  if (!RHSShift || !LHSShift)
    return SDValue();

  // At this point we've matched or extracted a shift op on each side.

  if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
    return SDValue(); // Not shifting the same value.

  if (LHSShift.getOpcode() == RHSShift.getOpcode())
    return SDValue(); // Shifts must disagree.

  // Canonicalize shl to left side in a shl/srl pair.
  if (RHSShift.getOpcode() == ISD::SHL) {
    std::swap(LHS, RHS);
    std::swap(LHSShift, RHSShift);
    std::swap(LHSMask, RHSMask);
  }

  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue LHSShiftArg = LHSShift.getOperand(0);
  SDValue LHSShiftAmt = LHSShift.getOperand(1);
  SDValue RHSShiftArg = RHSShift.getOperand(0);
  SDValue RHSShiftAmt = RHSShift.getOperand(1);

  // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
  // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
  auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
                                        ConstantSDNode *RHS) {
    return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
  };
  if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
    SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
                              LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);

    // If there is an AND of either shifted operand, apply it to the result.
    // Each original AND only constrains the bits contributed by its own
    // shift, so OR the mask with the bits supplied by the opposite shift
    // before applying it to the rotate.
    if (LHSMask.getNode() || RHSMask.getNode()) {
      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
      SDValue Mask = AllOnes;

      if (LHSMask.getNode()) {
        SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
        Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
                           DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
      }
      if (RHSMask.getNode()) {
        SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
        Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
                           DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
      }

      Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
    }

    return Rot;
  }

  // If there is a mask here, and we have a variable shift, we can't be sure
  // that we're masking out the right stuff.
  if (LHSMask.getNode() || RHSMask.getNode())
    return SDValue();

  // If the shift amount is sign/zext/any-extended just peel it off.
  SDValue LExtOp0 = LHSShiftAmt;
  SDValue RExtOp0 = RHSShiftAmt;
  if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
       LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
       LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
       LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
      (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
       RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
       RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
       RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
    LExtOp0 = LHSShiftAmt.getOperand(0);
    RExtOp0 = RHSShiftAmt.getOperand(0);
  }

  // Try both directions: shl amount as the "positive" rotate amount first,
  // then the srl amount.
  SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
                                   LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
  if (TryL)
    return TryL;

  SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
                                   RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
  if (TryR)
    return TryR;

  return SDValue();
}

namespace {

/// Represents known origin of an individual byte in load combine pattern.
The 6394 /// value of the byte is either constant zero or comes from memory. 6395 struct ByteProvider { 6396 // For constant zero providers Load is set to nullptr. For memory providers 6397 // Load represents the node which loads the byte from memory. 6398 // ByteOffset is the offset of the byte in the value produced by the load. 6399 LoadSDNode *Load = nullptr; 6400 unsigned ByteOffset = 0; 6401 6402 ByteProvider() = default; 6403 6404 static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { 6405 return ByteProvider(Load, ByteOffset); 6406 } 6407 6408 static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } 6409 6410 bool isConstantZero() const { return !Load; } 6411 bool isMemory() const { return Load; } 6412 6413 bool operator==(const ByteProvider &Other) const { 6414 return Other.Load == Load && Other.ByteOffset == ByteOffset; 6415 } 6416 6417 private: 6418 ByteProvider(LoadSDNode *Load, unsigned ByteOffset) 6419 : Load(Load), ByteOffset(ByteOffset) {} 6420 }; 6421 6422 } // end anonymous namespace 6423 6424 /// Recursively traverses the expression calculating the origin of the requested 6425 /// byte of the given value. Returns None if the provider can't be calculated. 6426 /// 6427 /// For all the values except the root of the expression verifies that the value 6428 /// has exactly one use and if it's not true return None. This way if the origin 6429 /// of the byte is returned it's guaranteed that the values which contribute to 6430 /// the byte are not used outside of this expression. 6431 /// 6432 /// Because the parts of the expression are not allowed to have more than one 6433 /// use this function iterates over trees, not DAGs. So it never visits the same 6434 /// node more than once. 
static const Optional<ByteProvider>
calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
                      bool Root = false) {
  // Typical i64 by i8 pattern requires recursion up to 8 calls depth
  if (Depth == 10)
    return None;

  // Only the root may have multiple uses; see the function comment.
  if (!Root && !Op.hasOneUse())
    return None;

  assert(Op.getValueType().isScalarInteger() && "can't handle other types");
  unsigned BitWidth = Op.getValueSizeInBits();
  if (BitWidth % 8 != 0)
    return None;
  unsigned ByteWidth = BitWidth / 8;
  assert(Index < ByteWidth && "invalid index requested");
  (void) ByteWidth;

  switch (Op.getOpcode()) {
  case ISD::OR: {
    // A byte of an OR is known only if one side contributes zero to it.
    auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
    if (!LHS)
      return None;
    auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
    if (!RHS)
      return None;

    if (LHS->isConstantZero())
      return RHS;
    if (RHS->isConstantZero())
      return LHS;
    return None;
  }
  case ISD::SHL: {
    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return None;

    // Only whole-byte shifts can be modeled as byte movement.
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return None;
    uint64_t ByteShift = BitShift / 8;

    // Low ByteShift bytes are zero-filled; the rest come from the source.
    return Index < ByteShift
               ? ByteProvider::getConstantZero()
               : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
                                       Depth + 1);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    SDValue NarrowOp = Op->getOperand(0);
    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return None;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // Bytes beyond the narrow source are only known (zero) for ZERO_EXTEND.
    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
                 : None;
    return calculateByteProvider(NarrowOp, Index, Depth + 1);
  }
  case ISD::BSWAP:
    // BSWAP mirrors the byte index.
    return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
                                 Depth + 1);
  case ISD::LOAD: {
    auto L = cast<LoadSDNode>(Op.getNode());
    if (!L->isSimple() || L->isIndexed())
      return None;

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return None;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // Bytes past the loaded memory width are known zero only for ZEXTLOAD.
    if (Index >= NarrowByteWidth)
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
                 : None;
    return ByteProvider::getMemory(L, Index);
  }
  }

  return None;
}

/// Byte position of byte i within a BW-byte little-endian value.
static unsigned LittleEndianByteAt(unsigned BW, unsigned i) {
  return i;
}

/// Byte position of byte i within a BW-byte big-endian value.
static unsigned BigEndianByteAt(unsigned BW, unsigned i) {
  return BW - i - 1;
}

// Check if the bytes offsets we are looking at match with either big or
// little endian value loaded. Return true for big endian, false for little
// endian, and None if match failed.
static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
                                  int64_t FirstOffset) {
  // The endian can be decided only when it is 2 bytes at least.
  unsigned Width = ByteOffsets.size();
  if (Width < 2)
    return None;

  bool BigEndian = true, LittleEndian = true;
  for (unsigned i = 0; i < Width; i++) {
    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(Width, i);
    BigEndian &= CurrentByteOffset == BigEndianByteAt(Width, i);
    if (!BigEndian && !LittleEndian)
      return None;
  }

  assert((BigEndian != LittleEndian) && "It should be either big endian or"
                                        "little endian");
  return BigEndian;
}

/// Recursively peel off any TRUNCATE / *_EXTEND nodes and return the
/// underlying value.
static SDValue stripTruncAndExt(SDValue Value) {
  switch (Value.getOpcode()) {
  case ISD::TRUNCATE:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND:
    return stripTruncAndExt(Value.getOperand(0));
  }
  return Value;
}

/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 0) & 0xFF;
///  p[1] = (val >> 8) & 0xFF;
///  p[2] = (val >> 16) & 0xFF;
///  p[3] = (val >> 24) & 0xFF;
/// =>
///  *((i32)p) = val;
///
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 24) & 0xFF;
///  p[1] = (val >> 16) & 0xFF;
///  p[2] = (val >> 8) & 0xFF;
///  p[3] = (val >> 0) & 0xFF;
/// =>
///  *((i32)p) = BSWAP(val);
SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
  // Collect all the stores in the chain, walking up through each store's
  // chain operand.
  SDValue Chain;
  SmallVector<StoreSDNode *, 8> Stores;
  for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
    // TODO: Allow unordered atomics when wider type is legal (see D66309)
    if (Store->getMemoryVT() != MVT::i8 ||
        !Store->isSimple() || Store->isIndexed())
      return SDValue();
    Stores.push_back(Store);
    Chain = Store->getChain();
  }
  // Handle the simple type only.
  unsigned Width = Stores.size();
  EVT VT = EVT::getIntegerVT(
    *DAG.getContext(), Width * N->getMemoryVT().getSizeInBits());
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))
    return SDValue();

  // Check if all the bytes of the combined value we are looking at are stored
  // to the same base address. Collect bytes offsets from Base address into
  // ByteOffsets.
  SDValue CombinedValue;
  SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX);
  int64_t FirstOffset = INT64_MAX;
  StoreSDNode *FirstStore = nullptr;
  Optional<BaseIndexOffset> Base;
  for (auto Store : Stores) {
    // All the stores store different byte of the CombinedValue. A truncate is
    // required to get that byte value.
    SDValue Trunc = Store->getValue();
    if (Trunc.getOpcode() != ISD::TRUNCATE)
      return SDValue();
    // A shift operation is required to get the right byte offset, except the
    // first byte.
    int64_t Offset = 0;
    SDValue Value = Trunc.getOperand(0);
    if (Value.getOpcode() == ISD::SRL ||
        Value.getOpcode() == ISD::SRA) {
      ConstantSDNode *ShiftOffset =
        dyn_cast<ConstantSDNode>(Value.getOperand(1));
      // Trying to match the following pattern. The shift offset must be
      // a constant and a multiple of 8. It is the byte offset in "y".
      //
      // x = srl y, offset
      // i8 z = trunc x
      // store z, ...
      if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))
        return SDValue();

      Offset = ShiftOffset->getSExtValue() / 8;
      Value = Value.getOperand(0);
    }

    // Stores must share the same combined value with different offsets.
    if (!CombinedValue)
      CombinedValue = Value;
    else if (stripTruncAndExt(CombinedValue) != stripTruncAndExt(Value))
      return SDValue();

    // The trunc and all the extend operation should be stripped to get the
    // real value we are stored.
    else if (CombinedValue.getValueType() != VT) {
      // Prefer the widest value seen so far as the combined value.
      if (Value.getValueType() == VT ||
          Value.getValueSizeInBits() > CombinedValue.getValueSizeInBits())
        CombinedValue = Value;
      // Give up if the combined value type is smaller than the store size.
      if (CombinedValue.getValueSizeInBits() < VT.getSizeInBits())
        return SDValue();
    }

    // Stores must share the same base address
    BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
    int64_t ByteOffsetFromBase = 0;
    if (!Base)
      Base = Ptr;
    else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
      return SDValue();

    // Remember the first byte store
    if (ByteOffsetFromBase < FirstOffset) {
      FirstStore = Store;
      FirstOffset = ByteOffsetFromBase;
    }
    // Map the offset in the store and the offset in the combined value, and
    // early return if it has been set before.
    if (Offset < 0 || Offset >= Width || ByteOffsets[Offset] != INT64_MAX)
      return SDValue();
    ByteOffsets[Offset] = ByteOffsetFromBase;
  }

  assert(FirstOffset != INT64_MAX && "First byte offset must be set");
  assert(FirstStore && "First store must be set");

  // Check if the bytes of the combined value we are looking at match with
  // either big or little endian value store.
  Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);
  if (!IsBigEndian.hasValue())
    return SDValue();

  // The node we are looking at matches with the pattern, check if we can
  // replace it with a single bswap if needed and store.

  // If the store needs byte swap check if the target supports it
  bool NeedsBswap = DAG.getDataLayout().isBigEndian() != *IsBigEndian;

  // Before legalize we can introduce illegal bswaps which will be later
  // converted to an explicit bswap sequence. This way we end up with a single
  // store and byte shuffling instead of several stores and byte shuffling.
  if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
    return SDValue();

  // Check that a store of the wide type is both allowed and fast on the target
  bool Fast = false;
  bool Allowed =
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             *FirstStore->getMemOperand(), &Fast);
  if (!Allowed || !Fast)
    return SDValue();

  // Truncate the combined value down to the store width if it is wider.
  if (VT != CombinedValue.getValueType()) {
    assert(CombinedValue.getValueType().getSizeInBits() > VT.getSizeInBits() &&
           "Get unexpected store value to combine");
    CombinedValue = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
                                CombinedValue);
  }

  if (NeedsBswap)
    CombinedValue = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, CombinedValue);

  SDValue NewStore =
    DAG.getStore(Chain, SDLoc(N), CombinedValue, FirstStore->getBasePtr(),
                 FirstStore->getPointerInfo(), FirstStore->getAlignment());

  // Rely on other DAG combine rules to remove the other individual stores.
  DAG.ReplaceAllUsesWith(N, NewStore.getNode());
  return NewStore;
}

/// Match a pattern where a wide type scalar value is loaded by several narrow
/// loads and combined by shifts and ors. Fold it into a single load or a load
/// and a BSWAP if the target supports it.
6732 /// 6733 /// Assuming little endian target: 6734 /// i8 *a = ... 6735 /// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) 6736 /// => 6737 /// i32 val = *((i32)a) 6738 /// 6739 /// i8 *a = ... 6740 /// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] 6741 /// => 6742 /// i32 val = BSWAP(*((i32)a)) 6743 /// 6744 /// TODO: This rule matches complex patterns with OR node roots and doesn't 6745 /// interact well with the worklist mechanism. When a part of the pattern is 6746 /// updated (e.g. one of the loads) its direct users are put into the worklist, 6747 /// but the root node of the pattern which triggers the load combine is not 6748 /// necessarily a direct user of the changed node. For example, once the address 6749 /// of t28 load is reassociated load combine won't be triggered: 6750 /// t25: i32 = add t4, Constant:i32<2> 6751 /// t26: i64 = sign_extend t25 6752 /// t27: i64 = add t2, t26 6753 /// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64 6754 /// t29: i32 = zero_extend t28 6755 /// t32: i32 = shl t29, Constant:i8<8> 6756 /// t33: i32 = or t23, t32 6757 /// As a possible fix visitLoad can check if the load can be a part of a load 6758 /// combine pattern and add corresponding OR roots to the worklist. 
SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
  assert(N->getOpcode() == ISD::OR &&
         "Can only match load combining against OR nodes");

  // Handles simple types only
  EVT VT = N->getValueType(0);
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();
  unsigned ByteWidth = VT.getSizeInBits() / 8;

  bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
  // Translate a ByteProvider's byte index within its load into the in-memory
  // byte offset, taking target endianness into account.
  auto MemoryByteOffset = [&] (ByteProvider P) {
    assert(P.isMemory() && "Must be a memory byte provider");
    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
    assert(LoadBitWidth % 8 == 0 &&
           "can only analyze providers for individual bytes not bit");
    unsigned LoadByteWidth = LoadBitWidth / 8;
    return IsBigEndianTarget
            ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
            : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
  };

  Optional<BaseIndexOffset> Base;
  SDValue Chain;

  SmallPtrSet<LoadSDNode *, 8> Loads;
  Optional<ByteProvider> FirstByteProvider;
  int64_t FirstOffset = INT64_MAX;

  // Check if all the bytes of the OR we are looking at are loaded from the same
  // base address. Collect bytes offsets from Base address in ByteOffsets.
  // Iterate from the most significant byte down so that a run of constant-zero
  // bytes can only appear at the top (the zero-extended case).
  SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
  unsigned ZeroExtendedBytes = 0;
  for (int i = ByteWidth - 1; i >= 0; --i) {
    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
    if (!P)
      return SDValue();

    if (P->isConstantZero()) {
      // It's OK for the N most significant bytes to be 0, we can just
      // zero-extend the load.
      if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i)))
        return SDValue();
      continue;
    }
    assert(P->isMemory() && "provenance should either be memory or zero");

    LoadSDNode *L = P->Load;
    assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
           !L->isIndexed() &&
           "Must be enforced by calculateByteProvider");
    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");

    // All loads must share the same chain
    SDValue LChain = L->getChain();
    if (!Chain)
      Chain = LChain;
    else if (Chain != LChain)
      return SDValue();

    // Loads must share the same base address
    BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
    int64_t ByteOffsetFromBase = 0;
    if (!Base)
      Base = Ptr;
    else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
      return SDValue();

    // Calculate the offset of the current byte from the base address
    ByteOffsetFromBase += MemoryByteOffset(*P);
    ByteOffsets[i] = ByteOffsetFromBase;

    // Remember the first byte load
    if (ByteOffsetFromBase < FirstOffset) {
      FirstByteProvider = P;
      FirstOffset = ByteOffsetFromBase;
    }

    Loads.insert(L);
  }
  assert(!Loads.empty() && "All the bytes of the value must be loaded from "
         "memory, so there must be at least one load which produces the value");
  assert(Base && "Base address of the accessed memory location must be set");
  assert(FirstOffset != INT64_MAX && "First byte offset must be set");

  bool NeedsZext = ZeroExtendedBytes > 0;

  EVT MemVT =
      EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8);

  if (!MemVT.isSimple())
    return SDValue();

  // Before legalize we can introduce too wide illegal loads which will be later
  // split into legal sized loads. This enables us to combine i64 load by i8
  // patterns to a couple of i32 loads on 32 bit targets.
  if (LegalOperations &&
      !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
                            MemVT))
    return SDValue();

  // Check if the bytes of the OR we are looking at match with either big or
  // little endian value load. Zero-extended (top) bytes are excluded from the
  // endianness check.
  Optional<bool> IsBigEndian = isBigEndian(
      makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
  if (!IsBigEndian.hasValue())
    return SDValue();

  assert(FirstByteProvider && "must be set");

  // Ensure that the first byte is loaded from zero offset of the first load.
  // So the combined value can be loaded from the first load address.
  if (MemoryByteOffset(*FirstByteProvider) != 0)
    return SDValue();
  LoadSDNode *FirstLoad = FirstByteProvider->Load;

  // The node we are looking at matches with the pattern, check if we can
  // replace it with a single (possibly zero-extended) load and bswap + shift if
  // needed.

  // If the load needs byte swap check if the target supports it
  bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;

  // Before legalize we can introduce illegal bswaps which will be later
  // converted to an explicit bswap sequence. This way we end up with a single
  // load and byte shuffling instead of several loads and byte shuffling.
  // We do not introduce illegal bswaps when zero-extending as this tends to
  // introduce too many arithmetic instructions.
  if (NeedsBswap && (LegalOperations || NeedsZext) &&
      !TLI.isOperationLegal(ISD::BSWAP, VT))
    return SDValue();

  // If we need to bswap and zero extend, we have to insert a shift. Check that
  // it is legal.
  if (NeedsBswap && NeedsZext && LegalOperations &&
      !TLI.isOperationLegal(ISD::SHL, VT))
    return SDValue();

  // Check that a load of the wide type is both allowed and fast on the target
  bool Fast = false;
  bool Allowed =
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                             *FirstLoad->getMemOperand(), &Fast);
  if (!Allowed || !Fast)
    return SDValue();

  SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
                                   SDLoc(N), VT, Chain, FirstLoad->getBasePtr(),
                                   FirstLoad->getPointerInfo(), MemVT,
                                   FirstLoad->getAlignment());

  // Transfer chain users from old loads to the new load.
  for (LoadSDNode *L : Loads)
    DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));

  if (!NeedsBswap)
    return NewLoad;

  // When zero-extending, shift the value up before the bswap so the loaded
  // bytes end up in the correct (most significant) positions afterwards.
  SDValue ShiftedLoad =
      NeedsZext
          ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
                        DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
                                                   SDLoc(N), LegalOperations))
          : NewLoad;
  return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
}

// If the target has andn, bsl, or a similar bit-select instruction,
// we want to unfold masked merge, with canonical pattern of:
//   |        A        |  |B|
//   ((x ^ y) & m) ^ y
//    |  D  |
// Into:
//   (x & m) | (y & ~m)
// If y is a constant, and the 'andn' does not work with immediates,
// we unfold into a different pattern:
//   ~(~x & m) & (m | y)
// NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
//       the very least that breaks andnpd / andnps patterns, and because those
//       patterns are simplified in IR and shouldn't be created in the DAG
SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
  assert(N->getOpcode() == ISD::XOR);

  // Don't touch 'not' (i.e. where y = -1).
  if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
    return SDValue();

  EVT VT = N->getValueType(0);

  // There are 3 commutable operators in the pattern,
  // so we have to deal with 8 possible variants of the basic pattern.
  SDValue X, Y, M;
  // Try to match `And` as ((x ^ Other) & m), where the xor sits at operand
  // XorIdx of the AND. On success, bind X (the non-Other xor operand),
  // Y (= Other) and M (the AND mask operand).
  auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
    if (And.getOpcode() != ISD::AND || !And.hasOneUse())
      return false;
    SDValue Xor = And.getOperand(XorIdx);
    if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
      return false;
    SDValue Xor0 = Xor.getOperand(0);
    SDValue Xor1 = Xor.getOperand(1);
    // Don't touch 'not' (i.e. where y = -1).
    if (isAllOnesOrAllOnesSplat(Xor1))
      return false;
    if (Other == Xor0)
      std::swap(Xor0, Xor1);
    if (Other != Xor1)
      return false;
    X = Xor0;
    Y = Xor1;
    M = And.getOperand(XorIdx ? 0 : 1);
    return true;
  };

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
      !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
    return SDValue();

  // Don't do anything if the mask is constant. This should not be reachable.
  // InstCombine should have already unfolded this pattern, and DAGCombiner
  // probably shouldn't produce it, too.
  if (isa<ConstantSDNode>(M.getNode()))
    return SDValue();

  // We can transform if the target has AndNot
  if (!TLI.hasAndNot(M))
    return SDValue();

  SDLoc DL(N);

  // If Y is a constant, check that 'andn' works with immediates.
  if (!TLI.hasAndNot(Y)) {
    assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
    // If not, we need to do a bit more work to make sure andn is still used.
    // Emit the alternative form ~(~x & m) & (m | y), which keeps the andn
    // operating on the variable X rather than the immediate Y.
    SDValue NotX = DAG.getNOT(DL, X, VT);
    SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
    SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
    SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
    return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
  }

  // Canonical unfolding: (x & m) | (y & ~m).
  SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
  SDValue NotM = DAG.getNOT(DL, M, VT);
  SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);

  return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
}

SDValue DAGCombiner::visitXOR(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (xor x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N0.getNode()))
      return N1;
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
  }

  // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
  SDLoc DL(N);
  if (N0.isUndef() && N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (xor x, undef) -> undef
  if (N0.isUndef())
    return N0;
  if (N1.isUndef())
    return N1;

  // fold (xor c1, c2) -> c1^c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::XOR, DL, VT, N1, N0);

  // fold (xor x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // reassociate xor
  if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
    return RXOR;

  // fold !(x cc y) -> (x !cc y)
  unsigned N0Opcode = N0.getOpcode();
  SDValue LHS, RHS, CC;
  if (TLI.isConstTrueVal(N1.getNode()) &&
      isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/true)) {
    ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                               LHS.getValueType());
    if (!LegalOperations ||
        TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
      switch (N0Opcode) {
      default:
        llvm_unreachable("Unhandled SetCC Equivalent!");
      case ISD::SETCC:
        return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
      case ISD::SELECT_CC:
        return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
                               N0.getOperand(3), NotCC);
      case ISD::STRICT_FSETCC:
      case ISD::STRICT_FSETCCS: {
        if (N0.hasOneUse()) {
          // FIXME Can we handle multiple uses? Could we token factor the chain
          // results from the new/old setcc?
          // Strict FP compares also produce a chain result, so the old node's
          // chain users must be redirected to the replacement setcc.
          SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
                                       N0.getOperand(0),
                                       N0Opcode == ISD::STRICT_FSETCCS);
          CombineTo(N, SetCC);
          DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
          recursivelyDeleteUnusedNodes(N0.getNode());
          return SDValue(N, 0); // Return N so it doesn't get rechecked!
        }
        break;
      }
      }
    }
  }

  // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
  if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
      isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)) {
    SDValue V = N0.getOperand(0);
    SDLoc DL0(N0);
    V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
                    DAG.getConstant(1, DL0, V.getValueType()));
    AddToWorklist(V.getNode());
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
  }

  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
  if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
      (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
    SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
    if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
      // De Morgan: swap AND<->OR and complement both operands.
      unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
      N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
      N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
      AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
      return DAG.getNode(NewOpcode, DL, VT, N00, N01);
    }
  }
  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
  if (isAllOnesConstant(N1) && N0.hasOneUse() &&
      (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
    SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
    if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
      unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
      N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
      N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
      AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
      return DAG.getNode(NewOpcode, DL, VT, N00, N01);
    }
  }

  // fold (not (neg x)) -> (add X, -1)
  // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
  // Y is a constant or the subtract has a single use.
  if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
      isNullConstant(N0.getOperand(0))) {
    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
                       DAG.getAllOnesConstant(DL, VT));
  }

  // fold (not (add X, -1)) -> (neg X)
  if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD &&
      isAllOnesOrAllOnesSplat(N0.getOperand(1))) {
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                       N0.getOperand(0));
  }

  // fold (xor (and x, y), y) -> (and (not x), y)
  if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
    SDValue X = N0.getOperand(0);
    SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
    AddToWorklist(NotX.getNode());
    return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
  }

  if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) {
    ConstantSDNode *XorC = isConstOrConstSplat(N1);
    ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1));
    unsigned BitWidth = VT.getScalarSizeInBits();
    if (XorC && ShiftC) {
      // Don't crash on an oversized shift. We can not guarantee that a bogus
      // shift has been simplified to undef.
      uint64_t ShiftAmt = ShiftC->getLimitedValue();
      if (ShiftAmt < BitWidth) {
        APInt Ones = APInt::getAllOnesValue(BitWidth);
        Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt);
        if (XorC->getAPIntValue() == Ones) {
          // If the xor constant is a shifted -1, do a 'not' before the shift:
          // xor (X << ShiftC), XorC --> (not X) << ShiftC
          // xor (X >> ShiftC), XorC --> (not X) >> ShiftC
          SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
          return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1));
        }
      }
    }
  }

  // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
  if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
    // The add/sra pair can appear as either operand of the xor; probe both.
    SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
    SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
    if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
      SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
      SDValue S0 = S.getOperand(0);
      if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) {
        unsigned OpSizeInBits = VT.getScalarSizeInBits();
        if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
          if (C->getAPIntValue() == (OpSizeInBits - 1))
            return DAG.getNode(ISD::ABS, DL, VT, S0);
      }
    }
  }

  // fold (xor x, x) -> 0
  if (N0 == N1)
    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

  // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
  // Here is a concrete example of this equivalence:
  // i16   x ==  14
  // i16 shl ==   1 << 14  == 16384 == 0b0100000000000000
  // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
  //
  // =>
  //
  // i16     ~1      == 0b1111111111111110
  // i16 rol(~1, 14) == 0b1011111111111111
  //
  // Some additional tips to help conceptualize this transform:
  // - Try to see the operation as placing a single zero in a value of all ones.
  // - There exists no value for x which would allow the result to contain zero.
  // - Values of x larger than the bitwidth are undefined and do not require a
  //   consistent result.
  // - Pushing the zero left requires shifting one bits in from the right.
  //   A rotate left of ~1 is a nice way of achieving the desired result.
  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
      isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
    return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
                       N0.getOperand(1));
  }

  // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
  if (N0Opcode == N1.getOpcode())
    if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
      return V;

  // Unfold  ((x ^ y) & m) ^ y  into  (x & m) | (y & ~m)  if profitable
  if (SDValue MM = unfoldMaskedMerge(N))
    return MM;

  // Simplify the expression using non-local knowledge.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  return SDValue();
}

/// If we have a shift-by-constant of a bitwise logic op that itself has a
/// shift-by-constant operand with identical opcode, we may be able to convert
/// that into 2 independent shifts followed by the logic op. This is a
/// throughput improvement.
static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
  // Match a one-use bitwise logic op.
  SDValue LogicOp = Shift->getOperand(0);
  if (!LogicOp.hasOneUse())
    return SDValue();

  unsigned LogicOpcode = LogicOp.getOpcode();
  if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
      LogicOpcode != ISD::XOR)
    return SDValue();

  // Find a matching one-use shift by constant.
  unsigned ShiftOpcode = Shift->getOpcode();
  SDValue C1 = Shift->getOperand(1);
  ConstantSDNode *C1Node = isConstOrConstSplat(C1);
  assert(C1Node && "Expected a shift with constant operand");
  const APInt &C1Val = C1Node->getAPIntValue();
  // Returns true if V is (ShiftOpcode ShiftOp, C0) with a constant amount
  // compatible with C1; binds the shifted operand and the amount on success.
  auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
                             const APInt *&ShiftAmtVal) {
    if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
      return false;

    ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
    if (!ShiftCNode)
      return false;

    // Capture the shifted operand and shift amount value.
    ShiftOp = V.getOperand(0);
    ShiftAmtVal = &ShiftCNode->getAPIntValue();

    // Shift amount types do not have to match their operand type, so check that
    // the constants are the same width.
    if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
      return false;

    // The fold is not valid if the sum of the shift values exceeds bitwidth.
    if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
      return false;

    return true;
  };

  // Logic ops are commutative, so check each operand for a match.
  SDValue X, Y;
  const APInt *C0Val;
  if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
    Y = LogicOp.getOperand(1);
  else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
    Y = LogicOp.getOperand(0);
  else
    return SDValue();

  // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
  SDLoc DL(Shift);
  EVT VT = Shift->getValueType(0);
  EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
  SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
  SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
  SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
  return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
}

/// Handle transforms common to the three shifts, when the shift amount is a
/// constant.
/// We are looking for: (shift being one of shl/sra/srl)
///   shift (binop X, C0), C1
/// And want to transform into:
///   binop (shift X, C1), (shift C0, C1)
SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
  assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");

  // Do not turn a 'not' into a regular xor.
  if (isBitwiseNot(N->getOperand(0)))
    return SDValue();

  // The inner binop must be one-use, since we want to replace it.
  SDValue LHS = N->getOperand(0);
  if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
    return SDValue();

  // TODO: This is limited to early combining because it may reveal regressions
  //       otherwise. But since we just checked a target hook to see if this is
  //       desirable, that should have filtered out cases where this interferes
  //       with some other pattern matching.
  if (!LegalTypes)
    if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
      return R;

  // We want to pull some binops through shifts, so that we have (and (shift))
  // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
  // thing happens with address calculations, so it's important to canonicalize
  // it.
  switch (LHS.getOpcode()) {
  default:
    return SDValue();
  case ISD::OR:
  case ISD::XOR:
  case ISD::AND:
    break;
  case ISD::ADD:
    if (N->getOpcode() != ISD::SHL)
      return SDValue(); // only shl(add) not sr[al](add).
    break;
  }

  // We require the RHS of the binop to be a constant and not opaque as well.
  ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1));
  if (!BinOpCst)
    return SDValue();

  // FIXME: disable this unless the input to the binop is a shift by a constant
  // or is copy/select. Enable this in other cases when figure out it's exactly
  // profitable.
  SDValue BinOpLHSVal = LHS.getOperand(0);
  bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
                            BinOpLHSVal.getOpcode() == ISD::SRA ||
                            BinOpLHSVal.getOpcode() == ISD::SRL) &&
                           isa<ConstantSDNode>(BinOpLHSVal.getOperand(1));
  bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg ||
                        BinOpLHSVal.getOpcode() == ISD::SELECT;

  if (!IsShiftByConstant && !IsCopyOrSelect)
    return SDValue();

  if (IsCopyOrSelect && N->hasOneUse())
    return SDValue();

  // Fold the constants, shifting the binop RHS by the shift amount.
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  // Shift the binop's constant RHS by the outer shift amount; this must fold
  // to a constant since both inputs are constants.
  SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1),
                               N->getOperand(1));
  assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");

  SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
                                 N->getOperand(1));
  return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
}

SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
  assert(N->getOpcode() == ISD::TRUNCATE);
  assert(N->getOperand(0).getOpcode() == ISD::AND);

  // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
  EVT TruncVT = N->getValueType(0);
  if (N->hasOneUse() && N->getOperand(0).hasOneUse() &&
      TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) {
    SDValue N01 = N->getOperand(0).getOperand(1);
    if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) {
      SDLoc DL(N);
      SDValue N00 = N->getOperand(0).getOperand(0);
      SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00);
      SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01);
      AddToWorklist(Trunc00.getNode());
      AddToWorklist(Trunc01.getNode());
      return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01);
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitRotate(SDNode *N) {
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  unsigned Bitsize = VT.getScalarSizeInBits();

  // fold (rot x, 0) -> x
  if (isNullOrNullSplat(N1))
    return N0;

  // fold (rot x, c) -> x iff (c % BitSize) == 0
  if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
    // For power-of-2 bitsizes the modulo reduces to a mask of Bitsize-1.
    APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
    if (DAG.MaskedValueIsZero(N1, ModuloMask))
      return N0;
  }

  // fold (rot x, c) -> (rot x, c % BitSize)
  // TODO - support non-uniform vector amounts.
  if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) {
    if (Cst->getAPIntValue().uge(Bitsize)) {
      uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize);
      return DAG.getNode(N->getOpcode(), dl, VT, N0,
                         DAG.getConstant(RotAmt, dl, N1.getValueType()));
    }
  }

  // Simplify the operands using demanded-bits information.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
  }

  unsigned NextOp = N0.getOpcode();
  // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
  if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
    SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
    SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
    if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
      EVT ShiftVT = C1->getValueType(0);
      // Same direction (rotl+rotl / rotr+rotr) adds the amounts; opposite
      // directions subtract them.
      bool SameSide = (N->getOpcode() == NextOp);
      unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
      if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
              CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) {
        SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
        SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
            ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC});
        return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
                           CombinedShiftNorm);
      }
    }
  }
  return SDValue();
}

SDValue DAGCombiner::visitSHL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (SDValue V = DAG.simplifyShift(N0, N1))
    return V;

  EVT VT = N0.getValueType();
  EVT ShiftVT = N1.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
    // If setcc produces all-one true value then:
    // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
    if (N1CV && N1CV->isConstant()) {
      if (N0.getOpcode() == ISD::AND) {
        SDValue N00 = N0->getOperand(0);
        SDValue N01 = N0->getOperand(1);
        BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);

        if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
            TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
                TargetLowering::ZeroOrNegativeOneBooleanContent) {
          if (SDValue C =
                  DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
            return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
        }
      }
    }
  }

  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // fold (shl c1, c2) -> c1<<c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // if (shl x, c) is known to be zero, return 0
  if (DAG.MaskedValueIsZero(SDValue(N, 0),
                            APInt::getAllOnesValue(OpSizeInBits)))
    return DAG.getConstant(0, SDLoc(N), VT);

  // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
  }

  // TODO - support non-uniform vector shift amounts.
  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
  if (N0.getOpcode() == ISD::SHL) {
    // The constants are widened by one bit before summing so the addition
    // itself cannot overflow.
    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                          ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).uge(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
      return DAG.getConstant(0, SDLoc(N), VT);

    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                       ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).ult(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
      SDLoc DL(N);
      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
      return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
    }
  }

  // fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
  // For this to be valid, the second form must not preserve any of the bits
  // that are shifted out by the inner shift in the first form.  This means
  // the outer shift size must be >= the number of bits added by the ext.
  // As a corollary, we don't care what kind of ext it is.
  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
       N0.getOpcode() == ISD::ANY_EXTEND ||
       N0.getOpcode() == ISD::SIGN_EXTEND) &&
      N0.getOperand(0).getOpcode() == ISD::SHL) {
    SDValue N0Op0 = N0.getOperand(0);
    SDValue InnerShiftAmt = N0Op0.getOperand(1);
    EVT InnerVT = N0Op0.getValueType();
    uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits();

    // c2 must cover the bits added by the ext (OpSizeInBits - InnerBitwidth)
    // so that the inner shift's discarded bits stay discarded.
    auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
                                                         ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return c2.uge(OpSizeInBits - InnerBitwidth) &&
             (c1 + c2).uge(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true))
      return DAG.getConstant(0, SDLoc(N), VT);

    auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
                                                      ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return c2.uge(OpSizeInBits - InnerBitwidth) &&
             (c1 + c2).ult(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDLoc DL(N);
      SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
      SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
      Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
      return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum);
    }
  }

  // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
  // Only fold this if the inner zext has no other uses to avoid increasing
  // the total number of instructions.
  if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == ISD::SRL) {
    SDValue N0Op0 = N0.getOperand(0);
    SDValue InnerShiftAmt = N0Op0.getOperand(1);

    auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2);
      return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2);
    };
    if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDLoc DL(N);
      // The shift amount may need to be adapted to the narrower inner type.
      EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
      SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
      NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
      AddToWorklist(NewSHL.getNode());
      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
    }
  }

  // fold (shl (sr[la] exact X,  C1), C2) -> (shl    X, (C2-C1)) if C1 <= C2
  // fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C1-C2)) if C1 > C2
  // TODO - support non-uniform vector shift amounts.
  if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
      N0->getFlags().hasExact()) {
    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
      uint64_t C1 = N0C1->getZExtValue();
      uint64_t C2 = N1C->getZExtValue();
      SDLoc DL(N);
      if (C1 <= C2)
        return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
                           DAG.getConstant(C2 - C1, DL, ShiftVT));
      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
                         DAG.getConstant(C1 - C2, DL, ShiftVT));
    }
  }

  // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
  //                               (and (srl x, (sub c1, c2), MASK)
  // Only fold this if the inner shift has no other uses -- if it does, folding
  // this will increase the total number of instructions.
  // TODO - drop hasOneUse requirement if c1 == c2?
  // TODO - support non-uniform vector shift amounts.
  if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
      TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
      if (N0C1->getAPIntValue().ult(OpSizeInBits)) {
        uint64_t c1 = N0C1->getZExtValue();
        uint64_t c2 = N1C->getZExtValue();
        // Mask starts as the bits surviving the inner srl, then is moved to
        // match the direction of the single replacement shift.
        APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
        SDValue Shift;
        if (c2 > c1) {
          Mask <<= c2 - c1;
          SDLoc DL(N);
          Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
                              DAG.getConstant(c2 - c1, DL, ShiftVT));
        } else {
          Mask.lshrInPlace(c1 - c2);
          SDLoc DL(N);
          Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
                              DAG.getConstant(c1 - c2, DL, ShiftVT));
        }
        SDLoc DL(N0);
        return DAG.getNode(ISD::AND, DL, VT, Shift,
                           DAG.getConstant(Mask, DL, VT));
      }
    }
  }

  // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
  if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
      isConstantOrConstantVector(N1, /* No Opaques */ true)) {
    SDLoc DL(N);
    SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
    SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
  }

  // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Variant of version done on multiply, except mul by a power of 2 is turned
  // into a shift.
7684 if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) && 7685 N0.getNode()->hasOneUse() && 7686 isConstantOrConstantVector(N1, /* No Opaques */ true) && 7687 isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) && 7688 TLI.isDesirableToCommuteWithShift(N, Level)) { 7689 SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); 7690 SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); 7691 AddToWorklist(Shl0.getNode()); 7692 AddToWorklist(Shl1.getNode()); 7693 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1); 7694 } 7695 7696 // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) 7697 if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() && 7698 isConstantOrConstantVector(N1, /* No Opaques */ true) && 7699 isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { 7700 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); 7701 if (isConstantOrConstantVector(Shl)) 7702 return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl); 7703 } 7704 7705 if (N1C && !N1C->isOpaque()) 7706 if (SDValue NewSHL = visitShiftByConstant(N)) 7707 return NewSHL; 7708 7709 return SDValue(); 7710 } 7711 7712 SDValue DAGCombiner::visitSRA(SDNode *N) { 7713 SDValue N0 = N->getOperand(0); 7714 SDValue N1 = N->getOperand(1); 7715 if (SDValue V = DAG.simplifyShift(N0, N1)) 7716 return V; 7717 7718 EVT VT = N0.getValueType(); 7719 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 7720 7721 // Arithmetic shifting an all-sign-bit value is a no-op. 
  // fold (sra 0, x) -> 0
  // fold (sra -1, x) -> -1
  if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
    return N0;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // fold (sra c1, c2) -> c1 >>s c2 (constant fold)
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
  // sext_inreg.
  if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
    unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
    EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
    if (VT.isVector())
      ExtVT = EVT::getVectorVT(*DAG.getContext(),
                               ExtVT, VT.getVectorNumElements());
    if (!LegalOperations ||
        TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
        TargetLowering::Legal)
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                         N0.getOperand(0), DAG.getValueType(ExtVT));
  }

  // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
  // clamp (add c1, c2) to max shift.
  if (N0.getOpcode() == ISD::SRA) {
    SDLoc DL(N);
    EVT ShiftVT = N1.getValueType();
    EVT ShiftSVT = ShiftVT.getScalarType();
    SmallVector<SDValue, 16> ShiftValues;

    // Add each pair of shift amounts at an extended width so the sum cannot
    // overflow, clamping to OpSizeInBits - 1 (max meaningful sra amount).
    auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      APInt Sum = c1 + c2;
      unsigned ShiftSum =
          Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
      ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
      return true;
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
      SDValue ShiftValue;
      if (VT.isVector())
        ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
      else
        ShiftValue = ShiftValues[0];
      return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
    }
  }

  // fold (sra (shl X, m), (sub result_size, n))
  // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
  // result_size - n != m.
  // If truncate is free for the target sext(shl) is likely to result in better
  // code.
  if (N0.getOpcode() == ISD::SHL && N1C) {
    // Get the two constants of the shifts, CN0 = m, CN = n.
    const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
    if (N01C) {
      LLVMContext &Ctx = *DAG.getContext();
      // Determine what the truncate's result bitsize and type would be.
      EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());

      if (VT.isVector())
        TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());

      // Determine the residual right-shift amount.
      int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();

      // If the shift is not a no-op (in which case this should be just a sign
      // extend already), the truncated to type is legal, sign_extend is legal
      // on that type, and the truncate to that type is both legal and free,
      // perform the transform.
      if ((ShiftAmt > 0) &&
          TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
          TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
          TLI.isTruncateFree(VT, TruncVT)) {
        SDLoc DL(N);
        SDValue Amt = DAG.getConstant(ShiftAmt, DL,
            getShiftAmountTy(N0.getOperand(0).getValueType()));
        SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
                                    N0.getOperand(0), Amt);
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
                                    Shift);
        return DAG.getNode(ISD::SIGN_EXTEND, DL,
                           N->getValueType(0), Trunc);
      }
    }
  }

  // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
  // sra (add (shl X, N1C), AddC), N1C -->
  // sext (add (trunc X to (width - N1C)), AddC')
  if (!LegalTypes && N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&
      N0.getOperand(0).getOpcode() == ISD::SHL &&
      N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) {
    if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
      SDValue Shl = N0.getOperand(0);
      // Determine what the truncate's type would be and ask the target if that
      // is a free operation.
      LLVMContext &Ctx = *DAG.getContext();
      unsigned ShiftAmt = N1C->getZExtValue();
      EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
      if (VT.isVector())
        TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());

      // TODO: The simple type check probably belongs in the default hook
      // implementation and/or target-specific overrides (because
      // non-simple types likely require masking when legalized), but that
      // restriction may conflict with other transforms.
      if (TruncVT.isSimple() && TLI.isTruncateFree(VT, TruncVT)) {
        SDLoc DL(N);
        SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
        // AddC' = AddC shifted down and truncated to the narrow type.
        SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).
                             trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT);
        SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
        return DAG.getSExtOrTrunc(Add, DL, VT);
      }
    }
  }

  // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
  }

  // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
  // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
  // if c1 is equal to the number of bits the trunc removes
  // TODO - support non-uniform vector shift amounts.
  if (N0.getOpcode() == ISD::TRUNCATE &&
      (N0.getOperand(0).getOpcode() == ISD::SRL ||
       N0.getOperand(0).getOpcode() == ISD::SRA) &&
      N0.getOperand(0).hasOneUse() &&
      N0.getOperand(0).getOperand(1).hasOneUse() && N1C) {
    SDValue N0Op0 = N0.getOperand(0);
    if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
      EVT LargeVT = N0Op0.getValueType();
      unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
      if (LargeShift->getAPIntValue() == TruncBits) {
        SDLoc DL(N);
        SDValue Amt = DAG.getConstant(N1C->getZExtValue() + TruncBits, DL,
                                      getShiftAmountTy(LargeVT));
        SDValue SRA =
            DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
      }
    }
  }

  // Simplify, based on bits shifted out of the LHS.
  // TODO - support non-uniform vector shift amounts.
  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // If the sign bit is known to be zero, switch this to a SRL.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);

  // Generic shift-by-constant combines shared by SHL/SRA/SRL.
  if (N1C && !N1C->isOpaque())
    if (SDValue NewSRA = visitShiftByConstant(N))
      return NewSRA;

  return SDValue();
}

SDValue DAGCombiner::visitSRL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (SDValue V = DAG.simplifyShift(N0, N1))
    return V;

  EVT VT = N0.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // fold (srl c1, c2) -> c1 >>u c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // if (srl x, c) is known to be zero, return 0
  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                   APInt::getAllOnesValue(OpSizeInBits)))
    return DAG.getConstant(0, SDLoc(N), VT);

  // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
  if (N0.getOpcode() == ISD::SRL) {
    // Sums are computed at an extended width so c1 + c2 cannot overflow.
    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                          ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).uge(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
      return DAG.getConstant(0, SDLoc(N), VT);

    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                       ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).ult(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
      SDLoc DL(N);
      EVT ShiftVT = N1.getValueType();
      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
      return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
    }
  }

  if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
      N0.getOperand(0).getOpcode() == ISD::SRL) {
    SDValue InnerShift = N0.getOperand(0);
    // TODO - support non-uniform vector shift amounts.
    if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) {
      uint64_t c1 = N001C->getZExtValue();
      uint64_t c2 = N1C->getZExtValue();
      EVT InnerShiftVT = InnerShift.getValueType();
      EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType();
      uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
      // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
      // This is only valid if the OpSizeInBits + c1 = size of inner shift.
      if (c1 + OpSizeInBits == InnerShiftSize) {
        SDLoc DL(N);
        if (c1 + c2 >= InnerShiftSize)
          return DAG.getConstant(0, DL, VT);
        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                       InnerShift.getOperand(0), NewShiftAmt);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
      }
      // In the more general case, we can clear the high bits after the shift:
      // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
      if (N0.hasOneUse() && InnerShift.hasOneUse() &&
          c1 + c2 < InnerShiftSize) {
        SDLoc DL(N);
        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                       InnerShift.getOperand(0), NewShiftAmt);
        SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
                                                            OpSizeInBits - c2),
                                       DL, InnerShiftVT);
        SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
      }
    }
  }

  // fold (srl (shl x, c), c) -> (and x, cst2)
  // TODO - (srl (shl x, c1), c2).
  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
    SDLoc DL(N);
    SDValue Mask =
        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
    AddToWorklist(Mask.getNode());
    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
  }

  // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
  // TODO - support non-uniform vector shift amounts.
  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
    // Shifting in all undef bits?
    EVT SmallVT = N0.getOperand(0).getValueType();
    unsigned BitSize = SmallVT.getScalarSizeInBits();
    if (N1C->getAPIntValue().uge(BitSize))
      return DAG.getUNDEF(VT);

    if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
      uint64_t ShiftAmt = N1C->getZExtValue();
      SDLoc DL0(N0);
      SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
                                       N0.getOperand(0),
                                       DAG.getConstant(ShiftAmt, DL0,
                                           getShiftAmountTy(SmallVT)));
      AddToWorklist(SmallShift.getNode());
      // Mask off the bits that would have been shifted in from the extension.
      APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT,
                         DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
                         DAG.getConstant(Mask, DL, VT));
    }
  }

  // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
  // bit, which is unmodified by sra.
  if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
    if (N0.getOpcode() == ISD::SRA)
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
  }

  // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
  if (N1C && N0.getOpcode() == ISD::CTLZ &&
      N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
    KnownBits Known = DAG.computeKnownBits(N0.getOperand(0));

    // If any of the input bits are KnownOne, then the input couldn't be all
    // zeros, thus the result of the srl will always be zero.
    if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);

    // If all of the bits input to the ctlz node are known to be zero, then
    // the result of the ctlz is "32" and the result of the shift is one.
    APInt UnknownBits = ~Known.Zero;
    if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);

    // Otherwise, check to see if there is exactly one bit input to the ctlz.
    if (UnknownBits.isPowerOf2()) {
      // Okay, we know that only the single bit specified by UnknownBits
      // could be set on input to the CTLZ node. If this bit is set, the SRL
      // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
      // to an SRL/XOR pair, which is likely to simplify more.
      unsigned ShAmt = UnknownBits.countTrailingZeros();
      SDValue Op = N0.getOperand(0);

      if (ShAmt) {
        SDLoc DL(N0);
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getConstant(ShAmt, DL,
                             getShiftAmountTy(Op.getValueType())));
        AddToWorklist(Op.getNode());
      }

      SDLoc DL(N);
      return DAG.getNode(ISD::XOR, DL, VT,
                         Op, DAG.getConstant(1, DL, VT));
    }
  }

  // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
  }

  // fold operands of srl based on knowledge that the low bits are not
  // demanded.
  // TODO - support non-uniform vector shift amounts.
  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // Generic shift-by-constant combines shared by SHL/SRA/SRL.
  if (N1C && !N1C->isOpaque())
    if (SDValue NewSRL = visitShiftByConstant(N))
      return NewSRL;

  // Attempt to convert a srl of a load into a narrower zero-extending load.
  if (SDValue NarrowLoad = ReduceLoadWidth(N))
    return NarrowLoad;

  // Here is a common situation. We want to optimize:
  //
  //   %a = ...
  //   %b = and i32 %a, 2
  //   %c = srl i32 %b, 1
  //   brcond i32 %c ...
  //
  // into
  //
  //   %a = ...
  //   %b = and %a, 2
  //   %c = setcc eq %b, 0
  //   brcond %c ...
  //
  // However when after the source operand of SRL is optimized into AND, the SRL
  // itself may not be optimized further. Look for it and add the BRCOND into
  // the worklist.
  if (N->hasOneUse()) {
    SDNode *Use = *N->use_begin();
    if (Use->getOpcode() == ISD::BRCOND)
      AddToWorklist(Use);
    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
      // Also look past the truncate.
      Use = *Use->use_begin();
      if (Use->getOpcode() == ISD::BRCOND)
        AddToWorklist(Use);
    }
  }

  return SDValue();
}

// Combine ISD::FSHL / ISD::FSHR nodes.
SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  bool IsFSHL = N->getOpcode() == ISD::FSHL;
  unsigned BitWidth = VT.getScalarSizeInBits();

  // fold (fshl N0, N1, 0) -> N0
  // fold (fshr N0, N1, 0) -> N1
  if (isPowerOf2_32(BitWidth))
    if (DAG.MaskedValueIsZero(
            N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
      return IsFSHL ? N0 : N1;

  auto IsUndefOrZero = [](SDValue V) {
    return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
  };

  // TODO - support non-uniform vector shift amounts.
  if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
    EVT ShAmtTy = N2.getValueType();

    // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
    if (Cst->getAPIntValue().uge(BitWidth)) {
      uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
                         DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
    }

    unsigned ShAmt = Cst->getZExtValue();
    if (ShAmt == 0)
      return IsFSHL ? N0 : N1;

    // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C)
    // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C)
    // fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
    // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
    if (IsUndefOrZero(N0))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
                         DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
                                         SDLoc(N), ShAmtTy));
    if (IsUndefOrZero(N1))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
                         DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
                                         SDLoc(N), ShAmtTy));
  }

  // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
  // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2)
  // iff we know the shift amount is in range.
  // TODO: when is it worth doing SUB(BW, N2) as well?
  if (isPowerOf2_32(BitWidth)) {
    APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
    if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
    if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
  }

  // fold (fshl N0, N0, N2) -> (rotl N0, N2)
  // fold (fshr N0, N0, N2) -> (rotr N0, N2)
  // TODO: Investigate flipping this rotate if only one is legal, if funnel shift
  // is legal as well we might be better off avoiding non-constant (BW - N2).
  unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
  if (N0 == N1 && hasOperation(RotOpc, VT))
    return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);

  // Simplify, based on bits shifted out of N0/N1.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

SDValue DAGCombiner::visitABS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (abs c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0);
  // fold (abs (abs x)) -> (abs x)
  if (N0.getOpcode() == ISD::ABS)
    return N0;
  // fold (abs x) -> x iff not-negative
  if (DAG.SignBitIsZero(N0))
    return N0;
  return SDValue();
}

SDValue DAGCombiner::visitBSWAP(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (bswap c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0);
  // fold (bswap (bswap x)) -> x
  if (N0.getOpcode() == ISD::BSWAP)
    return N0->getOperand(0);
  return SDValue();
}

SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (bitreverse c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);
  // fold (bitreverse (bitreverse x)) -> x
  if (N0.getOpcode() == ISD::BITREVERSE)
    return N0.getOperand(0);
  return SDValue();
}

SDValue DAGCombiner::visitCTLZ(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (ctlz c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);

  // If the value is known never to be zero, switch to the undef version.
8254 if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) { 8255 if (DAG.isKnownNeverZero(N0)) 8256 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8257 } 8258 8259 return SDValue(); 8260 } 8261 8262 SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) { 8263 SDValue N0 = N->getOperand(0); 8264 EVT VT = N->getValueType(0); 8265 8266 // fold (ctlz_zero_undef c1) -> c2 8267 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8268 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8269 return SDValue(); 8270 } 8271 8272 SDValue DAGCombiner::visitCTTZ(SDNode *N) { 8273 SDValue N0 = N->getOperand(0); 8274 EVT VT = N->getValueType(0); 8275 8276 // fold (cttz c1) -> c2 8277 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8278 return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0); 8279 8280 // If the value is known never to be zero, switch to the undef version. 8281 if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) { 8282 if (DAG.isKnownNeverZero(N0)) 8283 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8284 } 8285 8286 return SDValue(); 8287 } 8288 8289 SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { 8290 SDValue N0 = N->getOperand(0); 8291 EVT VT = N->getValueType(0); 8292 8293 // fold (cttz_zero_undef c1) -> c2 8294 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8295 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8296 return SDValue(); 8297 } 8298 8299 SDValue DAGCombiner::visitCTPOP(SDNode *N) { 8300 SDValue N0 = N->getOperand(0); 8301 EVT VT = N->getValueType(0); 8302 8303 // fold (ctpop c1) -> c2 8304 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8305 return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0); 8306 return SDValue(); 8307 } 8308 8309 // FIXME: This should be checking for no signed zeros on individual operands, as 8310 // well as no nans. 
// Returns true if it is profitable and legal (w.r.t. FP math options and
// known-NaN analysis) to turn a select of LHS/RHS into a min/max node.
static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
                                         SDValue RHS,
                                         const TargetLowering &TLI) {
  const TargetOptions &Options = DAG.getTarget().Options;
  EVT VT = LHS.getValueType();

  return Options.NoSignedZerosFPMath && VT.isFloatingPoint() &&
         TLI.isProfitableToCombineMinNumMaxNum(VT) &&
         DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS);
}

/// Generate Min/Max node
static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                   SDValue RHS, SDValue True, SDValue False,
                                   ISD::CondCode CC, const TargetLowering &TLI,
                                   SelectionDAG &DAG) {
  // Only handle selects whose true/false values are exactly the compared
  // operands (in either order).
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  switch (CC) {
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETLT:
  case ISD::SETLE:
  case ISD::SETULT:
  case ISD::SETULE: {
    // Since it's known never nan to get here already, either fminnum or
    // fminnum_ieee are OK. Try the ieee version first, since fminnum is
    // expanded in terms of it.
    unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);

    unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
    if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
    return SDValue();
  }
  case ISD::SETOGT:
  case ISD::SETOGE:
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGT:
  case ISD::SETUGE: {
    // Greater-than selects map to max (or min when operands are swapped).
    unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);

    unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
    if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
    return SDValue();
  }
  default:
    return SDValue();
  }
}

/// If a (v)select has a condition value that is a sign-bit test, try to smear
/// the condition operand sign-bit across the value width and use it as a mask.
static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue C1 = N->getOperand(1);
  SDValue C2 = N->getOperand(2);
  assert(isConstantOrConstantVector(C1) && isConstantOrConstantVector(C2) &&
         "Expected select-of-constants");

  EVT VT = N->getValueType(0);
  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() ||
      VT != Cond.getOperand(0).getValueType())
    return SDValue();

  // The inverted-condition + commuted-select variants of these patterns are
  // canonicalized to these forms in IR.
  SDValue X = Cond.getOperand(0);
  SDValue CondC = Cond.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
      isAllOnesOrAllOnesSplat(C2)) {
    // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
    SDLoc DL(N);
    SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
    return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
  }
  if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
    // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
    SDLoc DL(N);
    SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
    return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
  }
  return SDValue();
}

// Fold a select whose true/false operands are both integer constants.
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
  SDValue Cond = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  EVT CondVT = Cond.getValueType();
  SDLoc DL(N);

  if (!VT.isInteger())
    return SDValue();

  auto *C1 = dyn_cast<ConstantSDNode>(N1);
  auto *C2 = dyn_cast<ConstantSDNode>(N2);
  if (!C1 || !C2)
    return SDValue();

  // Only do this before legalization to avoid conflicting with target-specific
  // transforms in the other direction (create a select from a zext/sext). There
  // is also a target-independent combine here in DAGCombiner in the other
  // direction for (select Cond, -1, 0) when the condition is not i1.
  if (CondVT == MVT::i1 && !LegalOperations) {
    if (C1->isNullValue() && C2->isOne()) {
      // select Cond, 0, 1 --> zext (!Cond)
      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
      if (VT != MVT::i1)
        NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
      return NotCond;
    }
    if (C1->isNullValue() && C2->isAllOnesValue()) {
      // select Cond, 0, -1 --> sext (!Cond)
      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
      if (VT != MVT::i1)
        NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
      return NotCond;
    }
    if (C1->isOne() && C2->isNullValue()) {
      // select Cond, 1, 0 --> zext (Cond)
      if (VT != MVT::i1)
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
      return Cond;
    }
    if (C1->isAllOnesValue() && C2->isNullValue()) {
      // select Cond, -1, 0 --> sext (Cond)
      if (VT != MVT::i1)
        Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
      return Cond;
    }

    // Use a target hook because some targets may prefer to transform in the
    // other direction.
    if (TLI.convertSelectOfConstantsToMath(VT)) {
      // For any constants that differ by 1, we can transform the select into an
      // extend and add.
      const APInt &C1Val = C1->getAPIntValue();
      const APInt &C2Val = C2->getAPIntValue();
      if (C1Val - 1 == C2Val) {
        // select Cond, C1, C1-1 --> add (zext Cond), C1-1
        if (VT != MVT::i1)
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
      }
      if (C1Val + 1 == C2Val) {
        // select Cond, C1, C1+1 --> add (sext Cond), C1+1
        if (VT != MVT::i1)
          Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
      }

      // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
      if (C1Val.isPowerOf2() && C2Val.isNullValue()) {
        if (VT != MVT::i1)
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
        SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT);
        return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
      }

      if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
        return V;
    }

    return SDValue();
  }

  // fold (select Cond, 0, 1) -> (xor Cond, 1)
  // We can't do this reliably if integer based booleans have different contents
  // to floating point based booleans. This is because we can't tell whether we
  // have an integer-based boolean or a floating-point-based boolean unless we
  // can find the SETCC that produced it and inspect its operands. This is
  // fairly easy if C is the SETCC node, but it can potentially be
  // undiscoverable (or not reasonably discoverable). For example, it could be
  // in another basic block or it could require searching a complicated
  // expression.
  if (CondVT.isInteger() &&
      TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      C1->isNullValue() && C2->isOne()) {
    SDValue NotCond =
        DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
    if (VT.bitsEq(CondVT))
      return NotCond;
    return DAG.getZExtOrTrunc(NotCond, DL, VT);
  }

  return SDValue();
}

SDValue DAGCombiner::visitSELECT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  EVT VT0 = N0.getValueType();
  SDLoc DL(N);
  SDNodeFlags Flags = N->getFlags();

  if (SDValue V = DAG.simplifySelect(N0, N1, N2))
    return V;

  // fold (select X, X, Y) -> (or X, Y)
  // fold (select X, 1, Y) -> (or C, Y)
  if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
    return DAG.getNode(ISD::OR, DL, VT, N0, N2);

  if (SDValue V = foldSelectOfConstants(N))
    return V;

  // fold (select C, 0, X) -> (and (not C), X)
  if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
    AddToWorklist(NOTNode.getNode());
    return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
  }
  // fold (select C, X, 1) -> (or (not C), X)
  if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
    AddToWorklist(NOTNode.getNode());
    return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
  }
  // fold (select X, Y, X) -> (and X, Y)
  // fold (select X, Y, 0) -> (and X, Y)
  if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
    return DAG.getNode(ISD::AND, DL, VT, N0, N1);

  // If we can fold this based on the true/false value, do so.
  if (SimplifySelectOps(N, N1, N2))
    return SDValue(N, 0); // Don't revisit N.

  if (VT0 == MVT::i1) {
    // The code in this block deals with the following 2 equivalences:
    //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
    //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
    // The target can specify its preferred form with the
    // shouldNormalizeToSelectSequence() callback. However we always transform
    // to the right anyway if we find the inner select exists in the DAG anyway
    // and we always transform to the left side if we know that we can further
    // optimize the combination of the conditions.
    bool normalizeToSequence =
        TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
    // select (and Cond0, Cond1), X, Y
    //   -> select Cond0, (select Cond1, X, Y), Y
    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
      SDValue Cond0 = N0->getOperand(0);
      SDValue Cond1 = N0->getOperand(1);
      SDValue InnerSelect =
          DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags);
      if (normalizeToSequence || !InnerSelect.use_empty())
        return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
                           InnerSelect, N2, Flags);
      // Cleanup on failure.
      if (InnerSelect.use_empty())
        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
    }
    // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
      SDValue Cond0 = N0->getOperand(0);
      SDValue Cond1 = N0->getOperand(1);
      SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
                                        Cond1, N1, N2, Flags);
      if (normalizeToSequence || !InnerSelect.use_empty())
        return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
                           InnerSelect, Flags);
      // Cleanup on failure.
      if (InnerSelect.use_empty())
        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
    }

    // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
    if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
      SDValue N1_0 = N1->getOperand(0);
      SDValue N1_1 = N1->getOperand(1);
      SDValue N1_2 = N1->getOperand(2);
      if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
        // Create the actual and node if we can generate good code for it.
        if (!normalizeToSequence) {
          SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1,
                             N2, Flags);
        }
        // Otherwise see if we can optimize the "and" to a better pattern.
        if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
                             N2, Flags);
        }
      }
    }
    // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
    if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
      SDValue N2_0 = N2->getOperand(0);
      SDValue N2_1 = N2->getOperand(1);
      SDValue N2_2 = N2->getOperand(2);
      if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
        // Create the actual or node if we can generate good code for it.
        if (!normalizeToSequence) {
          SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
                             N2_2, Flags);
        }
        // Otherwise see if we can optimize to a better pattern.
8627 if (SDValue Combined = visitORLike(N0, N2_0, N)) 8628 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1, 8629 N2_2, Flags); 8630 } 8631 } 8632 } 8633 8634 // select (not Cond), N1, N2 -> select Cond, N2, N1 8635 if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { 8636 SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1); 8637 SelectOp->setFlags(Flags); 8638 return SelectOp; 8639 } 8640 8641 // Fold selects based on a setcc into other things, such as min/max/abs. 8642 if (N0.getOpcode() == ISD::SETCC) { 8643 SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1); 8644 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); 8645 8646 // select (fcmp lt x, y), x, y -> fminnum x, y 8647 // select (fcmp gt x, y), x, y -> fmaxnum x, y 8648 // 8649 // This is OK if we don't care what happens if either operand is a NaN. 8650 if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI)) 8651 if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2, 8652 CC, TLI, DAG)) 8653 return FMinMax; 8654 8655 // Use 'unsigned add with overflow' to optimize an unsigned saturating add. 8656 // This is conservatively limited to pre-legal-operations to give targets 8657 // a chance to reverse the transform if they want to do that. Also, it is 8658 // unlikely that the pattern would be formed late, so it's probably not 8659 // worth going through the other checks. 
8660 if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) && 8661 CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) && 8662 N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) { 8663 auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1)); 8664 auto *NotC = dyn_cast<ConstantSDNode>(Cond1); 8665 if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) { 8666 // select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) --> 8667 // uaddo Cond0, C; select uaddo.1, -1, uaddo.0 8668 // 8669 // The IR equivalent of this transform would have this form: 8670 // %a = add %x, C 8671 // %c = icmp ugt %x, ~C 8672 // %r = select %c, -1, %a 8673 // => 8674 // %u = call {iN,i1} llvm.uadd.with.overflow(%x, C) 8675 // %u0 = extractvalue %u, 0 8676 // %u1 = extractvalue %u, 1 8677 // %r = select %u1, -1, %u0 8678 SDVTList VTs = DAG.getVTList(VT, VT0); 8679 SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1)); 8680 return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0)); 8681 } 8682 } 8683 8684 if (TLI.isOperationLegal(ISD::SELECT_CC, VT) || 8685 (!LegalOperations && 8686 TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) { 8687 // Any flags available in a select/setcc fold will be on the setcc as they 8688 // migrated from fcmp 8689 Flags = N0.getNode()->getFlags(); 8690 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, 8691 N2, N0.getOperand(2)); 8692 SelectNode->setFlags(Flags); 8693 return SelectNode; 8694 } 8695 8696 return SimplifySelect(DL, N0, N1, N2); 8697 } 8698 8699 return SDValue(); 8700 } 8701 8702 // This function assumes all the vselect's arguments are CONCAT_VECTOR 8703 // nodes and that the condition is a BV of ConstantSDNodes (or undefs). 
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = N->getValueType(0);
  int NumElems = VT.getVectorNumElements();
  assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
         RHS.getOpcode() == ISD::CONCAT_VECTORS &&
         Cond.getOpcode() == ISD::BUILD_VECTOR);

  // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
  // binary ones here.
  if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
    return SDValue();

  // We're sure we have an even number of elements due to the
  // concat_vectors we have as arguments to vselect.
  // Skip BV elements until we find one that's not an UNDEF
  // After we find an UNDEF element, keep looping until we get to half the
  // length of the BV and see if all the non-undef nodes are the same.
  ConstantSDNode *BottomHalf = nullptr;
  for (int i = 0; i < NumElems / 2; ++i) {
    if (Cond->getOperand(i)->isUndef())
      continue;

    if (BottomHalf == nullptr)
      BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
    else if (Cond->getOperand(i).getNode() != BottomHalf)
      return SDValue();
  }

  // Do the same for the second half of the BuildVector
  ConstantSDNode *TopHalf = nullptr;
  for (int i = NumElems / 2; i < NumElems; ++i) {
    if (Cond->getOperand(i)->isUndef())
      continue;

    if (TopHalf == nullptr)
      TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
    else if (Cond->getOperand(i).getNode() != TopHalf)
      return SDValue();
  }

  assert(TopHalf && BottomHalf &&
         "One half of the selector was all UNDEFs and the other was all the "
         "same value. This should have been addressed before this function.");
  // Each half of the condition selects, as a unit, the matching concat
  // operand of either RHS (condition constant is 0) or LHS (non-zero).
  return DAG.getNode(
      ISD::CONCAT_VECTORS, DL, VT,
      BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
      TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
}

/// Combine a masked scatter: scatters with an all-zero mask store nothing
/// and collapse to their input chain.
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
  SDValue Mask = MSC->getMask();
  SDValue Chain = MSC->getChain();
  SDLoc DL(N);

  // Zap scatters with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return Chain;

  return SDValue();
}

/// Combine a masked store: drop stores with an all-zero mask, otherwise try
/// to form a pre-/post-indexed store.
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
  SDValue Mask = MST->getMask();
  SDValue Chain = MST->getChain();
  SDLoc DL(N);

  // Zap masked stores with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return Chain;

  // Try transforming N to an indexed store.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  return SDValue();
}

/// Combine a masked gather: a gather with an all-zero mask produces its
/// pass-through value and original chain.
SDValue DAGCombiner::visitMGATHER(SDNode *N) {
  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
  SDValue Mask = MGT->getMask();
  SDLoc DL(N);

  // Zap gathers with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return CombineTo(N, MGT->getPassThru(), MGT->getChain());

  return SDValue();
}

/// Combine a masked load: a load with an all-zero mask produces its
/// pass-through value; otherwise try to form an indexed load.
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
  MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
  SDValue Mask = MLD->getMask();
  SDLoc DL(N);

  // Zap masked loads with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return CombineTo(N, MLD->getPassThru(), MLD->getChain());

  // Try transforming N to an indexed load.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  return SDValue();
}

/// A vector select of 2 constant vectors can be simplified to math/logic to
/// avoid a variable select instruction and possibly avoid constant loads.
SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
  SDValue Cond = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  // Only handle a one-use i1 condition selecting between constant
  // build_vectors, and only when the target prefers math over a select.
  if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
      !TLI.convertSelectOfConstantsToMath(VT) ||
      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
      !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
    return SDValue();

  // Check if we can use the condition value to increment/decrement a single
  // constant value. This simplifies a select to an add and removes a constant
  // load/materialization from the general case.
  bool AllAddOne = true;
  bool AllSubOne = true;
  unsigned Elts = VT.getVectorNumElements();
  for (unsigned i = 0; i != Elts; ++i) {
    SDValue N1Elt = N1.getOperand(i);
    SDValue N2Elt = N2.getOperand(i);
    if (N1Elt.isUndef() || N2Elt.isUndef())
      continue;

    const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
    const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
    if (C1 != C2 + 1)
      AllAddOne = false;
    if (C1 != C2 - 1)
      AllSubOne = false;
  }

  // Further simplifications for the extra-special cases where the constants are
  // all 0 or all -1 should be implemented as folds of these patterns.
  SDLoc DL(N);
  if (AllAddOne || AllSubOne) {
    // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
    // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
    auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
    SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
    return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
  }

  // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
  APInt Pow2C;
  if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
      isNullOrNullSplat(N2)) {
    SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
    SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
    return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
  }

  if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
    return V;

  // The general case for select-of-constants:
  // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
  // ...but that only makes sense if a vselect is slower than 2 logic ops, so
  // leave that to a machine-specific pass.
  return SDValue();
}

/// Combine a VSELECT node: flip inverted conditions, canonicalize integer
/// abs, form vector min/max, widen narrow setcc conditions, and fold
/// constant-operand selects.
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (SDValue V = DAG.simplifySelect(N0, N1, N2))
    return V;

  // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
    return DAG.getSelect(DL, VT, F, N2, N1);

  // Canonicalize integer abs.
  // vselect (setg[te] X,  0),  X, -X ->
  // vselect (setgt    X, -1),  X, -X ->
  // vselect (setl[te] X,  0), -X,  X ->
  // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
  if (N0.getOpcode() == ISD::SETCC) {
    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
    bool isAbs = false;
    bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

    // Match both orientations of the abs pattern: (x >= 0 ? x : -x) and
    // (x <= 0 ? -x : x), where -x is represented as (sub 0, x).
    if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
         (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
        N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
      isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
    else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
             N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
      isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());

    if (isAbs) {
      // Prefer a native ABS node; otherwise expand to sra/add/xor.
      if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
        return DAG.getNode(ISD::ABS, DL, VT, LHS);

      SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
                                  DAG.getConstant(VT.getScalarSizeInBits() - 1,
                                                  DL, getShiftAmountTy(VT)));
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
      AddToWorklist(Shift.getNode());
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
    }

    // vselect x, y (fcmp lt x, y) -> fminnum x, y
    // vselect x, y (fcmp gt x, y) -> fmaxnum x, y
    //
    // This is OK if we don't care about what happens if either operand is a
    // NaN.
    //
    if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
      if (SDValue FMinMax =
              combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG))
        return FMinMax;
    }

    // If this select has a condition (setcc) with narrower operands than the
    // select, try to widen the compare to match the select width.
    // TODO: This should be extended to handle any constant.
    // TODO: This could be extended to handle non-loading patterns, but that
    //       requires thorough testing to avoid regressions.
    if (isNullOrNullSplat(RHS)) {
      EVT NarrowVT = LHS.getValueType();
      EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
      EVT SetCCVT = getSetCCResultType(LHS.getValueType());
      unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
      unsigned WideWidth = WideVT.getScalarSizeInBits();
      bool IsSigned = isSignedIntSetCC(CC);
      auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
          SetCCWidth != 1 && SetCCWidth < WideWidth &&
          TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
          TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
        // Both compare operands can be widened for free. The LHS can use an
        // extended load, and the RHS is a constant:
        //   vselect (ext (setcc load(X), C)), N1, N2 -->
        //   vselect (setcc extload(X), C'), N1, N2
        auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
        SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
        SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
        EVT WideSetCCVT = getSetCCResultType(WideVT);
        SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
        return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
      }
    }
  }

  if (SimplifySelectOps(N, N1, N2))
    return SDValue(N, 0);  // Don't revisit N.

  // Fold (vselect (build_vector all_ones), N1, N2) -> N1
  if (ISD::isBuildVectorAllOnes(N0.getNode()))
    return N1;
  // Fold (vselect (build_vector all_zeros), N1, N2) -> N2
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return N2;

  // The ConvertSelectToConcatVector function is assuming both the above
  // checks for (vselect (build_vector all{ones,zeros) ...) have been made
  // and addressed.
  if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
      N2.getOpcode() == ISD::CONCAT_VECTORS &&
      ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
    if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
      return CV;
  }

  if (SDValue V = foldVSelectOfConstants(N))
    return V;

  return SDValue();
}

/// Combine a SELECT_CC node: fold away trivial selects, simplify the
/// implicit setcc condition, and hand off to SimplifySelectCC for
/// min/max/abs-style folds.
SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  SDValue N3 = N->getOperand(3);
  SDValue N4 = N->getOperand(4);
  ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();

  // fold select_cc lhs, rhs, x, x, cc -> x
  if (N2 == N3)
    return N2;

  // Determine if the condition we're dealing with is constant
  if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
                                  CC, SDLoc(N), false)) {
    AddToWorklist(SCC.getNode());

    if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
      if (!SCCC->isNullValue())
        return N2;    // cond always true -> true val
      else
        return N3;    // cond always false -> false val
    } else if (SCC->isUndef()) {
      // When the condition is UNDEF, just return the first operand. This is
      // coherent the DAG creation, no setcc node is created in this case
      return N2;
    } else if (SCC.getOpcode() == ISD::SETCC) {
      // Fold to a simpler select_cc
      SDValue SelectOp = DAG.getNode(
          ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0),
          SCC.getOperand(1), N2, N3, SCC.getOperand(2));
      SelectOp->setFlags(SCC->getFlags());
      return SelectOp;
    }
  }

  // If we can fold this based on the true/false value, do so.
  if (SimplifySelectOps(N, N2, N3))
    return SDValue(N, 0);  // Don't revisit N.

  // fold select_cc into other things, such as min/max/abs
  return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
}

/// Combine a SETCC node via SimplifySetCC, taking care to keep the node as a
/// setcc when its only user is a brcond.
SDValue DAGCombiner::visitSETCC(SDNode *N) {
  // setcc is very commonly used as an argument to brcond. This pattern
  // also lend itself to numerous combines and, as a result, it is desired
  // we keep the argument to a brcond as a setcc as much as possible.
  bool PreferSetCC =
      N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;

  SDValue Combined = SimplifySetCC(
      N->getValueType(0), N->getOperand(0), N->getOperand(1),
      cast<CondCodeSDNode>(N->getOperand(2))->get(), SDLoc(N), !PreferSetCC);

  if (!Combined)
    return SDValue();

  // If we prefer to have a setcc, and we don't, we'll try our best to
  // recreate one using rebuildSetCC.
  if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
    SDValue NewSetCC = rebuildSetCC(Combined);

    // We don't have anything interesting to combine to.
    if (NewSetCC.getNode() == N)
      return SDValue();

    if (NewSetCC)
      return NewSetCC;
  }

  return Combined;
}

/// Combine a SETCCCARRY node: with a known-zero carry it degenerates to a
/// plain SETCC.
SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Carry = N->getOperand(2);
  SDValue Cond = N->getOperand(3);

  // If Carry is false, fold to a regular SETCC.
  if (isNullConstant(Carry))
    return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);

  return SDValue();
}

/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
/// a build_vector of constants.
/// This function is called by the DAGCombiner when visiting sext/zext/aext
/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
/// Vector extends are not folded if operations are legal; this is to
/// avoid introducing illegal build_vector dag nodes.
static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
                                         SelectionDAG &DAG, bool LegalTypes) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
         Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
         Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
         && "Expected EXTEND dag node in input!");

  // fold (sext c1) -> c1
  // fold (zext c1) -> c1
  // fold (aext c1) -> c1
  if (isa<ConstantSDNode>(N0))
    return DAG.getNode(Opcode, DL, VT, N0);

  // fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
  // fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2)
  // fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
  if (N0->getOpcode() == ISD::SELECT) {
    SDValue Op1 = N0->getOperand(1);
    SDValue Op2 = N0->getOperand(2);
    if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) &&
        (Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) {
      // For any_extend, choose sign extension of the constants to allow a
      // possible further transform to sign_extend_inreg.i.e.
      //
      // t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0>
      // t2: i64 = any_extend t1
      // -->
      // t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0>
      // -->
      // t4: i64 = sign_extend_inreg t3
      unsigned FoldOpc = Opcode;
      if (FoldOpc == ISD::ANY_EXTEND)
        FoldOpc = ISD::SIGN_EXTEND;
      return DAG.getSelect(DL, VT, N0->getOperand(0),
                           DAG.getNode(FoldOpc, DL, VT, Op1),
                           DAG.getNode(FoldOpc, DL, VT, Op2));
    }
  }

  // fold (sext (build_vector AllConstants) -> (build_vector AllConstants)
  // fold (zext (build_vector AllConstants) -> (build_vector AllConstants)
  // fold (aext (build_vector AllConstants) -> (build_vector AllConstants)
  EVT SVT = VT.getScalarType();
  if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) &&
      ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
    return SDValue();

  // We can fold this node into a build_vector.
  unsigned VTBits = SVT.getSizeInBits();
  unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
  SmallVector<SDValue, 8> Elts;
  unsigned NumElts = VT.getVectorNumElements();

  // For zero-extensions, UNDEF elements still guarantee to have the upper
  // bits set to zero.
  bool IsZext =
      Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG;

  for (unsigned i = 0; i != NumElts; ++i) {
    SDValue Op = N0.getOperand(i);
    if (Op.isUndef()) {
      Elts.push_back(IsZext ? DAG.getConstant(0, DL, SVT) : DAG.getUNDEF(SVT));
      continue;
    }

    SDLoc DL(Op);
    // Get the constant value and if needed trunc it to the size of the type.
    // Nodes like build_vector might have constants wider than the scalar type.
    APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
    if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
      Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT));
    else
      Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
  }

  return DAG.getBuildVector(VT, DL, Elts);
}

// ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
// transformation. Returns true if extension are possible and the above
// mentioned transformation is profitable.
static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
                                    unsigned ExtOpc,
                                    SmallVectorImpl<SDNode *> &ExtendNodes,
                                    const TargetLowering &TLI) {
  bool HasCopyToRegUses = false;
  bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType());
  // Walk every user of the load's value result and decide whether it can be
  // rewritten to use the extended value. SETCC users that qualify are
  // collected in ExtendNodes for later rewriting.
  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
                            UE = N0.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User == N)
      continue;
    if (UI.getUse().getResNo() != N0.getResNo())
      continue;
    // FIXME: Only extend SETCC N, N and SETCC N, c for now.
    if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
      ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
      if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
        // Sign bits will be lost after a zext.
        return false;
      bool Add = false;
      for (unsigned i = 0; i != 2; ++i) {
        SDValue UseOp = User->getOperand(i);
        if (UseOp == N0)
          continue;
        if (!isa<ConstantSDNode>(UseOp))
          return false;
        Add = true;
      }
      if (Add)
        ExtendNodes.push_back(User);
      continue;
    }
    // If truncates aren't free and there are users we can't
    // extend, it isn't worthwhile.
    if (!isTruncFree)
      return false;
    // Remember if this value is live-out.
    if (User->getOpcode() == ISD::CopyToReg)
      HasCopyToRegUses = true;
  }

  if (HasCopyToRegUses) {
    bool BothLiveOut = false;
    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
         UI != UE; ++UI) {
      SDUse &Use = UI.getUse();
      if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
        BothLiveOut = true;
        break;
      }
    }
    if (BothLiveOut)
      // Both unextended and extended values are live out. There had better be
      // a good reason for the transformation.
      return ExtendNodes.size();
  }
  return true;
}

/// Rewrite each SETCC in \p SetCCs so that the operand matching \p OrigLoad
/// becomes \p ExtLoad and every other operand is extended with \p ExtType,
/// keeping the comparison valid after the load has been widened.
void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
                                  SDValue OrigLoad, SDValue ExtLoad,
                                  ISD::NodeType ExtType) {
  // Extend SetCC uses if necessary.
  SDLoc DL(ExtLoad);
  for (SDNode *SetCC : SetCCs) {
    SmallVector<SDValue, 4> Ops;

    for (unsigned j = 0; j != 2; ++j) {
      SDValue SOp = SetCC->getOperand(j);
      if (SOp == OrigLoad)
        Ops.push_back(ExtLoad);
      else
        Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
    }

    Ops.push_back(SetCC->getOperand(2));
    CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
  }
}

// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
/// Split an illegal-but-splittable vector sext/zext-of-load into multiple
/// smaller legal extloads concatenated together.
SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT DstVT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();

  assert((N->getOpcode() == ISD::SIGN_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND) &&
         "Unexpected node type (not an extend)!");

  // fold (sext (load x)) to multiple smaller sextloads; same for zext.
  // For example, on a target with legal v4i32, but illegal v8i32, turn:
  //   (v8i32 (sext (v8i16 (load x))))
  // into:
  //   (v8i32 (concat_vectors (v4i32 (sextload x)),
  //                          (v4i32 (sextload (x + 16)))))
  // Where uses of the original load, i.e.:
  //   (v8i16 (load x))
  // are replaced with:
  //   (v8i16 (truncate
  //     (v8i32 (concat_vectors (v4i32 (sextload x)),
  //                            (v4i32 (sextload (x + 16)))))))
  //
  // This combine is only applicable to illegal, but splittable, vectors.
  // All legal types, and illegal non-vector types, are handled elsewhere.
  // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
  //
  if (N0->getOpcode() != ISD::LOAD)
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(N0);

  if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
      !N0.hasOneUse() || !LN0->isSimple() ||
      !DstVT.isVector() || !DstVT.isPow2VectorType() ||
      !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
    return SDValue();

  SmallVector<SDNode *, 4> SetCCs;
  if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI))
    return SDValue();

  ISD::LoadExtType ExtType =
      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;

  // Try to split the vector types to get down to legal types.
  // Halve both source and destination types until the extload is legal or
  // the source runs out of elements.
  EVT SplitSrcVT = SrcVT;
  EVT SplitDstVT = DstVT;
  while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
         SplitSrcVT.getVectorNumElements() > 1) {
    SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
    SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
  }

  if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
    return SDValue();

  assert(!DstVT.isScalableVector() && "Unexpected scalable vector type");

  SDLoc DL(N);
  const unsigned NumSplits =
      DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
  const unsigned Stride = SplitSrcVT.getStoreSize();
  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> Chains;

  // Emit one extending load per split, stepping the base pointer by the
  // store size of each piece.
  SDValue BasePtr = LN0->getBasePtr();
  for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
    const unsigned Offset = Idx * Stride;
    const unsigned Align = MinAlign(LN0->getAlignment(), Offset);

    SDValue SplitLoad = DAG.getExtLoad(
        ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
        LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
        LN0->getMemOperand()->getFlags(), LN0->getAAInfo());

    BasePtr = DAG.getMemBasePlusOffset(BasePtr, Stride, DL);

    Loads.push_back(SplitLoad.getValue(0));
    Chains.push_back(SplitLoad.getValue(1));
  }

  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);

  // Simplify TF.
  AddToWorklist(NewChain.getNode());

  CombineTo(N, NewValue);

  // Replace uses of the original load (before extension)
  // with a truncate of the concatenated sextloaded vectors.
  SDValue Trunc =
      DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
  ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode());
  CombineTo(N0.getNode(), Trunc, NewChain);
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}

// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
//      (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
  assert(N->getOpcode() == ISD::ZERO_EXTEND);
  EVT VT = N->getValueType(0);
  EVT OrigVT = N->getOperand(0).getValueType();
  // If the zext itself is free there is nothing to gain from the rewrite.
  if (TLI.isZExtFree(OrigVT, VT))
    return SDValue();

  // and/or/xor
  SDValue N0 = N->getOperand(0);
  if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
        N0.getOpcode() == ISD::XOR) ||
      N0.getOperand(1).getOpcode() != ISD::Constant ||
      (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
    return SDValue();

  // shl/shr
  SDValue N1 = N0->getOperand(0);
  if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
      N1.getOperand(1).getOpcode() != ISD::Constant ||
      (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
    return SDValue();

  // load
  if (!isa<LoadSDNode>(N1.getOperand(0)))
    return SDValue();
  LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
  EVT MemVT = Load->getMemoryVT();
  if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
      Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
    return SDValue();


  // If the shift op is SHL, the logic op must be AND, otherwise the result
  // will be wrong.
  if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
    return SDValue();

  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SmallVector<SDNode*, 4> SetCCs;
  if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
                               ISD::ZERO_EXTEND, SetCCs, TLI))
    return SDValue();

  // Actually do the transformation.
  SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
                                   Load->getChain(), Load->getBasePtr(),
                                   Load->getMemoryVT(), Load->getMemOperand());

  SDLoc DL1(N1);
  SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
                              N1.getOperand(1));

  // Zero-extend the logic-op constant to the wide type.
  APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
  Mask = Mask.zext(VT.getSizeInBits());
  SDLoc DL0(N0);
  SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
                            DAG.getConstant(Mask, DL0, VT));

  ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
  CombineTo(N, And);
  // If the old load has extra users, give them a truncated view of the new
  // extended load; otherwise just forward its chain.
  if (SDValue(Load, 0).hasOneUse()) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
  } else {
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
                                Load->getValueType(0), ExtLoad);
    CombineTo(Load, Trunc, ExtLoad.getValue(1));
  }

  // N0 is dead at this point.
  recursivelyDeleteUnusedNodes(N0.getNode());

  return SDValue(N,0); // Return N so it doesn't get rechecked!
}

/// If we're narrowing or widening the result of a vector select and the final
/// size is the same size as a setcc (compare) feeding the select, then try to
/// apply the cast operation to the select's operands because matching vector
/// sizes for a select condition and other operands should be more efficient.
9435 SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) { 9436 unsigned CastOpcode = Cast->getOpcode(); 9437 assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND || 9438 CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND || 9439 CastOpcode == ISD::FP_ROUND) && 9440 "Unexpected opcode for vector select narrowing/widening"); 9441 9442 // We only do this transform before legal ops because the pattern may be 9443 // obfuscated by target-specific operations after legalization. Do not create 9444 // an illegal select op, however, because that may be difficult to lower. 9445 EVT VT = Cast->getValueType(0); 9446 if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) 9447 return SDValue(); 9448 9449 SDValue VSel = Cast->getOperand(0); 9450 if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() || 9451 VSel.getOperand(0).getOpcode() != ISD::SETCC) 9452 return SDValue(); 9453 9454 // Does the setcc have the same vector size as the casted select? 9455 SDValue SetCC = VSel.getOperand(0); 9456 EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType()); 9457 if (SetCCVT.getSizeInBits() != VT.getSizeInBits()) 9458 return SDValue(); 9459 9460 // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B) 9461 SDValue A = VSel.getOperand(1); 9462 SDValue B = VSel.getOperand(2); 9463 SDValue CastA, CastB; 9464 SDLoc DL(Cast); 9465 if (CastOpcode == ISD::FP_ROUND) { 9466 // FP_ROUND (fptrunc) has an extra flag operand to pass along. 
9467 CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1)); 9468 CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1)); 9469 } else { 9470 CastA = DAG.getNode(CastOpcode, DL, VT, A); 9471 CastB = DAG.getNode(CastOpcode, DL, VT, B); 9472 } 9473 return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB); 9474 } 9475 9476 // fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x))) 9477 // fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x))) 9478 static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner, 9479 const TargetLowering &TLI, EVT VT, 9480 bool LegalOperations, SDNode *N, 9481 SDValue N0, ISD::LoadExtType ExtLoadType) { 9482 SDNode *N0Node = N0.getNode(); 9483 bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node) 9484 : ISD::isZEXTLoad(N0Node); 9485 if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) || 9486 !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse()) 9487 return SDValue(); 9488 9489 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 9490 EVT MemVT = LN0->getMemoryVT(); 9491 if ((LegalOperations || !LN0->isSimple() || 9492 VT.isVector()) && 9493 !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT)) 9494 return SDValue(); 9495 9496 SDValue ExtLoad = 9497 DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(), 9498 LN0->getBasePtr(), MemVT, LN0->getMemOperand()); 9499 Combiner.CombineTo(N, ExtLoad); 9500 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); 9501 if (LN0->use_empty()) 9502 Combiner.recursivelyDeleteUnusedNodes(LN0); 9503 return SDValue(N, 0); // Return N so it doesn't get rechecked! 9504 } 9505 9506 // fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x))) 9507 // Only generate vector extloads when 1) they're legal, and 2) they are 9508 // deemed desirable by the target. 
static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
                                  const TargetLowering &TLI, EVT VT,
                                  bool LegalOperations, SDNode *N, SDValue N0,
                                  ISD::LoadExtType ExtLoadType,
                                  ISD::NodeType ExtOpc) {
  // Only handle unindexed, non-extending loads. Once operations are legal (or
  // for vector / non-simple loads), the target must support the extload.
  if (!ISD::isNON_EXTLoad(N0.getNode()) ||
      !ISD::isUNINDEXEDLoad(N0.getNode()) ||
      ((LegalOperations || VT.isVector() ||
        !cast<LoadSDNode>(N0)->isSimple()) &&
       !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
    return {};

  bool DoXform = true;
  SmallVector<SDNode *, 4> SetCCs;
  // With other users of the load, the fold is only worthwhile if those users
  // can also be rewritten to consume the extended value.
  if (!N0.hasOneUse())
    DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
  if (VT.isVector())
    DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
  if (!DoXform)
    return {};

  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
                                   LN0->getBasePtr(), N0.getValueType(),
                                   LN0->getMemOperand());
  Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
  // If the load value is used only by N, replace it via CombineTo N.
  bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
  Combiner.CombineTo(N, ExtLoad);
  if (NoReplaceTrunc) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
    Combiner.recursivelyDeleteUnusedNodes(LN0);
  } else {
    // Other users still want the narrow value: feed them a truncate of the
    // extended load.
    SDValue Trunc =
        DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
    Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
  }
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}

// fold ([s|z]ext (masked_load x)) -> ([s|z]ext-masked_load x) when the target
// supports the extending masked load and deems it desirable.
static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
                                        const TargetLowering &TLI, EVT VT,
                                        SDNode *N, SDValue N0,
                                        ISD::LoadExtType ExtLoadType,
                                        ISD::NodeType ExtOpc) {
  if (!N0.hasOneUse())
    return SDValue();

  // Only a plain (non-extending) masked load qualifies.
  MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
  if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();

  if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
    return SDValue();

  if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
    return SDValue();

  SDLoc dl(Ld);
  // The pass-through value must be widened the same way as the loaded lanes.
  SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
      PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
      ExtLoadType, Ld->isExpandingLoad());
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
  return NewLoad;
}

// Fold an extend of (setgt X, -1) — an "is non-negative" test — into an
// invert-and-shift of the sign bit; continued below.
static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
                                       bool LegalOperations) {
  assert((N->getOpcode() == ISD::SIGN_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");

  // Match a single-use i1 setcc feeding the extend, pre-legalization only.
  SDValue SetCC = N->getOperand(0);
  if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
      !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
    return SDValue();

  SDValue X = SetCC.getOperand(0);
  SDValue Ones = SetCC.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  EVT VT = N->getValueType(0);
  EVT XVT = X.getValueType();
  // setge X, C is canonicalized to setgt, so we do not need to match that
  // pattern. The setlt sibling is folded in SimplifySelectCC() because it does
  // not require the 'not' op.
  if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
    // Invert and smear/shift the sign bit:
    // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
    // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
    SDLoc DL(N);
    unsigned ShCt = VT.getSizeInBits() - 1;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    // Respect targets for which a wide shift is more expensive than the test.
    if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
      SDValue NotX = DAG.getNOT(DL, X, VT);
      SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT);
      auto ShiftOpcode =
        N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
      return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
    }
  }
  return SDValue();
}

/// Combine visitor for SIGN_EXTEND: tries constant folding, ext-of-ext
/// collapsing, trunc/sext elimination, sextload formation, setcc lowering and
/// several algebraic rewrites. Returns the replacement value, SDValue(N, 0)
/// when N was rewritten in place, or an empty SDValue when nothing applied.
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  // fold (sext (sext x)) -> (sext x)
  // fold (sext (aext x)) -> (sext x)
  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));

  if (N0.getOpcode() == ISD::TRUNCATE) {
    // fold (sext (truncate (load x))) -> (sext (smaller load x))
    // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
      SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }

    // See if the value being truncated is already sign extended. If so, just
    // eliminate the trunc/sext pair.
    SDValue Op = N0.getOperand(0);
    unsigned OpBits = Op.getScalarValueSizeInBits();
    unsigned MidBits = N0.getScalarValueSizeInBits();
    unsigned DestBits = VT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op);

    if (OpBits == DestBits) {
      // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
      // bits, it is already ready.
      if (NumSignBits > DestBits-MidBits)
        return Op;
    } else if (OpBits < DestBits) {
      // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
      // bits, just sext from i32.
      if (NumSignBits > OpBits-MidBits)
        return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
    } else {
      // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
      // bits, just truncate to i32.
      if (NumSignBits > OpBits-MidBits)
        return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }

    // fold (sext (truncate x)) -> (sextinreg x).
    if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
                                                 N0.getValueType())) {
      // Bring Op to the destination width before the in-register extension.
      if (OpBits < DestBits)
        Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
      else if (OpBits > DestBits)
        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
                         DAG.getValueType(N0.getValueType()));
    }
  }

  // Try to simplify (sext (load x)).
  if (SDValue foldedExt =
          tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                             ISD::SEXTLOAD, ISD::SIGN_EXTEND))
    return foldedExt;

  if (SDValue foldedExt =
          tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
                                   ISD::SIGN_EXTEND))
    return foldedExt;

  // fold (sext (load x)) to multiple smaller sextloads.
  // Only on illegal but splittable vectors.
  if (SDValue ExtLoad = CombineExtLoad(N))
    return ExtLoad;

  // Try to simplify (sext (sextload x)).
  if (SDValue foldedExt = tryToFoldExtOfExtload(
          DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
    return foldedExt;

  // fold (sext (and/or/xor (load x), cst)) ->
  //      (and/or/xor (sextload x), (sext cst))
  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
       N0.getOpcode() == ISD::XOR) &&
      isa<LoadSDNode>(N0.getOperand(0)) &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
    LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
    EVT MemVT = LN00->getMemoryVT();
    // A zextload already in place would zero the bits we need sign-extended.
    if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
        LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
      SmallVector<SDNode*, 4> SetCCs;
      bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                             ISD::SIGN_EXTEND, SetCCs, TLI);
      if (DoXform) {
        SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
                                         LN00->getChain(), LN00->getBasePtr(),
                                         LN00->getMemoryVT(),
                                         LN00->getMemOperand());
        // Sign-extend the logic-op constant into the wider type.
        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
        Mask = Mask.sext(VT.getSizeInBits());
        SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                  ExtLoad, DAG.getConstant(Mask, DL, VT));
        ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
        bool NoReplaceTruncAnd = !N0.hasOneUse();
        bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
        CombineTo(N, And);
        // If N0 has multiple uses, change other uses as well.
        if (NoReplaceTruncAnd) {
          SDValue TruncAnd =
              DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
          CombineTo(N0.getNode(), TruncAnd);
        }
        if (NoReplaceTrunc) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
        } else {
          // Other load users keep the narrow value via a truncate.
          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                      LN00->getValueType(0), ExtLoad);
          CombineTo(LN00, Trunc, ExtLoad.getValue(1));
        }
        return SDValue(N,0); // Return N so it doesn't get rechecked!
      }
    }
  }

  if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
    return V;

  if (N0.getOpcode() == ISD::SETCC) {
    SDValue N00 = N0.getOperand(0);
    SDValue N01 = N0.getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
    EVT N00VT = N0.getOperand(0).getValueType();

    // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
    // Only do this before legalize for now.
    if (VT.isVector() && !LegalOperations &&
        TLI.getBooleanContents(N00VT) ==
            TargetLowering::ZeroOrNegativeOneBooleanContent) {
      // On some architectures (such as SSE/NEON/etc) the SETCC result type is
      // of the same size as the compared operands. Only optimize sext(setcc())
      // if this is the case.
      EVT SVT = getSetCCResultType(N00VT);

      // If we already have the desired type, don't change it.
      if (SVT != N0.getValueType()) {
        // We know that the # elements of the results is the same as the
        // # elements of the compare (and the # elements of the compare result
        // for that matter). Check to see that they are the same size. If so,
        // we know that the element size of the sext'd result matches the
        // element size of the compare operands.
        if (VT.getSizeInBits() == SVT.getSizeInBits())
          return DAG.getSetCC(DL, VT, N00, N01, CC);

        // If the desired elements are smaller or larger than the source
        // elements, we can use a matching integer vector type and then
        // truncate/sign extend.
        EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
        if (SVT == MatchingVecType) {
          SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
          return DAG.getSExtOrTrunc(VsetCC, DL, VT);
        }
      }
    }

    // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
    // Here, T can be 1 or -1, depending on the type of the setcc and
    // getBooleanContents().
    unsigned SetCCWidth = N0.getScalarValueSizeInBits();

    // To determine the "true" side of the select, we need to know the high bit
    // of the value returned by the setcc if it evaluates to true.
    // If the type of the setcc is i1, then the true case of the select is just
    // sext(i1 1), that is, -1.
    // If the type of the setcc is larger (say, i8) then the value of the high
    // bit depends on getBooleanContents(), so ask TLI for a real "true" value
    // of the appropriate width.
    SDValue ExtTrueVal = (SetCCWidth == 1)
                             ? DAG.getAllOnesConstant(DL, VT)
                             : DAG.getBoolConstant(true, DL, VT, N00VT);
    SDValue Zero = DAG.getConstant(0, DL, VT);
    if (SDValue SCC =
            SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
      return SCC;

    if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
      EVT SetCCVT = getSetCCResultType(N00VT);
      // Don't do this transform for i1 because there's a select transform
      // that would reverse it.
      // TODO: We should not do this transform at all without a target hook
      // because a sext is likely cheaper than a select?
      if (SetCCVT.getScalarSizeInBits() != 1 &&
          (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
        SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
        return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
      }
    }
  }

  // fold (sext x) -> (zext x) if the sign bit is known zero.
  if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
      DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  // Eliminate this sign extend by doing a negation in the destination type:
  // sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64)
  if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
      isNullOrNullSplat(N0.getOperand(0)) &&
      N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND &&
      TLI.isOperationLegalOrCustom(ISD::SUB, VT)) {
    SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT);
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext);
  }
  // Eliminate this sign extend by doing a decrement in the destination type:
  // sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1)
  if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
      isAllOnesOrAllOnesSplat(N0.getOperand(1)) &&
      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
      TLI.isOperationLegalOrCustom(ISD::ADD, VT)) {
    SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
    return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
  }

  return SDValue();
}

// isTruncateOf - If N is a truncate of some other value, return true, record
// the value being truncated in Op and which of Op's bits are zero/one in Known.
// This function computes KnownBits to avoid a duplicated call to
// computeKnownBits in the caller.
9850 static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, 9851 KnownBits &Known) { 9852 if (N->getOpcode() == ISD::TRUNCATE) { 9853 Op = N->getOperand(0); 9854 Known = DAG.computeKnownBits(Op); 9855 return true; 9856 } 9857 9858 if (N.getOpcode() != ISD::SETCC || 9859 N.getValueType().getScalarType() != MVT::i1 || 9860 cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE) 9861 return false; 9862 9863 SDValue Op0 = N->getOperand(0); 9864 SDValue Op1 = N->getOperand(1); 9865 assert(Op0.getValueType() == Op1.getValueType()); 9866 9867 if (isNullOrNullSplat(Op0)) 9868 Op = Op1; 9869 else if (isNullOrNullSplat(Op1)) 9870 Op = Op0; 9871 else 9872 return false; 9873 9874 Known = DAG.computeKnownBits(Op); 9875 9876 return (Known.Zero | 1).isAllOnesValue(); 9877 } 9878 9879 /// Given an extending node with a pop-count operand, if the target does not 9880 /// support a pop-count in the narrow source type but does support it in the 9881 /// destination type, widen the pop-count to the destination type. 
static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
  assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
          Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");

  SDValue CtPop = Extend->getOperand(0);
  if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
    return SDValue();

  // Only widen when the narrow ctpop is unsupported but the wide one is.
  EVT VT = Extend->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) ||
      !TLI.isOperationLegalOrCustom(ISD::CTPOP, VT))
    return SDValue();

  // zext (ctpop X) --> ctpop (zext X)
  SDLoc DL(Extend);
  SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
  return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
}

/// Combine visitor for ZERO_EXTEND: tries constant folding, ext-of-ext
/// collapsing, trunc/zext elimination, zextload formation, setcc lowering and
/// shift rewrites. Returns the replacement value, SDValue(N, 0) when N was
/// rewritten in place, or an empty SDValue when nothing applied.
SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  // fold (zext (zext x)) -> (zext x)
  // fold (zext (aext x)) -> (zext x)
  if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
                       N0.getOperand(0));

  // fold (zext (truncate x)) -> (zext x) or
  //      (zext (truncate x)) -> (truncate x)
  // This is valid when the truncated bits of x are already zero.
  SDValue Op;
  KnownBits Known;
  if (isTruncateOf(DAG, N0, Op, Known)) {
    // Bits that the truncate discards and the zext would re-zero; if they are
    // already known zero, the trunc/zext pair is a no-op on the value.
    APInt TruncatedBits =
      (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
      APInt(Op.getScalarValueSizeInBits(), 0) :
      APInt::getBitsSet(Op.getScalarValueSizeInBits(),
                        N0.getScalarValueSizeInBits(),
                        std::min(Op.getScalarValueSizeInBits(),
                                 VT.getScalarSizeInBits()));
    if (TruncatedBits.isSubsetOf(Known.Zero))
      return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
  }

  // fold (zext (truncate x)) -> (and x, mask)
  if (N0.getOpcode() == ISD::TRUNCATE) {
    // fold (zext (truncate (load x))) -> (zext (smaller load x))
    // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
      SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }

    EVT SrcVT = N0.getOperand(0).getValueType();
    EVT MinVT = N0.getValueType();

    // Try to mask before the extension to avoid having to generate a larger mask,
    // possibly over several sub-vectors.
    if (SrcVT.bitsLT(VT) && VT.isVector()) {
      if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
                               TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
        SDValue Op = N0.getOperand(0);
        Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
        AddToWorklist(Op.getNode());
        SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
        // Transfer the debug info; the new node is equivalent to N0.
        DAG.transferDbgValues(N0, ZExtOrTrunc);
        return ZExtOrTrunc;
      }
    }

    if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
      SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
      AddToWorklist(Op.getNode());
      SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
      // We may safely transfer the debug info describing the truncate node over
      // to the equivalent and operation.
      DAG.transferDbgValues(N0, And);
      return And;
    }
  }

  // Fold (zext (and (trunc x), cst)) -> (and x, cst),
  // if either of the casts is not free.
  if (N0.getOpcode() == ISD::AND &&
      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
                           N0.getValueType()) ||
       !TLI.isZExtFree(N0.getValueType(), VT))) {
    SDValue X = N0.getOperand(0).getOperand(0);
    X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
    // Widen the constant mask; it also zeroes the bits the zext would zero.
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    Mask = Mask.zext(VT.getSizeInBits());
    SDLoc DL(N);
    return DAG.getNode(ISD::AND, DL, VT,
                       X, DAG.getConstant(Mask, DL, VT));
  }

  // Try to simplify (zext (load x)).
  if (SDValue foldedExt =
          tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                             ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
    return foldedExt;

  if (SDValue foldedExt =
          tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
                                   ISD::ZERO_EXTEND))
    return foldedExt;

  // fold (zext (load x)) to multiple smaller zextloads.
  // Only on illegal but splittable vectors.
  if (SDValue ExtLoad = CombineExtLoad(N))
    return ExtLoad;

  // fold (zext (and/or/xor (load x), cst)) ->
  //      (and/or/xor (zextload x), (zext cst))
  // Unless (and (load x) cst) will match as a zextload already and has
  // additional users.
  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
       N0.getOpcode() == ISD::XOR) &&
      isa<LoadSDNode>(N0.getOperand(0)) &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
    LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
    EVT MemVT = LN00->getMemoryVT();
    // An existing sextload would sign-fill the bits we need zeroed.
    if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
        LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
      bool DoXform = true;
      SmallVector<SDNode*, 4> SetCCs;
      if (!N0.hasOneUse()) {
        // Skip the transform when the and+load will already match as a
        // zextload for its other users.
        if (N0.getOpcode() == ISD::AND) {
          auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
          EVT LoadResultTy = AndC->getValueType(0);
          EVT ExtVT;
          if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
            DoXform = false;
        }
      }
      if (DoXform)
        DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                          ISD::ZERO_EXTEND, SetCCs, TLI);
      if (DoXform) {
        SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
                                         LN00->getChain(), LN00->getBasePtr(),
                                         LN00->getMemoryVT(),
                                         LN00->getMemOperand());
        // Zero-extend the logic-op constant into the wider type.
        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
        Mask = Mask.zext(VT.getSizeInBits());
        SDLoc DL(N);
        SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                  ExtLoad, DAG.getConstant(Mask, DL, VT));
        ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
        bool NoReplaceTruncAnd = !N0.hasOneUse();
        bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
        CombineTo(N, And);
        // If N0 has multiple uses, change other uses as well.
        if (NoReplaceTruncAnd) {
          SDValue TruncAnd =
              DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
          CombineTo(N0.getNode(), TruncAnd);
        }
        if (NoReplaceTrunc) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
        } else {
          // Other load users keep the narrow value via a truncate.
          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                      LN00->getValueType(0), ExtLoad);
          CombineTo(LN00, Trunc, ExtLoad.getValue(1));
        }
        return SDValue(N,0); // Return N so it doesn't get rechecked!
      }
    }
  }

  // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
  //      (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
  if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
    return ZExtLoad;

  // Try to simplify (zext (zextload x)).
  if (SDValue foldedExt = tryToFoldExtOfExtload(
          DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
    return foldedExt;

  if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
    return V;

  if (N0.getOpcode() == ISD::SETCC) {
    // Only do this before legalize for now.
    if (!LegalOperations && VT.isVector() &&
        N0.getValueType().getVectorElementType() == MVT::i1) {
      EVT N00VT = N0.getOperand(0).getValueType();
      if (getSetCCResultType(N00VT) == N0.getValueType())
        return SDValue();

      // We know that the # elements of the results is the same as the #
      // elements of the compare (and the # elements of the compare result for
      // that matter). Check to see that they are the same size. If so, we know
      // that the element size of the sext'd result matches the element size of
      // the compare operands.
      SDLoc DL(N);
      SDValue VecOnes = DAG.getConstant(1, DL, VT);
      if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
        // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors.
        SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
                                     N0.getOperand(1), N0.getOperand(2));
        return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes);
      }

      // If the desired elements are smaller or larger than the source
      // elements we can use a matching integer vector type and then
      // truncate/sign extend.
      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
      SDValue VsetCC =
          DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
                      N0.getOperand(1), N0.getOperand(2));
      return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT),
                         VecOnes);
    }

    // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
    SDLoc DL(N);
    if (SDValue SCC = SimplifySelectCC(
            DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
            DAG.getConstant(0, DL, VT),
            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
      return SCC;
  }

  // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
      isa<ConstantSDNode>(N0.getOperand(1)) &&
      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
      N0.hasOneUse()) {
    SDValue ShAmt = N0.getOperand(1);
    if (N0.getOpcode() == ISD::SHL) {
      SDValue InnerZExt = N0.getOperand(0);
      // If the original shl may be shifting out bits, do not perform this
      // transformation.
      unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
                               InnerZExt.getOperand(0).getValueSizeInBits();
      if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits))
        return SDValue();
    }

    SDLoc DL(N);

    // Ensure that the shift amount is wide enough for the shifted value.
    if (VT.getSizeInBits() >= 256)
      ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);

    return DAG.getNode(N0.getOpcode(), DL, VT,
                       DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
                       ShAmt);
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  if (SDValue NewCtPop = widenCtPop(N, DAG))
    return NewCtPop;

  return SDValue();
}

/// Combine patterns rooted at an ISD::ANY_EXTEND node. Returns the
/// replacement value, SDValue(N, 0) to signal "handled in place, do not
/// recheck", or an empty SDValue when no fold applies.
SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  // fold (aext (aext x)) -> (aext x)
  // fold (aext (zext x)) -> (zext x)
  // fold (aext (sext x)) -> (sext x)
  if (N0.getOpcode() == ISD::ANY_EXTEND ||
      N0.getOpcode() == ISD::ZERO_EXTEND ||
      N0.getOpcode() == ISD::SIGN_EXTEND)
    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));

  // fold (aext (truncate (load x))) -> (aext (smaller load x))
  // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
  if (N0.getOpcode() == ISD::TRUNCATE) {
    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
      SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }

  // fold (aext (truncate x))
  if (N0.getOpcode() == ISD::TRUNCATE)
    return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);

  // Fold (aext (and (trunc x), cst)) -> (and x, cst)
  // if the trunc is not free.
  if (N0.getOpcode() == ISD::AND &&
      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
                          N0.getValueType())) {
    SDLoc DL(N);
    SDValue X = N0.getOperand(0).getOperand(0);
    X = DAG.getAnyExtOrTrunc(X, DL, VT);
    // The AND mask only covered the narrow value; widen it with zeros so the
    // semantics are unchanged in the wide type.
    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
    Mask = Mask.zext(VT.getSizeInBits());
    return DAG.getNode(ISD::AND, DL, VT,
                       X, DAG.getConstant(Mask, DL, VT));
  }

  // fold (aext (load x)) -> (aext (truncate (extload x)))
  // None of the supported targets knows how to perform load and any_ext
  // on vectors in one instruction. We only perform this transformation on
  // scalars.
  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
      ISD::isUNINDEXEDLoad(N0.getNode()) &&
      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
    bool DoXform = true;
    SmallVector<SDNode*, 4> SetCCs;
    if (!N0.hasOneUse())
      DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
                                        TLI);
    if (DoXform) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
      SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                       LN0->getChain(),
                                       LN0->getBasePtr(), N0.getValueType(),
                                       LN0->getMemOperand());
      ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
      // If the load value is used only by N, replace it via CombineTo N.
      bool NoReplaceTrunc = N0.hasOneUse();
      CombineTo(N, ExtLoad);
      if (NoReplaceTrunc) {
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
        recursivelyDeleteUnusedNodes(LN0);
      } else {
        // Other users of the load remain; give them a truncate of the wide
        // extending load instead.
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                    N0.getValueType(), ExtLoad);
        CombineTo(LN0, Trunc, ExtLoad.getValue(1));
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }

  // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
  // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
  // fold (aext ( extload x)) -> (aext (truncate (extload x)))
  if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
      ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    ISD::LoadExtType ExtType = LN0->getExtensionType();
    EVT MemVT = LN0->getMemoryVT();
    if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
      SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
                                       VT, LN0->getChain(), LN0->getBasePtr(),
                                       MemVT, LN0->getMemOperand());
      CombineTo(N, ExtLoad);
      DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
      recursivelyDeleteUnusedNodes(LN0);
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }

  if (N0.getOpcode() == ISD::SETCC) {
    // For vectors:
    // aext(setcc) -> vsetcc
    // aext(setcc) -> truncate(vsetcc)
    // aext(setcc) -> aext(vsetcc)
    // Only do this before legalize for now.
    if (VT.isVector() && !LegalOperations) {
      EVT N00VT = N0.getOperand(0).getValueType();
      // If the setcc already produces the preferred result type for this
      // compare there is nothing to gain here.
      if (getSetCCResultType(N00VT) == N0.getValueType())
        return SDValue();

      // We know that the # elements of the results is the same as the
      // # elements of the compare (and the # elements of the compare result
      // for that matter). Check to see that they are the same size. If so,
      // we know that the element size of the sext'd result matches the
      // element size of the compare operands.
      if (VT.getSizeInBits() == N00VT.getSizeInBits())
        return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                            N0.getOperand(1),
                            cast<CondCodeSDNode>(N0.getOperand(2))->get());

      // If the desired elements are smaller or larger than the source
      // elements we can use a matching integer vector type and then
      // truncate/any extend
      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
      SDValue VsetCC =
          DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
                       N0.getOperand(1),
                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
      return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
    }

    // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
    SDLoc DL(N);
    if (SDValue SCC = SimplifySelectCC(
            DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
            DAG.getConstant(0, DL, VT),
            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
      return SCC;
  }

  if (SDValue NewCtPop = widenCtPop(N, DAG))
    return NewCtPop;

  return SDValue();
}

/// Combine AssertSext/AssertZext nodes (assertions about the extended bits of
/// their operand) with neighbouring asserts and truncates.
SDValue DAGCombiner::visitAssertExt(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT AssertVT = cast<VTSDNode>(N1)->getVT();

  // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
  if (N0.getOpcode() == Opcode &&
      AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
    return N0;

  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == Opcode) {
    // We have an assert, truncate, assert sandwich. Make one stronger assert
    // by asserting on the smallest asserted type to the larger source type.
    // This eliminates the later assert:
    // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
    // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
    SDValue BigA = N0.getOperand(0);
    EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
    assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
           "Asserting zero/sign-extended bits to a type larger than the "
           "truncated destination does not provide information");

    SDLoc DL(N);
    // The stronger assertion is the one on the narrower type.
    EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
    SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
    SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
                                    BigA.getOperand(0), MinAssertVTVal);
    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
  }

  // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
  // than X. Just move the AssertZext in front of the truncate and drop the
  // AssertSExt.
  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == ISD::AssertSext &&
      Opcode == ISD::AssertZext) {
    SDValue BigA = N0.getOperand(0);
    EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
    assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
           "Asserting zero/sign-extended bits to a type larger than the "
           "truncated destination does not provide information");

    if (AssertVT.bitsLT(BigA_AssertVT)) {
      SDLoc DL(N);
      SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
                                      BigA.getOperand(0), N1);
      return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
    }
  }

  return SDValue();
}

/// If the result of a wider load is shifted right by N bits and then
/// truncated to a narrower type, and N is a multiple of the number of bits of
/// the narrower type, transform it to a narrower load from address + N / (num
/// of bits of new type). Also narrow the load if the result is masked with an
/// AND to effectively produce a smaller type. If the result is to be extended,
/// also fold the extension to form an extending load.
SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
  unsigned Opc = N->getOpcode();

  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT ExtVT = VT;

  // This transformation isn't valid for vector loads.
  if (VT.isVector())
    return SDValue();

  unsigned ShAmt = 0;
  bool HasShiftedOffset = false;
  // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
  // extended to VT.
  if (Opc == ISD::SIGN_EXTEND_INREG) {
    ExtType = ISD::SEXTLOAD;
    ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
  } else if (Opc == ISD::SRL) {
    // Another special-case: SRL is basically zero-extending a narrower value,
    // or it maybe shifting a higher subword, half or byte into the lowest
    // bits.
    ExtType = ISD::ZEXTLOAD;
    // For SRL the node itself is the value being narrowed; the load is its
    // first operand.
    N0 = SDValue(N, 0);

    auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
    auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!N01 || !LN0)
      return SDValue();

    uint64_t ShiftAmt = N01->getZExtValue();
    uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
    if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
    else
      ExtVT = EVT::getIntegerVT(*DAG.getContext(),
                                VT.getSizeInBits() - ShiftAmt);
  } else if (Opc == ISD::AND) {
    // An AND with a constant mask is the same as a truncate + zero-extend.
    auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!AndC)
      return SDValue();

    const APInt &Mask = AndC->getAPIntValue();
    unsigned ActiveBits = 0;
    if (Mask.isMask()) {
      ActiveBits = Mask.countTrailingOnes();
    } else if (Mask.isShiftedMask()) {
      // A shifted mask selects a field that does not start at bit 0; remember
      // the offset so the narrowed result can be shifted back into place.
      ShAmt = Mask.countTrailingZeros();
      APInt ShiftedMask = Mask.lshr(ShAmt);
      ActiveBits = ShiftedMask.countTrailingOnes();
      HasShiftedOffset = true;
    } else
      return SDValue();

    ExtType = ISD::ZEXTLOAD;
    ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
  }

  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
    SDValue SRL = N0;
    if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
      ShAmt = ConstShift->getZExtValue();
      unsigned EVTBits = ExtVT.getSizeInBits();
      // Is the shift amount a multiple of size of VT?
      if ((ShAmt & (EVTBits-1)) == 0) {
        N0 = N0.getOperand(0);
        // Is the load width a multiple of size of VT?
        if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0)
          return SDValue();
      }

      // At this point, we must have a load or else we can't do the transform.
      if (!isa<LoadSDNode>(N0)) return SDValue();

      auto *LN0 = cast<LoadSDNode>(N0);

      // Because a SRL must be assumed to *need* to zero-extend the high bits
      // (as opposed to anyext the high bits), we can't combine the zextload
      // lowering of SRL and an sextload.
      if (LN0->getExtensionType() == ISD::SEXTLOAD)
        return SDValue();

      // If the shift amount is larger than the input type then we're not
      // accessing any of the loaded bytes. If the load was a zextload/extload
      // then the result of the shift+trunc is zero/undef (handled elsewhere).
      if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
        return SDValue();

      // If the SRL is only used by a masking AND, we may be able to adjust
      // the ExtVT to make the AND redundant.
      SDNode *Mask = *(SRL->use_begin());
      if (Mask->getOpcode() == ISD::AND &&
          isa<ConstantSDNode>(Mask->getOperand(1))) {
        const APInt &ShiftMask =
          cast<ConstantSDNode>(Mask->getOperand(1))->getAPIntValue();
        if (ShiftMask.isMask()) {
          EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                           ShiftMask.countTrailingOnes());
          // If the mask is smaller, recompute the type.
          if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) &&
              TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT))
            ExtVT = MaskedVT;
        }
      }
    }
  }

  // If the load is shifted left (and the result isn't shifted back right),
  // we can fold the truncate through the shift.
  unsigned ShLeftAmt = 0;
  if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
      ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
    if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
      ShLeftAmt = N01->getZExtValue();
      N0 = N0.getOperand(0);
    }
  }

  // If we haven't found a load, we can't narrow it.
  if (!isa<LoadSDNode>(N0))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  // Reducing the width of a volatile load is illegal. For atomics, we may be
  // able to reduce the width provided we never widen again. (see D66309)
  if (!LN0->isSimple() ||
      !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
    return SDValue();

  // Mirror a (little-endian style) bit offset within the stored width so the
  // same bytes are addressed on a big-endian target.
  auto AdjustBigEndianShift = [&](unsigned ShAmt) {
    unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
    unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
    return LVTStoreBits - EVTStoreBits - ShAmt;
  };

  // For big endian targets, we need to adjust the offset to the pointer to
  // load the correct bytes.
  if (DAG.getDataLayout().isBigEndian())
    ShAmt = AdjustBigEndianShift(ShAmt);

  uint64_t PtrOff = ShAmt / 8; // bit offset -> byte offset
  unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
  SDLoc DL(LN0);
  // The original load itself didn't wrap, so an offset within it doesn't.
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(true);
  SDValue NewPtr =
      DAG.getMemBasePlusOffset(LN0->getBasePtr(), PtrOff, DL, Flags);
  AddToWorklist(NewPtr.getNode());

  SDValue Load;
  if (ExtType == ISD::NON_EXTLOAD)
    Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
                       LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                       LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
  else
    Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
                          LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
                          NewAlign, LN0->getMemOperand()->getFlags(),
                          LN0->getAAInfo());

  // Replace the old load's chain with the new load's chain.
  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));

  // Shift the result left, if we've swallowed a left shift.
  SDValue Result = Load;
  if (ShLeftAmt != 0) {
    EVT ShImmTy = getShiftAmountTy(Result.getValueType());
    if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
      ShImmTy = VT;
    // If the shift amount is as large as the result size (but, presumably,
    // no larger than the source) then the useful bits of the result are
    // zero; we can't simply return the shortened shift, because the result
    // of that operation is undefined.
    if (ShLeftAmt >= VT.getSizeInBits())
      Result = DAG.getConstant(0, DL, VT);
    else
      Result = DAG.getNode(ISD::SHL, DL, VT,
                           Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
  }

  if (HasShiftedOffset) {
    // Recalculate the shift amount after it has been altered to calculate
    // the offset.
    if (DAG.getDataLayout().isBigEndian())
      ShAmt = AdjustBigEndianShift(ShAmt);

    // We're using a shifted mask, so the load now has an offset. This means
    // that data has been loaded into the lower bytes than it would have been
    // before, so we need to shl the loaded data into the correct position in
    // the register.
    SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
    Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
  }

  // Return the new loaded value.
  return Result;
}

/// Combine patterns rooted at an ISD::SIGN_EXTEND_INREG node.
SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  // NOTE(review): this local deliberately(?) shadows the type name 'EVT'; it
  // is the VT being sign-extended *from*.
  EVT EVT = cast<VTSDNode>(N1)->getVT();
  unsigned VTBits = VT.getScalarSizeInBits();
  unsigned EVTBits = EVT.getScalarSizeInBits();

  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  // fold (sext_in_reg c1) -> c1
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);

  // If the input is already sign extended, just drop the extension.
  if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
    return N0;

  // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
  if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
      EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                       N0.getOperand(0), N1);

  // fold (sext_in_reg (sext x)) -> (sext x)
  // fold (sext_in_reg (aext x)) -> (sext x)
  // if x is small enough or if we know that x has more than 1 sign bit and the
  // sign_extend_inreg is extending from one of them.
  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    unsigned N00Bits = N00.getScalarValueSizeInBits();
    if ((N00Bits <= EVTBits ||
         (N00Bits - DAG.ComputeNumSignBits(N00)) < EVTBits) &&
        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
  }

  // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
  if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
       N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
       N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
      N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
    if (!LegalOperations ||
        TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
      return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
                         N0.getOperand(0));
  }

  // fold (sext_in_reg (zext x)) -> (sext x)
  // iff we are extending the source sign bit.
  if (N0.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getScalarValueSizeInBits() == EVTBits &&
        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
      // NOTE(review): N1 is passed as an extra operand to the normally unary
      // ISD::SIGN_EXTEND here — looks suspicious; confirm against upstream.
      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
  }

  // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1)))
    return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType());

  // fold operands of sext_in_reg based on knowledge that the top bits are not
  // demanded.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (sext_in_reg (load x)) -> (smaller sextload x)
  // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
  if (SDValue NarrowLoad = ReduceLoadWidth(N))
    return NarrowLoad;

  // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
  // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
  // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
  if (N0.getOpcode() == ISD::SRL) {
    if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
      if (ShAmt->getAPIntValue().ule(VTBits - EVTBits)) {
        // We can turn this into an SRA iff the input to the SRL is already sign
        // extended enough.
        unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
        if (((VTBits - EVTBits) - ShAmt->getZExtValue()) < InSignBits)
          return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
                             N0.getOperand(1));
      }
  }

  // fold (sext_inreg (extload x)) -> (sextload x)
  // If sextload is not supported by target, we can only do the combine when
  // load has one use. Doing otherwise can block folding the extload with other
  // extends that the target does support.
  if (ISD::isEXTLoad(N0.getNode()) &&
      ISD::isUNINDEXEDLoad(N0.getNode()) &&
      EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
        N0.hasOneUse()) ||
       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(),
                                     LN0->getBasePtr(), EVT,
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
    AddToWorklist(ExtLoad.getNode());
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }
  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
      N0.hasOneUse() &&
      EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(),
                                     LN0->getBasePtr(), EVT,
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
  if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) {
    if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
                                           N0.getOperand(1), false))
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                         BSwap, N1);
  }

  return SDValue();
}

/// Combine patterns rooted at an ISD::SIGN_EXTEND_VECTOR_INREG node.
SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

/// Combine patterns rooted at an ISD::ZERO_EXTEND_VECTOR_INREG node.
SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

SDValue
DAGCombiner::visitTRUNCATE(SDNode *N) { 10738 SDValue N0 = N->getOperand(0); 10739 EVT VT = N->getValueType(0); 10740 EVT SrcVT = N0.getValueType(); 10741 bool isLE = DAG.getDataLayout().isLittleEndian(); 10742 10743 // noop truncate 10744 if (SrcVT == VT) 10745 return N0; 10746 10747 // fold (truncate (truncate x)) -> (truncate x) 10748 if (N0.getOpcode() == ISD::TRUNCATE) 10749 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); 10750 10751 // fold (truncate c1) -> c1 10752 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { 10753 SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); 10754 if (C.getNode() != N) 10755 return C; 10756 } 10757 10758 // fold (truncate (ext x)) -> (ext x) or (truncate x) or x 10759 if (N0.getOpcode() == ISD::ZERO_EXTEND || 10760 N0.getOpcode() == ISD::SIGN_EXTEND || 10761 N0.getOpcode() == ISD::ANY_EXTEND) { 10762 // if the source is smaller than the dest, we still need an extend. 10763 if (N0.getOperand(0).getValueType().bitsLT(VT)) 10764 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); 10765 // if the source is larger than the dest, than we just need the truncate. 10766 if (N0.getOperand(0).getValueType().bitsGT(VT)) 10767 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); 10768 // if the source and dest are the same type, we can drop both the extend 10769 // and the truncate. 10770 return N0.getOperand(0); 10771 } 10772 10773 // If this is anyext(trunc), don't fold it, allow ourselves to be folded. 10774 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND)) 10775 return SDValue(); 10776 10777 // Fold extract-and-trunc into a narrow extract. 
For example: 10778 // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1) 10779 // i32 y = TRUNCATE(i64 x) 10780 // -- becomes -- 10781 // v16i8 b = BITCAST (v2i64 val) 10782 // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8) 10783 // 10784 // Note: We only run this optimization after type legalization (which often 10785 // creates this pattern) and before operation legalization after which 10786 // we need to be more careful about the vector instructions that we generate. 10787 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 10788 LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) { 10789 EVT VecTy = N0.getOperand(0).getValueType(); 10790 EVT ExTy = N0.getValueType(); 10791 EVT TrTy = N->getValueType(0); 10792 10793 unsigned NumElem = VecTy.getVectorNumElements(); 10794 unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits(); 10795 10796 EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem); 10797 assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); 10798 10799 SDValue EltNo = N0->getOperand(1); 10800 if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) { 10801 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 10802 int Index = isLE ? 
(Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); 10803 10804 SDLoc DL(N); 10805 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy, 10806 DAG.getBitcast(NVT, N0.getOperand(0)), 10807 DAG.getVectorIdxConstant(Index, DL)); 10808 } 10809 } 10810 10811 // trunc (select c, a, b) -> select c, (trunc a), (trunc b) 10812 if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) { 10813 if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) && 10814 TLI.isTruncateFree(SrcVT, VT)) { 10815 SDLoc SL(N0); 10816 SDValue Cond = N0.getOperand(0); 10817 SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); 10818 SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2)); 10819 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1); 10820 } 10821 } 10822 10823 // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits() 10824 if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() && 10825 (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) && 10826 TLI.isTypeDesirableForOp(ISD::SHL, VT)) { 10827 SDValue Amt = N0.getOperand(1); 10828 KnownBits Known = DAG.computeKnownBits(Amt); 10829 unsigned Size = VT.getScalarSizeInBits(); 10830 if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { 10831 SDLoc SL(N); 10832 EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); 10833 10834 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); 10835 if (AmtVT != Amt.getValueType()) { 10836 Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT); 10837 AddToWorklist(Amt.getNode()); 10838 } 10839 return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt); 10840 } 10841 } 10842 10843 // Attempt to pre-truncate BUILD_VECTOR sources. 
10844 if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations && 10845 TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType())) { 10846 SDLoc DL(N); 10847 EVT SVT = VT.getScalarType(); 10848 SmallVector<SDValue, 8> TruncOps; 10849 for (const SDValue &Op : N0->op_values()) { 10850 SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op); 10851 TruncOps.push_back(TruncOp); 10852 } 10853 return DAG.getBuildVector(VT, DL, TruncOps); 10854 } 10855 10856 // Fold a series of buildvector, bitcast, and truncate if possible. 10857 // For example fold 10858 // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to 10859 // (2xi32 (buildvector x, y)). 10860 if (Level == AfterLegalizeVectorOps && VT.isVector() && 10861 N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && 10862 N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && 10863 N0.getOperand(0).hasOneUse()) { 10864 SDValue BuildVect = N0.getOperand(0); 10865 EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); 10866 EVT TruncVecEltTy = VT.getVectorElementType(); 10867 10868 // Check that the element types match. 10869 if (BuildVectEltTy == TruncVecEltTy) { 10870 // Now we only need to compute the offset of the truncated elements. 10871 unsigned BuildVecNumElts = BuildVect.getNumOperands(); 10872 unsigned TruncVecNumElts = VT.getVectorNumElements(); 10873 unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts; 10874 10875 assert((BuildVecNumElts % TruncVecNumElts) == 0 && 10876 "Invalid number of elements"); 10877 10878 SmallVector<SDValue, 8> Opnds; 10879 for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) 10880 Opnds.push_back(BuildVect.getOperand(i)); 10881 10882 return DAG.getBuildVector(VT, SDLoc(N), Opnds); 10883 } 10884 } 10885 10886 // See if we can simplify the input to this truncate through knowledge that 10887 // only the low bits are being used. 
  // For example "trunc (or (shl x, 8), y)" -> trunc y
  // Currently we only perform this optimization on scalars because vectors
  // may have different active low bits.
  if (!VT.isVector()) {
    APInt Mask =
        APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
    if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
  }

  // fold (truncate (load x)) -> (smaller load x)
  // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
  if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
    if (SDValue Reduced = ReduceLoadWidth(N))
      return Reduced;

    // Handle the case where the load remains an extending load even
    // after truncation.
    if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
      // Only fold simple (non-volatile, non-atomic) loads whose memory type is
      // still narrower than the truncated result, so the extending-load form
      // remains valid at the new value type.
      if (LN0->isSimple() &&
          LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
        SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
                                         VT, LN0->getChain(), LN0->getBasePtr(),
                                         LN0->getMemoryVT(),
                                         LN0->getMemOperand());
        // Preserve the chain so other users of the old load stay ordered.
        DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
        return NewLoad;
      }
    }
  }

  // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
  // where ... are all 'undef'.
  if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
    SmallVector<EVT, 8> VTs;
    SDValue V;
    unsigned Idx = 0;
    unsigned NumDefs = 0;

    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
      SDValue X = N0.getOperand(i);
      if (!X.isUndef()) {
        V = X;
        Idx = i;
        NumDefs++;
      }
      // Stop if more than one members are non-undef.
      if (NumDefs > 1)
        break;
      // Record the truncated type of each concat operand (same element count,
      // narrower element type).
      VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
                                     VT.getVectorElementType(),
                                     X.getValueType().getVectorNumElements()));
    }

    // All operands undef: the whole truncate is undef.
    if (NumDefs == 0)
      return DAG.getUNDEF(VT);

    // Exactly one defined operand: truncate just that one and rebuild the
    // concat with undefs around it.
    if (NumDefs == 1) {
      assert(V.getNode() && "The single defined operand is empty!");
      SmallVector<SDValue, 8> Opnds;
      for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
        if (i != Idx) {
          Opnds.push_back(DAG.getUNDEF(VTs[i]));
          continue;
        }
        SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
        AddToWorklist(NV.getNode());
        Opnds.push_back(NV);
      }
      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
    }
  }

  // Fold truncate of a bitcast of a vector to an extract of the low vector
  // element.
  //
  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
  if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue VecSrc = N0.getOperand(0);
    EVT VecSrcVT = VecSrc.getValueType();
    if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
        (!LegalOperations ||
         TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
      SDLoc SL(N);

      // The "low" element is element 0 on little-endian and the last element
      // on big-endian targets.
      unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
                         DAG.getVectorIdxConstant(Idx, SL));
    }
  }

  // Simplify the operands using demanded-bits information.
  if (!VT.isVector() &&
      SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
  // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
  // When the adde's carry is not used.
  if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
      N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
      // We only do for addcarry before legalize operation
      ((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) ||
       TLI.isOperationLegal(N0.getOpcode(), VT))) {
    SDLoc SL(N);
    auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
    auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
    auto VTs = DAG.getVTList(VT, N0->getValueType(1));
    return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
  }

  // fold (truncate (extract_subvector(ext x))) ->
  //      (extract_subvector x)
  // TODO: This can be generalized to cover cases where the truncate and extract
  // do not fully cancel each other out.
  if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == ISD::SIGN_EXTEND ||
        N00.getOpcode() == ISD::ZERO_EXTEND ||
        N00.getOpcode() == ISD::ANY_EXTEND) {
      // Only fold when the extend's source element type already matches the
      // truncated element type, so truncate(extend(x)) == x elementwise.
      if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
          VT.getVectorElementType())
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
                           N00.getOperand(0), N0.getOperand(1));
    }
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  // Narrow a suitable binary operation with a non-opaque constant operand by
  // moving it ahead of the truncate. This is limited to pre-legalization
  // because targets may prefer a wider type during later combines and invert
  // this transform.
  switch (N0.getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (!LegalOperations && N0.hasOneUse() &&
        (isConstantOrConstantVector(N0.getOperand(0), true) ||
         isConstantOrConstantVector(N0.getOperand(1), true))) {
      // TODO: We already restricted this to pre-legalization, but for vectors
      // we are extra cautious to not create an unsupported operation.
      // Target-specific changes are likely needed to avoid regressions here.
      if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
        SDLoc DL(N);
        SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
        SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
        return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
      }
    }
  }

  return SDValue();
}

/// Return operand \p i of the BUILD_PAIR node \p N, looking through a
/// MERGE_VALUES wrapper to the underlying defining node if present.
static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
  SDValue Elt = N->getOperand(i);
  if (Elt.getOpcode() != ISD::MERGE_VALUES)
    return Elt.getNode();
  return Elt.getOperand(Elt.getResNo()).getNode();
}

/// build_pair (load, load) -> load
/// if load locations are consecutive.
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
  assert(N->getOpcode() == ISD::BUILD_PAIR);

  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
  LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));

  // A BUILD_PAIR always has the least significant part in elt 0 and the
  // most significant part in elt 1. So when combining into one large load, we
  // need to consider the endianness.
  // After the swap (on big-endian), LD1 is always the load at the lower
  // address, i.e. the one whose base pointer the merged load reuses.
  if (DAG.getDataLayout().isBigEndian())
    std::swap(LD1, LD2);

  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
      LD1->getAddressSpace() != LD2->getAddressSpace())
    return SDValue();
  EVT LD1VT = LD1->getValueType(0);
  unsigned LD1Bytes = LD1VT.getStoreSize();
  if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
      DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
    unsigned Align = LD1->getAlignment();
    unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
        VT.getTypeForEVT(*DAG.getContext()));

    // Only merge if the wider load does not require more alignment than the
    // first load already provides, and the wide LOAD is legal (post-legalize).
    if (NewAlign <= Align &&
        (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
      return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
                         LD1->getPointerInfo(), Align);
  }

  return SDValue();
}

static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
  // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
  // and Lo parts; on big-endian machines it doesn't.
  return DAG.getDataLayout().isBigEndian() ? 1 : 0;
}

static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
                                    const TargetLowering &TLI) {
  // If this is not a bitcast to an FP type or if the target doesn't have
  // IEEE754-compliant FP logic, we're done.
  EVT VT = N->getValueType(0);
  if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
    return SDValue();

  // TODO: Handle cases where the integer constant is a different scalar
  // bitwidth to the FP.
  SDValue N0 = N->getOperand(0);
  EVT SourceVT = N0.getValueType();
  if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())
    return SDValue();

  // Map the integer logic opcode to the equivalent FP sign-bit operation and
  // the constant mask that operation corresponds to:
  //   AND with ~sign-mask  -> fabs
  //   XOR with  sign-mask  -> fneg
  //   OR  with  sign-mask  -> fneg(fabs) (handled below)
  unsigned FPOpcode;
  APInt SignMask;
  switch (N0.getOpcode()) {
  case ISD::AND:
    FPOpcode = ISD::FABS;
    SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits());
    break;
  case ISD::XOR:
    FPOpcode = ISD::FNEG;
    SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
    break;
  case ISD::OR:
    FPOpcode = ISD::FABS;
    SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
    break;
  default:
    return SDValue();
  }

  // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
  // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
  // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
  //   fneg (fabs X)
  SDValue LogicOp0 = N0.getOperand(0);
  ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
  if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
      LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.getOperand(0).getValueType() == VT) {
    SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0));
    NumFPLogicOpsConv++;
    if (N0.getOpcode() == ISD::OR)
      return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
    return FPOp;
  }

  return SDValue();
}

SDValue DAGCombiner::visitBITCAST(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  // If the input is a BUILD_VECTOR with all constant elements, fold this now.
  // Only do this before legalize types, unless both types are integer and the
  // scalar type is legal. Only do this before legalize ops, since the target
  // maybe depending on the bitcast.
  // First check to see if this is all constant.
  // TODO: Support FP bitcasts after legalize types.
  if (VT.isVector() &&
      (!LegalTypes ||
       (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
        TLI.isTypeLegal(VT.getVectorElementType()))) &&
      N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
      cast<BuildVectorSDNode>(N0)->isConstant())
    return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
                                             VT.getVectorElementType());

  // If the input is a constant, let getNode fold it.
  if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
    // If we can't allow illegal operations, we need to check that this is just
    // a fp -> int or int -> fp conversion and that the resulting operation
    // will be legal.
    if (!LegalOperations ||
        (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
         TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
        (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
         TLI.isOperationLegal(ISD::Constant, VT))) {
      SDValue C = DAG.getBitcast(VT, N0);
      // getBitcast may constant-fold back to N itself; only return a change.
      if (C.getNode() != N)
        return C;
    }
  }

  // (conv (conv x, t1), t2) -> (conv x, t2)
  if (N0.getOpcode() == ISD::BITCAST)
    return DAG.getBitcast(VT, N0.getOperand(0));

  // fold (conv (load x)) -> (load (conv*)x)
  // If the resultant load doesn't need a higher alignment than the original!
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      // Do not remove the cast if the types differ in endian layout.
      TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
          TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
      // If the load is volatile, we only want to change the load type if the
      // resulting load is legal. Otherwise we might increase the number of
      // memory accesses. We don't care if the original type was legal or not
      // as we assume software couldn't rely on the number of accesses of an
      // illegal type.
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
       TLI.isOperationLegal(ISD::LOAD, VT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);

    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
                                    *LN0->getMemOperand())) {
      SDValue Load =
          DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                      LN0->getPointerInfo(), LN0->getAlignment(),
                      LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
      // Keep chain users of the old load ordered after the new one.
      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
      return Load;
    }
  }

  if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
    return V;

  // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
  // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
  //
  // For ppc_fp128:
  // fold (bitcast (fneg x)) ->
  //     flipbit = signbit
  //     (xor (bitcast x) (build_pair flipbit, flipbit))
  //
  // fold (bitcast (fabs x)) ->
  //     flipbit = (and (extract_element (bitcast x), 0), signbit)
  //     (xor (bitcast x) (build_pair flipbit, flipbit))
  // This often reduces constant pool loads.
  if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
       (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
      N0.getNode()->hasOneUse() && VT.isInteger() &&
      !VT.isVector() && !N0.getValueType().isVector()) {
    SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
    AddToWorklist(NewConv.getNode());

    SDLoc DL(N);
    if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
      assert(VT.getSizeInBits() == 128);
      // ppcf128 is a pair of doubles; the sign bit lives in one 64-bit half,
      // so build the flip mask per-half and apply via BUILD_PAIR.
      SDValue SignBit = DAG.getConstant(
          APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
      SDValue FlipBit;
      if (N0.getOpcode() == ISD::FNEG) {
        FlipBit = SignBit;
        AddToWorklist(FlipBit.getNode());
      } else {
        assert(N0.getOpcode() == ISD::FABS);
        SDValue Hi =
            DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
                        DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
                                              SDLoc(NewConv)));
        AddToWorklist(Hi.getNode());
        FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
        AddToWorklist(FlipBit.getNode());
      }
      SDValue FlipBits =
          DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
      AddToWorklist(FlipBits.getNode());
      return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
    }
    APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
    if (N0.getOpcode() == ISD::FNEG)
      return DAG.getNode(ISD::XOR, DL, VT,
                         NewConv, DAG.getConstant(SignBit, DL, VT));
    assert(N0.getOpcode() == ISD::FABS);
    return DAG.getNode(ISD::AND, DL, VT,
                       NewConv, DAG.getConstant(~SignBit, DL, VT));
  }

  // fold (bitconvert (fcopysign cst, x)) ->
  //         (or (and (bitconvert x), sign), (and cst, (not sign)))
  // Note that we don't handle (copysign x, cst) because this can always be
  // folded to an fneg or fabs.
  //
  // For ppc_fp128:
  // fold (bitcast (fcopysign cst, x)) ->
  //     flipbit = (and (extract_element
  //                     (xor (bitcast cst), (bitcast x)), 0),
  //                    signbit)
  //     (xor (bitcast cst) (build_pair flipbit, flipbit))
  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
      isa<ConstantFPSDNode>(N0.getOperand(0)) &&
      VT.isInteger() && !VT.isVector()) {
    unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
    EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
    if (isTypeLegal(IntXVT)) {
      SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
      AddToWorklist(X.getNode());

      // If X has a different width than the result/lhs, sext it or truncate it.
      unsigned VTWidth = VT.getSizeInBits();
      if (OrigXWidth < VTWidth) {
        X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
        AddToWorklist(X.getNode());
      } else if (OrigXWidth > VTWidth) {
        // To get the sign bit in the right place, we have to shift it right
        // before truncating.
        SDLoc DL(X);
        X = DAG.getNode(ISD::SRL, DL,
                        X.getValueType(), X,
                        DAG.getConstant(OrigXWidth-VTWidth, DL,
                                        X.getValueType()));
        AddToWorklist(X.getNode());
        X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
        AddToWorklist(X.getNode());
      }

      if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
        APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
        SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
        AddToWorklist(Cst.getNode());
        SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
        AddToWorklist(X.getNode());
        SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
        AddToWorklist(XorResult.getNode());
        SDValue XorResult64 = DAG.getNode(
            ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
            DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
                                  SDLoc(XorResult)));
        AddToWorklist(XorResult64.getNode());
        SDValue FlipBit =
            DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
                        DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
        AddToWorklist(FlipBit.getNode());
        SDValue FlipBits =
            DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
        AddToWorklist(FlipBits.getNode());
        return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
      }
      APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
      X = DAG.getNode(ISD::AND, SDLoc(X), VT,
                      X, DAG.getConstant(SignBit, SDLoc(X), VT));
      AddToWorklist(X.getNode());

      SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
      Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
                        Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
      AddToWorklist(Cst.getNode());

      return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
    }
  }

  // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
  if (N0.getOpcode() == ISD::BUILD_PAIR)
    if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
      return CombineLD;

  // Remove double bitcasts from shuffles - this is often a legacy of
  // XformToShuffleWithZero being used to combine bitmaskings (of
  // float vectors bitcast to integer vectors) into shuffles.
  // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
  if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
      N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() &&
      VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
      !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);

    // If operands are a bitcast, peek through if it casts the original VT.
    // If operands are a constant, just bitcast back to original VT.
    auto PeekThroughBitcast = [&](SDValue Op) {
      if (Op.getOpcode() == ISD::BITCAST &&
          Op.getOperand(0).getValueType() == VT)
        return SDValue(Op.getOperand(0));
      if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
          ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
        return DAG.getBitcast(VT, Op);
      return SDValue();
    };

    // FIXME: If either input vector is bitcast, try to convert the shuffle to
    // the result type of this bitcast. This would eliminate at least one
    // bitcast. See the transform in InstCombine.
    SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
    SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
    if (!(SV0 && SV1))
      return SDValue();

    // Each source element corresponds to MaskScale destination elements;
    // expand the shuffle mask accordingly (undef lanes stay -1).
    int MaskScale =
        VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
    SmallVector<int, 8> NewMask;
    for (int M : SVN->getMask())
      for (int i = 0; i != MaskScale; ++i)
        NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);

    SDValue LegalShuffle =
        TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG);
    if (LegalShuffle)
      return LegalShuffle;
  }

  return SDValue();
}

SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
  EVT VT = N->getValueType(0);
  return CombineConsecutiveLoads(N, VT);
}

/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
/// operands. DstEltVT indicates the destination element value type.
SDValue DAGCombiner::
ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
  EVT SrcEltVT = BV->getValueType(0).getVectorElementType();

  // If this is already the right type, we're done.
  if (SrcEltVT == DstEltVT) return SDValue(BV, 0);

  unsigned SrcBitSize = SrcEltVT.getSizeInBits();
  unsigned DstBitSize = DstEltVT.getSizeInBits();

  // If this is a conversion of N elements of one type to N elements of another
  // type, convert each element. This handles FP<->INT cases.
  if (SrcBitSize == DstBitSize) {
    SmallVector<SDValue, 8> Ops;
    for (SDValue Op : BV->op_values()) {
      // If the vector element type is not legal, the BUILD_VECTOR operands
      // are promoted and implicitly truncated. Make that explicit here.
      if (Op.getValueType() != SrcEltVT)
        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
      Ops.push_back(DAG.getBitcast(DstEltVT, Op));
      AddToWorklist(Ops.back().getNode());
    }
    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
                              BV->getValueType(0).getVectorNumElements());
    return DAG.getBuildVector(VT, SDLoc(BV), Ops);
  }

  // Otherwise, we're growing or shrinking the elements. To avoid having to
  // handle annoying details of growing/shrinking FP values, we convert them to
  // int first.
  if (SrcEltVT.isFloatingPoint()) {
    // Convert the input float vector to an int vector where the elements are
    // the same sizes.
    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
    BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
    SrcEltVT = IntVT;
  }

  // Now we know the input is an integer vector. If the output is a FP type,
  // convert to integer first, then to FP of the right size.
  if (DstEltVT.isFloatingPoint()) {
    EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
    SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();

    // Next, convert to FP elements of the same size.
    return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
  }

  SDLoc DL(BV);

  // Okay, we know the src/dst types are both integers of differing types.
  // Handling growing first.
  assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
  if (SrcBitSize < DstBitSize) {
    unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;

    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0, e = BV->getNumOperands(); i != e;
         i += NumInputsPerOutput) {
      bool isLE = DAG.getDataLayout().isLittleEndian();
      APInt NewBits = APInt(DstBitSize, 0);
      bool EltIsUndef = true;
      // Pack NumInputsPerOutput source elements into one destination element,
      // visiting them in endian order so the low source element lands in the
      // low bits on little-endian targets.
      for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
        // Shift the previously computed bits over.
        NewBits <<= SrcBitSize;
        SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
        if (Op.isUndef()) continue;
        EltIsUndef = false;

        NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
                   zextOrTrunc(SrcBitSize).zext(DstBitSize);
      }

      if (EltIsUndef)
        Ops.push_back(DAG.getUNDEF(DstEltVT));
      else
        Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
    }

    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
    return DAG.getBuildVector(VT, DL, Ops);
  }

  // Finally, this must be the case where we are shrinking elements: each input
  // turns into multiple outputs.
  unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
  EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
                            NumOutputsPerInput*BV->getNumOperands());
  SmallVector<SDValue, 8> Ops;

  for (const SDValue &Op : BV->op_values()) {
    if (Op.isUndef()) {
      Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
      continue;
    }

    APInt OpVal = cast<ConstantSDNode>(Op)->
                  getAPIntValue().zextOrTrunc(SrcBitSize);

    // Peel off DstBitSize bits at a time, from low bits to high bits.
    for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
      APInt ThisVal = OpVal.trunc(DstBitSize);
      Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
      OpVal.lshrInPlace(DstBitSize);
    }

    // For big endian targets, swap the order of the pieces of each element.
    if (DAG.getDataLayout().isBigEndian())
      std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
  }

  return DAG.getBuildVector(VT, DL, Ops);
}

/// Return true if this node may be combined into a fused multiply-add: it
/// either allows FP contraction or allows reassociation.
static bool isContractable(SDNode *N) {
  SDNodeFlags F = N->getFlags();
  return F.hasAllowContract() || F.hasAllowReassociation();
}

/// Try to perform FMA combining on a given FADD node.
SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  const TargetOptions &Options = DAG.getTarget().Options;

  // Floating-point multiply-add with intermediate rounding.
  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));

  // Floating-point multiply-add without intermediate rounding.
  bool HasFMA =
      TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));

  // No valid opcode, do not combine.
  if (!HasFMAD && !HasFMA)
    return SDValue();

  SDNodeFlags Flags = N->getFlags();
  bool CanFuse = Options.UnsafeFPMath || isContractable(N);
  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
                              CanFuse || HasFMAD);
  // If the addition is not contractable, do not combine.
  if (!AllowFusionGlobally && !isContractable(N))
    return SDValue();

  // Defer to the machine combiner when the target prefers to form FMAs there.
  const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
  if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
    return SDValue();

  // Always prefer FMAD to FMA for precision.
  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

  // Is the node an FMUL and contractable either due to global flags or
  // SDNodeFlags.
  auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
    if (N.getOpcode() != ISD::FMUL)
      return false;
    return AllowFusionGlobally || isContractable(N.getNode());
  };
  // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
  // prefer to fold the multiply with fewer uses.
  if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
    if (N0.getNode()->use_size() > N1.getNode()->use_size())
      std::swap(N0, N1);
  }

  // fold (fadd (fmul x, y), z) -> (fma x, y, z)
  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
    return DAG.getNode(PreferredFusedOpcode, SL, VT,
                       N0.getOperand(0), N0.getOperand(1), N1, Flags);
  }

  // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
  // Note: Commutes FADD operands.
  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
    return DAG.getNode(PreferredFusedOpcode, SL, VT,
                       N1.getOperand(0), N1.getOperand(1), N0, Flags);
  }

  // Look through FP_EXTEND nodes to do more combining.

  // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
  if (N0.getOpcode() == ISD::FP_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    if (isContractableFMUL(N00) &&
        TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                            N00.getValueType())) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                     N00.getOperand(0)),
                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                     N00.getOperand(1)), N1, Flags);
    }
  }

  // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
  // Note: Commutes FADD operands.
  if (N1.getOpcode() == ISD::FP_EXTEND) {
    SDValue N10 = N1.getOperand(0);
    if (isContractableFMUL(N10) &&
        TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                            N10.getValueType())) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                     N10.getOperand(0)),
                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                     N10.getOperand(1)), N0, Flags);
    }
  }

  // More folding opportunities when target permits.
  if (Aggressive) {
    // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
    if (CanFuse &&
        N0.getOpcode() == PreferredFusedOpcode &&
        N0.getOperand(2).getOpcode() == ISD::FMUL &&
        N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         N0.getOperand(0), N0.getOperand(1),
                         DAG.getNode(PreferredFusedOpcode, SL, VT,
                                     N0.getOperand(2).getOperand(0),
                                     N0.getOperand(2).getOperand(1),
                                     N1, Flags), Flags);
    }

    // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
    if (CanFuse &&
        N1->getOpcode() == PreferredFusedOpcode &&
        N1.getOperand(2).getOpcode() == ISD::FMUL &&
        N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         N1.getOperand(0), N1.getOperand(1),
                         DAG.getNode(PreferredFusedOpcode, SL, VT,
                                     N1.getOperand(2).getOperand(0),
                                     N1.getOperand(2).getOperand(1),
                                     N0, Flags), Flags);
    }


    // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
    //   -> (fma x, y, (fma (fpext u), (fpext v), z))
    auto FoldFAddFMAFPExtFMul = [&] (
          SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
          SDNodeFlags Flags) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
                         DAG.getNode(PreferredFusedOpcode, SL, VT,
                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
                                     Z, Flags), Flags);
    };
    if (N0.getOpcode() == PreferredFusedOpcode) {
      SDValue N02 = N0.getOperand(2);
      if (N02.getOpcode() == ISD::FP_EXTEND) {
        SDValue N020 = N02.getOperand(0);
        if (isContractableFMUL(N020) &&
            TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                                N020.getValueType())) {
          return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
                                      N020.getOperand(0), N020.getOperand(1),
                                      N1, Flags);
        }
      }
    }

    // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
    //   -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
    // FIXME: This turns two single-precision and one double-precision
    // operation into two double-precision operations, which might not be
    // interesting for all targets, especially GPUs.
    auto FoldFAddFPExtFMAFMul = [&] (
          SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
          SDNodeFlags Flags) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
                         DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
                         DAG.getNode(PreferredFusedOpcode, SL, VT,
                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
                                     Z, Flags), Flags);
    };
    if (N0.getOpcode() == ISD::FP_EXTEND) {
      SDValue N00 = N0.getOperand(0);
      if (N00.getOpcode() == PreferredFusedOpcode) {
        SDValue N002 = N00.getOperand(2);
        if (isContractableFMUL(N002) &&
            TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                                N00.getValueType())) {
          return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
                                      N002.getOperand(0), N002.getOperand(1),
                                      N1, Flags);
        }
      }
    }

    // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
    //   -> (fma y, z, (fma (fpext u), (fpext v), x))
    if (N1.getOpcode() == PreferredFusedOpcode) {
      SDValue N12 = N1.getOperand(2);
      if (N12.getOpcode() == ISD::FP_EXTEND) {
        SDValue N120 = N12.getOperand(0);
        if (isContractableFMUL(N120) &&
            TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                                N120.getValueType())) {
          return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
                                      N120.getOperand(0), N120.getOperand(1),
                                      N0, Flags);
        }
      }
    }

    // fold (fadd x, (fpext (fma y, z, (fmul u, v)))
    //   -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
    // FIXME: This turns two single-precision and one double-precision
    // operation into two double-precision operations, which might not be
    // interesting for all targets, especially GPUs.
    if (N1.getOpcode() == ISD::FP_EXTEND) {
      SDValue N10 = N1.getOperand(0);
      if (N10.getOpcode() == PreferredFusedOpcode) {
        SDValue N102 = N10.getOperand(2);
        if (isContractableFMUL(N102) &&
            TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                                N10.getValueType())) {
          return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
                                      N102.getOperand(0), N102.getOperand(1),
                                      N0, Flags);
        }
      }
    }
  }

  return SDValue();
}

/// Try to perform FMA combining on a given FSUB node.
SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  const TargetOptions &Options = DAG.getTarget().Options;
  // Floating-point multiply-add with intermediate rounding.
  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));

  // Floating-point multiply-add without intermediate rounding.
  bool HasFMA =
      TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));

  // No valid opcode, do not combine.
  if (!HasFMAD && !HasFMA)
    return SDValue();

  const SDNodeFlags Flags = N->getFlags();
  bool CanFuse = Options.UnsafeFPMath || isContractable(N);
  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
                              CanFuse || HasFMAD);

  // If the subtraction is not contractable, do not combine.
  if (!AllowFusionGlobally && !isContractable(N))
    return SDValue();

  // Defer to the machine combiner when the target prefers to form FMAs there.
  const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
  if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
    return SDValue();

  // Always prefer FMAD to FMA for precision.
  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

  // Is the node an FMUL and contractable either due to global flags or
  // SDNodeFlags.
  auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
    if (N.getOpcode() != ISD::FMUL)
      return false;
    return AllowFusionGlobally || isContractable(N.getNode());
  };

  // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
    return DAG.getNode(PreferredFusedOpcode, SL, VT,
                       N0.getOperand(0), N0.getOperand(1),
                       DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
  }

  // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
  // Note: Commutes FSUB operands.
  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
    return DAG.getNode(PreferredFusedOpcode, SL, VT,
                       DAG.getNode(ISD::FNEG, SL, VT,
                                   N1.getOperand(0)),
                       N1.getOperand(1), N0, Flags);
  }

  // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
  if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
      (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
    SDValue N00 = N0.getOperand(0).getOperand(0);
    SDValue N01 = N0.getOperand(0).getOperand(1);
    return DAG.getNode(PreferredFusedOpcode, SL, VT,
                       DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
                       DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
  }

  // Look through FP_EXTEND nodes to do more combining.

  // fold (fsub (fpext (fmul x, y)), z)
  //   -> (fma (fpext x), (fpext y), (fneg z))
  if (N0.getOpcode() == ISD::FP_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    if (isContractableFMUL(N00) &&
        TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                            N00.getValueType())) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                     N00.getOperand(0)),
                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                     N00.getOperand(1)),
                         DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
    }
  }

  // fold (fsub x, (fpext (fmul y, z)))
  //   -> (fma (fneg (fpext y)), (fpext z), x)
  // Note: Commutes FSUB operands.
  if (N1.getOpcode() == ISD::FP_EXTEND) {
    SDValue N10 = N1.getOperand(0);
    if (isContractableFMUL(N10) &&
        TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
                            N10.getValueType())) {
      return DAG.getNode(PreferredFusedOpcode, SL, VT,
                         DAG.getNode(ISD::FNEG, SL, VT,
                                     DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                 N10.getOperand(0))),
                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                     N10.getOperand(1)),
                         N0, Flags);
    }
  }

  // fold (fsub (fpext (fneg (fmul, x, y))), z)
  //   -> (fneg (fma (fpext x), (fpext y), z))
  // Note: This could be removed with appropriate canonicalization of the
  // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
  // from implementing the canonicalization in visitFSUB.
11852 if (N0.getOpcode() == ISD::FP_EXTEND) { 11853 SDValue N00 = N0.getOperand(0); 11854 if (N00.getOpcode() == ISD::FNEG) { 11855 SDValue N000 = N00.getOperand(0); 11856 if (isContractableFMUL(N000) && 11857 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 11858 N00.getValueType())) { 11859 return DAG.getNode(ISD::FNEG, SL, VT, 11860 DAG.getNode(PreferredFusedOpcode, SL, VT, 11861 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11862 N000.getOperand(0)), 11863 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11864 N000.getOperand(1)), 11865 N1, Flags)); 11866 } 11867 } 11868 } 11869 11870 // fold (fsub (fneg (fpext (fmul, x, y))), z) 11871 // -> (fneg (fma (fpext x)), (fpext y), z) 11872 // Note: This could be removed with appropriate canonicalization of the 11873 // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the 11874 // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent 11875 // from implementing the canonicalization in visitFSUB. 11876 if (N0.getOpcode() == ISD::FNEG) { 11877 SDValue N00 = N0.getOperand(0); 11878 if (N00.getOpcode() == ISD::FP_EXTEND) { 11879 SDValue N000 = N00.getOperand(0); 11880 if (isContractableFMUL(N000) && 11881 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 11882 N000.getValueType())) { 11883 return DAG.getNode(ISD::FNEG, SL, VT, 11884 DAG.getNode(PreferredFusedOpcode, SL, VT, 11885 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11886 N000.getOperand(0)), 11887 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11888 N000.getOperand(1)), 11889 N1, Flags)); 11890 } 11891 } 11892 } 11893 11894 // More folding opportunities when target permits. 
11895 if (Aggressive) { 11896 // fold (fsub (fma x, y, (fmul u, v)), z) 11897 // -> (fma x, y (fma u, v, (fneg z))) 11898 if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && 11899 isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && 11900 N0.getOperand(2)->hasOneUse()) { 11901 return DAG.getNode(PreferredFusedOpcode, SL, VT, 11902 N0.getOperand(0), N0.getOperand(1), 11903 DAG.getNode(PreferredFusedOpcode, SL, VT, 11904 N0.getOperand(2).getOperand(0), 11905 N0.getOperand(2).getOperand(1), 11906 DAG.getNode(ISD::FNEG, SL, VT, 11907 N1), Flags), Flags); 11908 } 11909 11910 // fold (fsub x, (fma y, z, (fmul u, v))) 11911 // -> (fma (fneg y), z, (fma (fneg u), v, x)) 11912 if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && 11913 isContractableFMUL(N1.getOperand(2)) && 11914 N1->hasOneUse()) { 11915 SDValue N20 = N1.getOperand(2).getOperand(0); 11916 SDValue N21 = N1.getOperand(2).getOperand(1); 11917 return DAG.getNode(PreferredFusedOpcode, SL, VT, 11918 DAG.getNode(ISD::FNEG, SL, VT, 11919 N1.getOperand(0)), 11920 N1.getOperand(1), 11921 DAG.getNode(PreferredFusedOpcode, SL, VT, 11922 DAG.getNode(ISD::FNEG, SL, VT, N20), 11923 N21, N0, Flags), Flags); 11924 } 11925 11926 11927 // fold (fsub (fma x, y, (fpext (fmul u, v))), z) 11928 // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) 11929 if (N0.getOpcode() == PreferredFusedOpcode && 11930 N0->hasOneUse()) { 11931 SDValue N02 = N0.getOperand(2); 11932 if (N02.getOpcode() == ISD::FP_EXTEND) { 11933 SDValue N020 = N02.getOperand(0); 11934 if (isContractableFMUL(N020) && 11935 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 11936 N020.getValueType())) { 11937 return DAG.getNode(PreferredFusedOpcode, SL, VT, 11938 N0.getOperand(0), N0.getOperand(1), 11939 DAG.getNode(PreferredFusedOpcode, SL, VT, 11940 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11941 N020.getOperand(0)), 11942 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11943 N020.getOperand(1)), 11944 DAG.getNode(ISD::FNEG, SL, VT, 11945 N1), Flags), Flags); 11946 
} 11947 } 11948 } 11949 11950 // fold (fsub (fpext (fma x, y, (fmul u, v))), z) 11951 // -> (fma (fpext x), (fpext y), 11952 // (fma (fpext u), (fpext v), (fneg z))) 11953 // FIXME: This turns two single-precision and one double-precision 11954 // operation into two double-precision operations, which might not be 11955 // interesting for all targets, especially GPUs. 11956 if (N0.getOpcode() == ISD::FP_EXTEND) { 11957 SDValue N00 = N0.getOperand(0); 11958 if (N00.getOpcode() == PreferredFusedOpcode) { 11959 SDValue N002 = N00.getOperand(2); 11960 if (isContractableFMUL(N002) && 11961 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 11962 N00.getValueType())) { 11963 return DAG.getNode(PreferredFusedOpcode, SL, VT, 11964 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11965 N00.getOperand(0)), 11966 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11967 N00.getOperand(1)), 11968 DAG.getNode(PreferredFusedOpcode, SL, VT, 11969 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11970 N002.getOperand(0)), 11971 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11972 N002.getOperand(1)), 11973 DAG.getNode(ISD::FNEG, SL, VT, 11974 N1), Flags), Flags); 11975 } 11976 } 11977 } 11978 11979 // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) 11980 // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) 11981 if (N1.getOpcode() == PreferredFusedOpcode && 11982 N1.getOperand(2).getOpcode() == ISD::FP_EXTEND && 11983 N1->hasOneUse()) { 11984 SDValue N120 = N1.getOperand(2).getOperand(0); 11985 if (isContractableFMUL(N120) && 11986 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 11987 N120.getValueType())) { 11988 SDValue N1200 = N120.getOperand(0); 11989 SDValue N1201 = N120.getOperand(1); 11990 return DAG.getNode(PreferredFusedOpcode, SL, VT, 11991 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), 11992 N1.getOperand(1), 11993 DAG.getNode(PreferredFusedOpcode, SL, VT, 11994 DAG.getNode(ISD::FNEG, SL, VT, 11995 DAG.getNode(ISD::FP_EXTEND, SL, 11996 VT, N1200)), 11997 DAG.getNode(ISD::FP_EXTEND, SL, VT, 11998 N1201), 11999 
N0, Flags), Flags); 12000 } 12001 } 12002 12003 // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) 12004 // -> (fma (fneg (fpext y)), (fpext z), 12005 // (fma (fneg (fpext u)), (fpext v), x)) 12006 // FIXME: This turns two single-precision and one double-precision 12007 // operation into two double-precision operations, which might not be 12008 // interesting for all targets, especially GPUs. 12009 if (N1.getOpcode() == ISD::FP_EXTEND && 12010 N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { 12011 SDValue CvtSrc = N1.getOperand(0); 12012 SDValue N100 = CvtSrc.getOperand(0); 12013 SDValue N101 = CvtSrc.getOperand(1); 12014 SDValue N102 = CvtSrc.getOperand(2); 12015 if (isContractableFMUL(N102) && 12016 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12017 CvtSrc.getValueType())) { 12018 SDValue N1020 = N102.getOperand(0); 12019 SDValue N1021 = N102.getOperand(1); 12020 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12021 DAG.getNode(ISD::FNEG, SL, VT, 12022 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12023 N100)), 12024 DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), 12025 DAG.getNode(PreferredFusedOpcode, SL, VT, 12026 DAG.getNode(ISD::FNEG, SL, VT, 12027 DAG.getNode(ISD::FP_EXTEND, SL, 12028 VT, N1020)), 12029 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12030 N1021), 12031 N0, Flags), Flags); 12032 } 12033 } 12034 } 12035 12036 return SDValue(); 12037 } 12038 12039 /// Try to perform FMA combining on a given FMUL node based on the distributive 12040 /// law x * (y + 1) = x * y + x and variants thereof (commuted versions, 12041 /// subtraction instead of addition). 
SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  const SDNodeFlags Flags = N->getFlags();

  assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");

  const TargetOptions &Options = DAG.getTarget().Options;

  // The transforms below are incorrect when x == 0 and y == inf, because the
  // intermediate multiplication produces a nan.
  if (!Options.NoInfsFPMath)
    return SDValue();

  // Floating-point multiply-add without intermediate rounding.
  bool HasFMA =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
      TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));

  // Floating-point multiply-add with intermediate rounding. This can result
  // in a less precise result due to the changed rounding order.
  bool HasFMAD = Options.UnsafeFPMath &&
                 (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));

  // No valid opcode, do not combine.
  if (!HasFMAD && !HasFMA)
    return SDValue();

  // Always prefer FMAD to FMA for precision.
  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

  // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
  // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
  auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
    if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
      if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
        if (C->isExactlyValue(+1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             Y, Flags);
        if (C->isExactlyValue(-1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
      }
    }
    return SDValue();
  };

  // Try both operand orders; FMUL is commutative.
  if (SDValue FMA = FuseFADD(N0, N1, Flags))
    return FMA;
  if (SDValue FMA = FuseFADD(N1, N0, Flags))
    return FMA;

  // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
  // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
  // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
  // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
  auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
    if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
      if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
        if (C0->isExactlyValue(+1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT,
                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
                             Y, Flags);
        if (C0->isExactlyValue(-1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT,
                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
      }
      if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
        if (C1->isExactlyValue(+1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
        if (C1->isExactlyValue(-1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             Y, Flags);
      }
    }
    return SDValue();
  };

  // Try both operand orders; FMUL is commutative.
  if (SDValue FMA = FuseFSUB(N0, N1, Flags))
    return FMA;
  if (SDValue FMA = FuseFSUB(N1, N0, Flags))
    return FMA;

  return SDValue();
}

/// Combine an FADD node: constant folding, canonicalization, algebraic
/// simplifications (many gated on fast-math flags), and finally FADD -> FMA
/// fusion. Returns the replacement value or an empty SDValue.
SDValue DAGCombiner::visitFADD(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
  bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  // fold (fadd c1, c2) -> c1 + c2
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);

  // canonicalize constant to RHS
  if (N0CFP && !N1CFP)
    return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);

  // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
  ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
  if (N1C && N1C->isZero())
    if (N1C->isNegative() || Options.NoSignedZerosFPMath ||
        Flags.hasNoSignedZeros())
      return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (fadd A, (fneg B)) -> (fsub A, B)
  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
      TLI.getNegatibleCost(N1, DAG, LegalOperations, ForCodeSize) ==
          TargetLowering::NegatibleCost::Cheaper)
    return DAG.getNode(
        ISD::FSUB, DL, VT, N0,
        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);

  // fold (fadd (fneg A), B) -> (fsub B, A)
  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
      TLI.getNegatibleCost(N0, DAG, LegalOperations, ForCodeSize) ==
          TargetLowering::NegatibleCost::Cheaper)
    return DAG.getNode(
        ISD::FSUB, DL, VT, N1,
        TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), Flags);

  // Matches a one-use (fmul B, -2.0) — splat constants included.
  auto isFMulNegTwo = [](SDValue FMul) {
    if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
      return false;
    auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
    return C && C->isExactlyValue(-2.0);
  };

  // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
  if (isFMulNegTwo(N0)) {
    SDValue B = N0.getOperand(0);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
    return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags);
  }
  // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
  if (isFMulNegTwo(N1)) {
    SDValue B = N1.getOperand(0);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
    return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags);
  }

  // No FP constant should be created after legalization as Instruction
  // Selection pass has a hard time dealing with FP constants.
  bool AllowNewConst = (Level < AfterLegalizeDAG);

  // If nnan is enabled, fold lots of things.
  if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
    // If allowed, fold (fadd (fneg x), x) -> 0.0
    if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
      return DAG.getConstantFP(0.0, DL, VT);

    // If allowed, fold (fadd x, (fneg x)) -> 0.0
    if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
      return DAG.getConstantFP(0.0, DL, VT);
  }

  // If 'unsafe math' or reassoc and nsz, fold lots of things.
  // TODO: break out portions of the transformations below for which Unsafe is
  //       considered and which do not require both nsz and reassoc
  if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
      AllowNewConst) {
    // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
    if (N1CFP && N0.getOpcode() == ISD::FADD &&
        isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
      SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1,
                                 Flags);
      return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags);
    }

    // We can fold chains of FADD's of the same value into multiplications.
    // This transform is not safe in general because we are reducing the number
    // of rounding steps.
    if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
      if (N0.getOpcode() == ISD::FMUL) {
        bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
        bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));

        // (fadd (fmul x, c), x) -> (fmul x, c+1)
        if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                       DAG.getConstantFP(1.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
        }

        // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
        if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
            N1.getOperand(0) == N1.getOperand(1) &&
            N0.getOperand(0) == N1.getOperand(0)) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                       DAG.getConstantFP(2.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP,
                             Flags);
        }
      }

      if (N1.getOpcode() == ISD::FMUL) {
        bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
        bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));

        // (fadd x, (fmul x, c)) -> (fmul x, c+1)
        if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                       DAG.getConstantFP(1.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
        }

        // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
        if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
            N0.getOperand(0) == N0.getOperand(1) &&
            N1.getOperand(0) == N0.getOperand(0)) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                       DAG.getConstantFP(2.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP,
                             Flags);
        }
      }

      if (N0.getOpcode() == ISD::FADD) {
        bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
        // (fadd (fadd x, x), x) -> (fmul x, 3.0)
        if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
            (N0.getOperand(0) == N1)) {
          return DAG.getNode(ISD::FMUL, DL, VT,
                             N1, DAG.getConstantFP(3.0, DL, VT), Flags);
        }
      }

      if (N1.getOpcode() == ISD::FADD) {
        bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
        // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
        if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
            N1.getOperand(0) == N0) {
          return DAG.getNode(ISD::FMUL, DL, VT,
                             N0, DAG.getConstantFP(3.0, DL, VT), Flags);
        }
      }

      // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
      if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
          N0.getOperand(0) == N0.getOperand(1) &&
          N1.getOperand(0) == N1.getOperand(1) &&
          N0.getOperand(0) == N1.getOperand(0)) {
        return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
                           DAG.getConstantFP(4.0, DL, VT), Flags);
      }
    }
  } // enable-unsafe-fp-math

  // FADD -> FMA combines:
  if (SDValue Fused = visitFADDForFMACombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }
  return SDValue();
}

/// Combine an FSUB node: constant folding, identity/sign simplifications
/// (several gated on nsz/nnan fast-math flags), negation-based rewrites, and
/// finally FSUB -> FMA fusion. Returns the replacement value or an empty
/// SDValue.
SDValue DAGCombiner::visitFSUB(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  // fold (fsub c1, c2) -> c1-c2
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // (fsub A, 0) -> A
  if (N1CFP && N1CFP->isZero()) {
    if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
        Flags.hasNoSignedZeros()) {
      return N0;
    }
  }

  if (N0 == N1) {
    // (fsub x, x) -> 0.0
    if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
      return DAG.getConstantFP(0.0f, DL, VT);
  }

  // (fsub -0.0, N1) -> -N1
  // NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the
  // FSUB does not specify the sign bit of a NaN. Also note that for
  // the same reason, the inverse transform is not safe, unless fast math
  // flags are in play.
  if (N0CFP && N0CFP->isZero()) {
    if (N0CFP->isNegative() ||
        (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
      // Prefer a cheap negated form of N1; otherwise emit an explicit FNEG.
      if (TLI.getNegatibleCost(N1, DAG, LegalOperations, ForCodeSize) !=
          TargetLowering::NegatibleCost::Expensive)
        return TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
        return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
    }
  }

  if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
      N1.getOpcode() == ISD::FADD) {
    // X - (X + Y) -> -Y
    if (N0 == N1->getOperand(0))
      return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags);
    // X - (Y + X) -> -Y
    if (N0 == N1->getOperand(1))
      return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0), Flags);
  }

  // fold (fsub A, (fneg B)) -> (fadd A, B)
  if (TLI.getNegatibleCost(N1, DAG, LegalOperations, ForCodeSize) !=
      TargetLowering::NegatibleCost::Expensive)
    return DAG.getNode(
        ISD::FADD, DL, VT, N0,
        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);

  // FSUB -> FMA combines:
  if (SDValue Fused = visitFSUBForFMACombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }

  return SDValue();
}

/// Return true if both inputs are at least as cheap in negated form and at
/// least one input is strictly cheaper in negated form.
bool DAGCombiner::isCheaperToUseNegatedFPOps(SDValue X, SDValue Y) {
  TargetLowering::NegatibleCost LHSNeg =
      TLI.getNegatibleCost(X, DAG, LegalOperations, ForCodeSize);
  if (TargetLowering::NegatibleCost::Expensive == LHSNeg)
    return false;

  TargetLowering::NegatibleCost RHSNeg =
      TLI.getNegatibleCost(Y, DAG, LegalOperations, ForCodeSize);
  if (TargetLowering::NegatibleCost::Expensive == RHSNeg)
    return false;

  // Both negated operands are at least as cheap as their counterparts.
  // Check to see if at least one is cheaper negated.
  return (TargetLowering::NegatibleCost::Cheaper == LHSNeg ||
          TargetLowering::NegatibleCost::Cheaper == RHSNeg);
}

/// Combine an FMUL node: constant folding, canonicalization, algebraic
/// simplifications (several gated on fast-math flags), select-based
/// sign-manipulation folds, and finally FMUL -> FMA distributive fusion.
/// Returns the replacement value or an empty SDValue.
SDValue DAGCombiner::visitFMUL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();

  // fold vector ops
  if (VT.isVector()) {
    // This just handles C1 * C2 for vectors. Other vector folds are below.
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;
  }

  // fold (fmul c1, c2) -> c1*c2
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);

  // canonicalize constant to RHS
  if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if ((Options.NoNaNsFPMath && Options.NoSignedZerosFPMath) ||
      (Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) {
    // fold (fmul A, 0) -> 0
    if (N1CFP && N1CFP->isZero())
      return N1;
  }

  if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
    // fmul (fmul X, C1), C2 -> fmul X, C1 * C2
    if (isConstantFPBuildVectorOrConstantFP(N1) &&
        N0.getOpcode() == ISD::FMUL) {
      SDValue N00 = N0.getOperand(0);
      SDValue N01 = N0.getOperand(1);
      // Avoid an infinite loop by making sure that N00 is not a constant
      // (the inner multiply has not been constant folded yet).
      if (isConstantFPBuildVectorOrConstantFP(N01) &&
          !isConstantFPBuildVectorOrConstantFP(N00)) {
        SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
        return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
      }
    }

    // Match a special-case: we convert X * 2.0 into fadd.
    // fmul (fadd X, X), C -> fmul X, 2.0 * C
    if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
        N0.getOperand(0) == N0.getOperand(1)) {
      const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
      SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
      return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts,
                         Flags);
    }
  }

  // fold (fmul X, 2.0) -> (fadd X, X)
  if (N1CFP && N1CFP->isExactlyValue(+2.0))
    return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);

  // fold (fmul X, -1.0) -> (fneg X)
  if (N1CFP && N1CFP->isExactlyValue(-1.0))
    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
      return DAG.getNode(ISD::FNEG, DL, VT, N0);

  // -N0 * -N1 --> N0 * N1
  if (isCheaperToUseNegatedFPOps(N0, N1)) {
    SDValue NegN0 =
        TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
    SDValue NegN1 =
        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
    return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags);
  }

  // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
  // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
  if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
      (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
      TLI.isOperationLegal(ISD::FABS, VT)) {
    SDValue Select = N0, X = N1;
    if (Select.getOpcode() != ISD::SELECT)
      std::swap(Select, X);

    SDValue Cond = Select.getOperand(0);
    auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
    auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));

    // Match a setcc comparing X against 0.0 with FP-constant select arms.
    if (TrueOpnd && FalseOpnd &&
        Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
        isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
        cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
      ISD::CondCode CC =
cast<CondCodeSDNode>(Cond.getOperand(2))->get();
      switch (CC) {
      default: break;
      // For "less-than" predicates, swap the select arms so the code below
      // can treat the comparison uniformly as "greater-than".
      case ISD::SETOLT:
      case ISD::SETULT:
      case ISD::SETOLE:
      case ISD::SETULE:
      case ISD::SETLT:
      case ISD::SETLE:
        std::swap(TrueOpnd, FalseOpnd);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETUGT:
      case ISD::SETOGE:
      case ISD::SETUGE:
      case ISD::SETGT:
      case ISD::SETGE:
        if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
            TLI.isOperationLegal(ISD::FNEG, VT))
          return DAG.getNode(ISD::FNEG, DL, VT,
                             DAG.getNode(ISD::FABS, DL, VT, X));
        if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
          return DAG.getNode(ISD::FABS, DL, VT, X);

        break;
      }
    }
  }

  // FMUL -> FMA combines:
  if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }

  return SDValue();
}

/// Combine an FMA node: constant folding, canonicalization of constant
/// operands, multiplier simplifications with c == +/-1, and negation-based
/// rewrites. Several folds are gated on unsafe-fp-math / the node's
/// 'contract' flag. Returns the replacement value or an empty SDValue.
SDValue DAGCombiner::visitFMA(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;

  // FMA nodes have flags that propagate to the created nodes.
  const SDNodeFlags Flags = N->getFlags();
  bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N);

  // Constant fold FMA.
  if (isa<ConstantFPSDNode>(N0) &&
      isa<ConstantFPSDNode>(N1) &&
      isa<ConstantFPSDNode>(N2)) {
    return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
  }

  // (-N0 * -N1) + N2 --> (N0 * N1) + N2
  if (isCheaperToUseNegatedFPOps(N0, N1)) {
    SDValue NegN0 =
        TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
    SDValue NegN1 =
        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
    return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags);
  }

  if (UnsafeFPMath) {
    // (fma 0, y, z) -> z and (fma x, 0, z) -> z; only valid ignoring
    // NaN/Inf/signed-zero subtleties, hence the unsafe-math gate.
    if (N0CFP && N0CFP->isZero())
      return N2;
    if (N1CFP && N1CFP->isZero())
      return N2;
  }
  // TODO: The FMA node should have flags that propagate to these nodes.
  if (N0CFP && N0CFP->isExactlyValue(1.0))
    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
  if (N1CFP && N1CFP->isExactlyValue(1.0))
    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);

  // Canonicalize (fma c, x, y) -> (fma x, c, y)
  if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);

  if (UnsafeFPMath) {
    // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
    if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
        isConstantFPBuildVectorOrConstantFP(N1) &&
        isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
      return DAG.getNode(ISD::FMUL, DL, VT, N0,
                         DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
                                     Flags), Flags);
    }

    // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
    if (N0.getOpcode() == ISD::FMUL &&
        isConstantFPBuildVectorOrConstantFP(N1) &&
        isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
      return DAG.getNode(ISD::FMA, DL, VT,
                         N0.getOperand(0),
                         DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
                                     Flags),
                         N2);
    }
  }

  // (fma x, 1, y) -> (fadd x, y)
  // (fma x, -1, y) -> (fadd (fneg x), y)
  if (N1CFP) {
    if (N1CFP->isExactlyValue(1.0))
      // TODO: The FMA node should have flags that propagate to this node.
      return DAG.getNode(ISD::FADD, DL, VT, N0, N2);

    if (N1CFP->isExactlyValue(-1.0) &&
        (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
      SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
      AddToWorklist(RHSNeg.getNode());
      // TODO: The FMA node should have flags that propagate to this node.
      return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
    }

    // fma (fneg x), K, y -> fma x -K, y
    if (N0.getOpcode() == ISD::FNEG &&
        (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
         (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT,
                                              ForCodeSize)))) {
      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
                         DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
    }
  }

  if (UnsafeFPMath) {
    // (fma x, c, x) -> (fmul x, (c+1))
    if (N1CFP && N0 == N2) {
      return DAG.getNode(ISD::FMUL, DL, VT, N0,
                         DAG.getNode(ISD::FADD, DL, VT, N1,
                                     DAG.getConstantFP(1.0, DL, VT), Flags),
                         Flags);
    }

    // (fma x, c, (fneg x)) -> (fmul x, (c-1))
    if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
      return DAG.getNode(ISD::FMUL, DL, VT, N0,
                         DAG.getNode(ISD::FADD, DL, VT, N1,
                                     DAG.getConstantFP(-1.0, DL, VT), Flags),
                         Flags);
    }
  }

  // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))
  // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z))
  if (!TLI.isFNegFree(VT) &&
      TLI.getNegatibleCost(SDValue(N, 0), DAG, LegalOperations, ForCodeSize) ==
          TargetLowering::NegatibleCost::Cheaper)
    return DAG.getNode(ISD::FNEG, DL, VT,
                       TLI.getNegatedExpression(SDValue(N, 0), DAG,
                                                LegalOperations, ForCodeSize),
                       Flags);
  return SDValue();
}

// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
// Notice that this is not always beneficial. One reason is different targets
// may have different costs for FDIV and FMUL, so sometimes the cost of two
// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
  // TODO: Limit this transform based on optsize/minsize - it always creates at
  //       least 1 extra instruction. But the perf win may be substantial enough
  //       that only minsize should restrict this.
  bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
  const SDNodeFlags Flags = N->getFlags();
  if (!UnsafeMath && !Flags.hasAllowReciprocal())
    return SDValue();

  // Skip if current node is a reciprocal/fneg-reciprocal.
  SDValue N0 = N->getOperand(0);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
  if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
    return SDValue();

  // Exit early if the target does not want this transform or if there can't
  // possibly be enough uses of the divisor to make the transform worthwhile.
  SDValue N1 = N->getOperand(1);
  unsigned MinUses = TLI.combineRepeatedFPDivisors();

  // For splat vectors, scale the number of uses by the splat factor. If we can
  // convert the division into a scalar op, that will likely be much faster.
12700 unsigned NumElts = 1; 12701 EVT VT = N->getValueType(0); 12702 if (VT.isVector() && DAG.isSplatValue(N1)) 12703 NumElts = VT.getVectorNumElements(); 12704 12705 if (!MinUses || (N1->use_size() * NumElts) < MinUses) 12706 return SDValue(); 12707 12708 // Find all FDIV users of the same divisor. 12709 // Use a set because duplicates may be present in the user list. 12710 SetVector<SDNode *> Users; 12711 for (auto *U : N1->uses()) { 12712 if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { 12713 // This division is eligible for optimization only if global unsafe math 12714 // is enabled or if this division allows reciprocal formation. 12715 if (UnsafeMath || U->getFlags().hasAllowReciprocal()) 12716 Users.insert(U); 12717 } 12718 } 12719 12720 // Now that we have the actual number of divisor uses, make sure it meets 12721 // the minimum threshold specified by the target. 12722 if ((Users.size() * NumElts) < MinUses) 12723 return SDValue(); 12724 12725 SDLoc DL(N); 12726 SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); 12727 SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags); 12728 12729 // Dividend / Divisor -> Dividend * Reciprocal 12730 for (auto *U : Users) { 12731 SDValue Dividend = U->getOperand(0); 12732 if (Dividend != FPOne) { 12733 SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, 12734 Reciprocal, Flags); 12735 CombineTo(U, NewNode); 12736 } else if (U != Reciprocal.getNode()) { 12737 // In the absence of fast-math-flags, this user node is always the 12738 // same node as Reciprocal, but with FMF they may be different nodes. 12739 CombineTo(U, Reciprocal); 12740 } 12741 } 12742 return SDValue(N, 0); // N was replaced. 
12743 } 12744 12745 SDValue DAGCombiner::visitFDIV(SDNode *N) { 12746 SDValue N0 = N->getOperand(0); 12747 SDValue N1 = N->getOperand(1); 12748 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 12749 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); 12750 EVT VT = N->getValueType(0); 12751 SDLoc DL(N); 12752 const TargetOptions &Options = DAG.getTarget().Options; 12753 SDNodeFlags Flags = N->getFlags(); 12754 12755 // fold vector ops 12756 if (VT.isVector()) 12757 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 12758 return FoldedVOp; 12759 12760 // fold (fdiv c1, c2) -> c1/c2 12761 if (N0CFP && N1CFP) 12762 return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); 12763 12764 if (SDValue NewSel = foldBinOpIntoSelect(N)) 12765 return NewSel; 12766 12767 if (SDValue V = combineRepeatedFPDivisors(N)) 12768 return V; 12769 12770 if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) { 12771 // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. 12772 if (N1CFP) { 12773 // Compute the reciprocal 1.0 / c2. 12774 const APFloat &N1APF = N1CFP->getValueAPF(); 12775 APFloat Recip(N1APF.getSemantics(), 1); // 1.0 12776 APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); 12777 // Only do the transform if the reciprocal is a legal fp immediate that 12778 // isn't too nasty (eg NaN, denormal, ...). 12779 if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty 12780 (!LegalOperations || 12781 // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM 12782 // backend)... we should handle this gracefully after Legalize. 
12783 // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) || 12784 TLI.isOperationLegal(ISD::ConstantFP, VT) || 12785 TLI.isFPImmLegal(Recip, VT, ForCodeSize))) 12786 return DAG.getNode(ISD::FMUL, DL, VT, N0, 12787 DAG.getConstantFP(Recip, DL, VT), Flags); 12788 } 12789 12790 // If this FDIV is part of a reciprocal square root, it may be folded 12791 // into a target-specific square root estimate instruction. 12792 if (N1.getOpcode() == ISD::FSQRT) { 12793 if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) 12794 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); 12795 } else if (N1.getOpcode() == ISD::FP_EXTEND && 12796 N1.getOperand(0).getOpcode() == ISD::FSQRT) { 12797 if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), 12798 Flags)) { 12799 RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); 12800 AddToWorklist(RV.getNode()); 12801 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); 12802 } 12803 } else if (N1.getOpcode() == ISD::FP_ROUND && 12804 N1.getOperand(0).getOpcode() == ISD::FSQRT) { 12805 if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), 12806 Flags)) { 12807 RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); 12808 AddToWorklist(RV.getNode()); 12809 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); 12810 } 12811 } else if (N1.getOpcode() == ISD::FMUL) { 12812 // Look through an FMUL. Even though this won't remove the FDIV directly, 12813 // it's still worthwhile to get rid of the FSQRT if possible. 
12814 SDValue SqrtOp; 12815 SDValue OtherOp; 12816 if (N1.getOperand(0).getOpcode() == ISD::FSQRT) { 12817 SqrtOp = N1.getOperand(0); 12818 OtherOp = N1.getOperand(1); 12819 } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) { 12820 SqrtOp = N1.getOperand(1); 12821 OtherOp = N1.getOperand(0); 12822 } 12823 if (SqrtOp.getNode()) { 12824 // We found a FSQRT, so try to make this fold: 12825 // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) 12826 if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) { 12827 RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags); 12828 AddToWorklist(RV.getNode()); 12829 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); 12830 } 12831 } 12832 } 12833 12834 // Fold into a reciprocal estimate and multiply instead of a real divide. 12835 if (SDValue RV = BuildDivEstimate(N0, N1, Flags)) 12836 return RV; 12837 } 12838 12839 // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) 12840 if (isCheaperToUseNegatedFPOps(N0, N1)) 12841 return DAG.getNode( 12842 ISD::FDIV, SDLoc(N), VT, 12843 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), 12844 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); 12845 12846 return SDValue(); 12847 } 12848 12849 SDValue DAGCombiner::visitFREM(SDNode *N) { 12850 SDValue N0 = N->getOperand(0); 12851 SDValue N1 = N->getOperand(1); 12852 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 12853 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); 12854 EVT VT = N->getValueType(0); 12855 12856 // fold (frem c1, c2) -> fmod(c1,c2) 12857 if (N0CFP && N1CFP) 12858 return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags()); 12859 12860 if (SDValue NewSel = foldBinOpIntoSelect(N)) 12861 return NewSel; 12862 12863 return SDValue(); 12864 } 12865 12866 SDValue DAGCombiner::visitFSQRT(SDNode *N) { 12867 SDNodeFlags Flags = N->getFlags(); 12868 if (!DAG.getTarget().Options.UnsafeFPMath && 12869 !Flags.hasApproximateFuncs()) 12870 return SDValue(); 12871 12872 
SDValue N0 = N->getOperand(0); 12873 if (TLI.isFsqrtCheap(N0, DAG)) 12874 return SDValue(); 12875 12876 // FSQRT nodes have flags that propagate to the created nodes. 12877 return buildSqrtEstimate(N0, Flags); 12878 } 12879 12880 /// copysign(x, fp_extend(y)) -> copysign(x, y) 12881 /// copysign(x, fp_round(y)) -> copysign(x, y) 12882 static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { 12883 SDValue N1 = N->getOperand(1); 12884 if ((N1.getOpcode() == ISD::FP_EXTEND || 12885 N1.getOpcode() == ISD::FP_ROUND)) { 12886 // Do not optimize out type conversion of f128 type yet. 12887 // For some targets like x86_64, configuration is changed to keep one f128 12888 // value in one SSE register, but instruction selection cannot handle 12889 // FCOPYSIGN on SSE registers yet. 12890 EVT N1VT = N1->getValueType(0); 12891 EVT N1Op0VT = N1->getOperand(0).getValueType(); 12892 return (N1VT == N1Op0VT || N1Op0VT != MVT::f128); 12893 } 12894 return false; 12895 } 12896 12897 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { 12898 SDValue N0 = N->getOperand(0); 12899 SDValue N1 = N->getOperand(1); 12900 bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0); 12901 bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); 12902 EVT VT = N->getValueType(0); 12903 12904 if (N0CFP && N1CFP) // Constant fold 12905 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1); 12906 12907 if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) { 12908 const APFloat &V = N1C->getValueAPF(); 12909 // copysign(x, c1) -> fabs(x) iff ispos(c1) 12910 // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1) 12911 if (!V.isNegative()) { 12912 if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)) 12913 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); 12914 } else { 12915 if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) 12916 return DAG.getNode(ISD::FNEG, SDLoc(N), VT, 12917 DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0)); 12918 } 12919 } 12920 12921 // copysign(fabs(x), y) -> 
copysign(x, y) 12922 // copysign(fneg(x), y) -> copysign(x, y) 12923 // copysign(copysign(x,z), y) -> copysign(x, y) 12924 if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG || 12925 N0.getOpcode() == ISD::FCOPYSIGN) 12926 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1); 12927 12928 // copysign(x, abs(y)) -> abs(x) 12929 if (N1.getOpcode() == ISD::FABS) 12930 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); 12931 12932 // copysign(x, copysign(y,z)) -> copysign(x, z) 12933 if (N1.getOpcode() == ISD::FCOPYSIGN) 12934 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1)); 12935 12936 // copysign(x, fp_extend(y)) -> copysign(x, y) 12937 // copysign(x, fp_round(y)) -> copysign(x, y) 12938 if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) 12939 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); 12940 12941 return SDValue(); 12942 } 12943 12944 SDValue DAGCombiner::visitFPOW(SDNode *N) { 12945 ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1)); 12946 if (!ExponentC) 12947 return SDValue(); 12948 12949 // Try to convert x ** (1/3) into cube root. 12950 // TODO: Handle the various flavors of long double. 12951 // TODO: Since we're approximating, we don't need an exact 1/3 exponent. 12952 // Some range near 1/3 should be fine. 12953 EVT VT = N->getValueType(0); 12954 if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) || 12955 (VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) { 12956 // pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0. 12957 // pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf. 12958 // pow(-val, 1/3) = nan; cbrt(-val) = -num. 12959 // For regular numbers, rounding may cause the results to differ. 12960 // Therefore, we require { nsz ninf nnan afn } for this transform. 12961 // TODO: We could select out the special cases if we don't have nsz/ninf. 
12962 SDNodeFlags Flags = N->getFlags(); 12963 if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() || 12964 !Flags.hasApproximateFuncs()) 12965 return SDValue(); 12966 12967 // Do not create a cbrt() libcall if the target does not have it, and do not 12968 // turn a pow that has lowering support into a cbrt() libcall. 12969 if (!DAG.getLibInfo().has(LibFunc_cbrt) || 12970 (!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) && 12971 DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT))) 12972 return SDValue(); 12973 12974 return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0), Flags); 12975 } 12976 12977 // Try to convert x ** (1/4) and x ** (3/4) into square roots. 12978 // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case. 12979 // TODO: This could be extended (using a target hook) to handle smaller 12980 // power-of-2 fractional exponents. 12981 bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25); 12982 bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75); 12983 if (ExponentIs025 || ExponentIs075) { 12984 // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0. 12985 // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN. 12986 // pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0. 12987 // pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) = NaN. 12988 // For regular numbers, rounding may cause the results to differ. 12989 // Therefore, we require { nsz ninf afn } for this transform. 12990 // TODO: We could select out the special cases if we don't have nsz/ninf. 12991 SDNodeFlags Flags = N->getFlags(); 12992 12993 // We only need no signed zeros for the 0.25 case. 12994 if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() || 12995 !Flags.hasApproximateFuncs()) 12996 return SDValue(); 12997 12998 // Don't double the number of libcalls. We are trying to inline fast code. 
12999 if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT)) 13000 return SDValue(); 13001 13002 // Assume that libcalls are the smallest code. 13003 // TODO: This restriction should probably be lifted for vectors. 13004 if (ForCodeSize) 13005 return SDValue(); 13006 13007 // pow(X, 0.25) --> sqrt(sqrt(X)) 13008 SDLoc DL(N); 13009 SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0), Flags); 13010 SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags); 13011 if (ExponentIs025) 13012 return SqrtSqrt; 13013 // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X)) 13014 return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt, Flags); 13015 } 13016 13017 return SDValue(); 13018 } 13019 13020 static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, 13021 const TargetLowering &TLI) { 13022 // This optimization is guarded by a function attribute because it may produce 13023 // unexpected results. Ie, programs may be relying on the platform-specific 13024 // undefined behavior when the float-to-int conversion overflows. 13025 const Function &F = DAG.getMachineFunction().getFunction(); 13026 Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow"); 13027 if (StrictOverflow.getValueAsString().equals("false")) 13028 return SDValue(); 13029 13030 // We only do this if the target has legal ftrunc. Otherwise, we'd likely be 13031 // replacing casts with a libcall. We also must be allowed to ignore -0.0 13032 // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer 13033 // conversions would return +0.0. 13034 // FIXME: We should be able to use node-level FMF here. 13035 // TODO: If strict math, should we use FABS (+ range check for signed cast)? 
13036 EVT VT = N->getValueType(0); 13037 if (!TLI.isOperationLegal(ISD::FTRUNC, VT) || 13038 !DAG.getTarget().Options.NoSignedZerosFPMath) 13039 return SDValue(); 13040 13041 // fptosi/fptoui round towards zero, so converting from FP to integer and 13042 // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X 13043 SDValue N0 = N->getOperand(0); 13044 if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT && 13045 N0.getOperand(0).getValueType() == VT) 13046 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); 13047 13048 if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && 13049 N0.getOperand(0).getValueType() == VT) 13050 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); 13051 13052 return SDValue(); 13053 } 13054 13055 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { 13056 SDValue N0 = N->getOperand(0); 13057 EVT VT = N->getValueType(0); 13058 EVT OpVT = N0.getValueType(); 13059 13060 // [us]itofp(undef) = 0, because the result value is bounded. 13061 if (N0.isUndef()) 13062 return DAG.getConstantFP(0.0, SDLoc(N), VT); 13063 13064 // fold (sint_to_fp c1) -> c1fp 13065 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 13066 // ...but only if the target supports immediate floating-point values 13067 (!LegalOperations || 13068 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) 13069 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); 13070 13071 // If the input is a legal type, and SINT_TO_FP is not legal on this target, 13072 // but UINT_TO_FP is legal on this target, try to convert. 13073 if (!hasOperation(ISD::SINT_TO_FP, OpVT) && 13074 hasOperation(ISD::UINT_TO_FP, OpVT)) { 13075 // If the sign bit is known to be zero, we can change this to UINT_TO_FP. 13076 if (DAG.SignBitIsZero(N0)) 13077 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); 13078 } 13079 13080 // The next optimizations are desirable only if SELECT_CC can be lowered. 
13081 if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { 13082 // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) 13083 if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && 13084 !VT.isVector() && 13085 (!LegalOperations || 13086 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { 13087 SDLoc DL(N); 13088 SDValue Ops[] = 13089 { N0.getOperand(0), N0.getOperand(1), 13090 DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), 13091 N0.getOperand(2) }; 13092 return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); 13093 } 13094 13095 // fold (sint_to_fp (zext (setcc x, y, cc))) -> 13096 // (select_cc x, y, 1.0, 0.0,, cc) 13097 if (N0.getOpcode() == ISD::ZERO_EXTEND && 13098 N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() && 13099 (!LegalOperations || 13100 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { 13101 SDLoc DL(N); 13102 SDValue Ops[] = 13103 { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), 13104 DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), 13105 N0.getOperand(0).getOperand(2) }; 13106 return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); 13107 } 13108 } 13109 13110 if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) 13111 return FTrunc; 13112 13113 return SDValue(); 13114 } 13115 13116 SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { 13117 SDValue N0 = N->getOperand(0); 13118 EVT VT = N->getValueType(0); 13119 EVT OpVT = N0.getValueType(); 13120 13121 // [us]itofp(undef) = 0, because the result value is bounded. 
13122 if (N0.isUndef()) 13123 return DAG.getConstantFP(0.0, SDLoc(N), VT); 13124 13125 // fold (uint_to_fp c1) -> c1fp 13126 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 13127 // ...but only if the target supports immediate floating-point values 13128 (!LegalOperations || 13129 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) 13130 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); 13131 13132 // If the input is a legal type, and UINT_TO_FP is not legal on this target, 13133 // but SINT_TO_FP is legal on this target, try to convert. 13134 if (!hasOperation(ISD::UINT_TO_FP, OpVT) && 13135 hasOperation(ISD::SINT_TO_FP, OpVT)) { 13136 // If the sign bit is known to be zero, we can change this to SINT_TO_FP. 13137 if (DAG.SignBitIsZero(N0)) 13138 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); 13139 } 13140 13141 // The next optimizations are desirable only if SELECT_CC can be lowered. 13142 if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { 13143 // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) 13144 if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && 13145 (!LegalOperations || 13146 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { 13147 SDLoc DL(N); 13148 SDValue Ops[] = 13149 { N0.getOperand(0), N0.getOperand(1), 13150 DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), 13151 N0.getOperand(2) }; 13152 return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); 13153 } 13154 } 13155 13156 if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) 13157 return FTrunc; 13158 13159 return SDValue(); 13160 } 13161 13162 // Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x 13163 static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { 13164 SDValue N0 = N->getOperand(0); 13165 EVT VT = N->getValueType(0); 13166 13167 if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP) 13168 return SDValue(); 13169 13170 SDValue Src = N0.getOperand(0); 13171 EVT 
SrcVT = Src.getValueType(); 13172 bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP; 13173 bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT; 13174 13175 // We can safely assume the conversion won't overflow the output range, 13176 // because (for example) (uint8_t)18293.f is undefined behavior. 13177 13178 // Since we can assume the conversion won't overflow, our decision as to 13179 // whether the input will fit in the float should depend on the minimum 13180 // of the input range and output range. 13181 13182 // This means this is also safe for a signed input and unsigned output, since 13183 // a negative input would lead to undefined behavior. 13184 unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; 13185 unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; 13186 unsigned ActualSize = std::min(InputSize, OutputSize); 13187 const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); 13188 13189 // We can only fold away the float conversion if the input range can be 13190 // represented exactly in the float range. 13191 if (APFloat::semanticsPrecision(sem) >= ActualSize) { 13192 if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) { 13193 unsigned ExtOp = IsInputSigned && IsOutputSigned ? 
ISD::SIGN_EXTEND 13194 : ISD::ZERO_EXTEND; 13195 return DAG.getNode(ExtOp, SDLoc(N), VT, Src); 13196 } 13197 if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits()) 13198 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src); 13199 return DAG.getBitcast(VT, Src); 13200 } 13201 return SDValue(); 13202 } 13203 13204 SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { 13205 SDValue N0 = N->getOperand(0); 13206 EVT VT = N->getValueType(0); 13207 13208 // fold (fp_to_sint undef) -> undef 13209 if (N0.isUndef()) 13210 return DAG.getUNDEF(VT); 13211 13212 // fold (fp_to_sint c1fp) -> c1 13213 if (isConstantFPBuildVectorOrConstantFP(N0)) 13214 return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); 13215 13216 return FoldIntToFPToInt(N, DAG); 13217 } 13218 13219 SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { 13220 SDValue N0 = N->getOperand(0); 13221 EVT VT = N->getValueType(0); 13222 13223 // fold (fp_to_uint undef) -> undef 13224 if (N0.isUndef()) 13225 return DAG.getUNDEF(VT); 13226 13227 // fold (fp_to_uint c1fp) -> c1 13228 if (isConstantFPBuildVectorOrConstantFP(N0)) 13229 return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); 13230 13231 return FoldIntToFPToInt(N, DAG); 13232 } 13233 13234 SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { 13235 SDValue N0 = N->getOperand(0); 13236 SDValue N1 = N->getOperand(1); 13237 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 13238 EVT VT = N->getValueType(0); 13239 13240 // fold (fp_round c1fp) -> c1fp 13241 if (N0CFP) 13242 return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1); 13243 13244 // fold (fp_round (fp_extend x)) -> x 13245 if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType()) 13246 return N0.getOperand(0); 13247 13248 // fold (fp_round (fp_round x)) -> (fp_round x) 13249 if (N0.getOpcode() == ISD::FP_ROUND) { 13250 const bool NIsTrunc = N->getConstantOperandVal(1) == 1; 13251 const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1; 13252 13253 // Skip this folding if it results 
in an fp_round from f80 to f16. 13254 // 13255 // f80 to f16 always generates an expensive (and as yet, unimplemented) 13256 // libcall to __truncxfhf2 instead of selecting native f16 conversion 13257 // instructions from f32 or f64. Moreover, the first (value-preserving) 13258 // fp_round from f80 to either f32 or f64 may become a NOP in platforms like 13259 // x86. 13260 if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16) 13261 return SDValue(); 13262 13263 // If the first fp_round isn't a value preserving truncation, it might 13264 // introduce a tie in the second fp_round, that wouldn't occur in the 13265 // single-step fp_round we want to fold to. 13266 // In other words, double rounding isn't the same as rounding. 13267 // Also, this is a value preserving truncation iff both fp_round's are. 13268 if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) { 13269 SDLoc DL(N); 13270 return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0), 13271 DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL)); 13272 } 13273 } 13274 13275 // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) 13276 if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { 13277 SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, 13278 N0.getOperand(0), N1); 13279 AddToWorklist(Tmp.getNode()); 13280 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, 13281 Tmp, N0.getOperand(1)); 13282 } 13283 13284 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) 13285 return NewVSel; 13286 13287 return SDValue(); 13288 } 13289 13290 SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { 13291 SDValue N0 = N->getOperand(0); 13292 EVT VT = N->getValueType(0); 13293 13294 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. 
  // If the sole user is an fp_round, leave the extend in place; the
  // round(extend) pair is handled by the FP_ROUND combine instead.
  // NOTE(review): rationale inferred from the early bail-out — confirm
  // against the ISD::FP_ROUND combine.
  if (N->hasOneUse() &&
      N->use_begin()->getOpcode() == ISD::FP_ROUND)
    return SDValue();

  // fold (fp_extend c1fp) -> c1fp
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);

  // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
  if (N0.getOpcode() == ISD::FP16_TO_FP &&
      TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
    return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));

  // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
  // value of X (operand 1 == 1 means the round introduced no excess
  // precision loss).
  if (N0.getOpcode() == ISD::FP_ROUND
      && N0.getConstantOperandVal(1) == 1) {
    SDValue In = N0.getOperand(0);
    if (In.getValueType() == VT) return In;
    if (VT.bitsLT(In.getValueType()))
      return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
                         In, N0.getOperand(1));
    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
  }

  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(),
                                     LN0->getBasePtr(), N0.getValueType(),
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    // Other users of the original load still expect the narrow type, so
    // hand them a round of the extended load, and forward the new chain.
    CombineTo(N0.getNode(),
              DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
                          N0.getValueType(), ExtLoad,
                          DAG.getIntPtrConstant(1, SDLoc(N0))),
              ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}

/// Combine FCEIL: constant-fold only.
SDValue DAGCombiner::visitFCEIL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fceil c1) -> fceil(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);

  return SDValue();
}

/// Combine FTRUNC: constant-fold, and drop the trunc when the operand is
/// already an integral value.
SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (ftrunc c1) -> ftrunc(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);

  // fold ftrunc (known rounded int x) -> x
  // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
  // likely to be generated to extract integer from a rounded floating value.
  switch (N0.getOpcode()) {
  default: break;
  case ISD::FRINT:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FFLOOR:
  case ISD::FCEIL:
    // These all produce an integral value, so a further trunc is a no-op.
    return N0;
  }

  return SDValue();
}

/// Combine FFLOOR: constant-fold only.
SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (ffloor c1) -> ffloor(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);

  return SDValue();
}

// FIXME: FNEG and FABS have a lot in common; refactor.
SDValue DAGCombiner::visitFNEG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Constant fold FNEG.
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);

  // If the target reports that negating N0 is free or cheap, use the
  // target-provided negated expression instead of an explicit FNEG.
  if (TLI.getNegatibleCost(N0, DAG, LegalOperations, ForCodeSize) !=
      TargetLowering::NegatibleCost::Expensive)
    return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);

  // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
  // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
  // know it was called from a context with a nsz flag if the input fsub does
  // not.
  if (N0.getOpcode() == ISD::FSUB &&
      (DAG.getTarget().Options.NoSignedZerosFPMath ||
       N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
                       N0.getOperand(0), N->getFlags());
  }

  // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
  // constant pool values.
  if (!TLI.isFNegFree(VT) &&
      N0.getOpcode() == ISD::BITCAST &&
      N0.getNode()->hasOneUse()) {
    SDValue Int = N0.getOperand(0);
    EVT IntVT = Int.getValueType();
    if (IntVT.isInteger() && !IntVT.isVector()) {
      APInt SignMask;
      if (N0.getValueType().isVector()) {
        // For a vector, get a mask such as 0x80... per scalar element
        // and splat it.
        SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
      } else {
        // For a scalar, just generate 0x80...
        SignMask = APInt::getSignMask(IntVT.getSizeInBits());
      }
      SDLoc DL0(N0);
      Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
                        DAG.getConstant(SignMask, DL0, IntVT));
      AddToWorklist(Int.getNode());
      return DAG.getBitcast(VT, Int);
    }
  }

  // (fneg (fmul c, x)) -> (fmul -c, x)
  if (N0.getOpcode() == ISD::FMUL &&
      (N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) {
    ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
    if (CFP1) {
      APFloat CVal = CFP1->getValueAPF();
      CVal.changeSign();
      // Only once the DAG is legal (LegalDAG), and only if the negated
      // constant is itself representable cheaply on the target.
      if (LegalDAG && (TLI.isFPImmLegal(CVal, VT, ForCodeSize) ||
                       TLI.isOperationLegal(ISD::ConstantFP, VT)))
        return DAG.getNode(
            ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
            DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)),
            N0->getFlags());
    }
  }

  return SDValue();
}

/// Shared combine for the FMINNUM/FMAXNUM/FMINIMUM/FMAXIMUM visitors.
/// \p Op is the APFloat folding function matching the node's semantics.
static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N,
                            APFloat (*Op)(const APFloat &, const APFloat &)) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
  const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);

  // Constant-fold when both operands are (splats of) FP constants.
  if (N0CFP && N1CFP) {
    const APFloat &C0 = N0CFP->getValueAPF();
    const APFloat &C1 = N1CFP->getValueAPF();
    return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT);
  }

  // Canonicalize to constant on RHS.
  if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);

  return SDValue();
}

SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
  return visitFMinMax(DAG, N, minnum);
}

SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
  return visitFMinMax(DAG, N, maxnum);
}

SDValue DAGCombiner::visitFMINIMUM(SDNode *N) {
  return visitFMinMax(DAG, N, minimum);
}

SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) {
  return visitFMinMax(DAG, N, maximum);
}

SDValue DAGCombiner::visitFABS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fabs c1) -> fabs(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);

  // fold (fabs (fabs x)) -> (fabs x)
  if (N0.getOpcode() == ISD::FABS)
    return N->getOperand(0);

  // fold (fabs (fneg x)) -> (fabs x)
  // fold (fabs (fcopysign x, y)) -> (fabs x)
  if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));

  // fabs(bitcast(x)) -> bitcast(x & ~sign) to avoid constant pool loads.
  if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
    SDValue Int = N0.getOperand(0);
    EVT IntVT = Int.getValueType();
    if (IntVT.isInteger() && !IntVT.isVector()) {
      APInt SignMask;
      if (N0.getValueType().isVector()) {
        // For a vector, get a mask such as 0x7f... per scalar element
        // and splat it.
        SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits());
        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
      } else {
        // For a scalar, just generate 0x7f...
        SignMask = ~APInt::getSignMask(IntVT.getSizeInBits());
      }
      SDLoc DL(N0);
      Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
                        DAG.getConstant(SignMask, DL, IntVT));
      AddToWorklist(Int.getNode());
      return DAG.getBitcast(N->getValueType(0), Int);
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitBRCOND(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);   // condition
  SDValue N2 = N->getOperand(2);   // destination basic block

  // If N is a constant we could fold this into a fallthrough or unconditional
  // branch. However that doesn't happen very often in normal code, because
  // Instcombine/SimplifyCFG should have handled the available opportunities.
  // If we did this folding here, it would be necessary to update the
  // MachineBasicBlock CFG, which is awkward.

  // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
  // on the target.
  if (N1.getOpcode() == ISD::SETCC &&
      TLI.isOperationLegalOrCustom(ISD::BR_CC,
                                   N1.getOperand(0).getValueType())) {
    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
                       Chain, N1.getOperand(2),
                       N1.getOperand(0), N1.getOperand(1), N2);
  }

  if (N1.hasOneUse()) {
    // rebuildSetCC calls visitXor which may change the Chain when there is a
    // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
    HandleSDNode ChainHandle(Chain);
    if (SDValue NewN1 = rebuildSetCC(N1))
      return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
                         ChainHandle.getValue(), NewN1, N2);
  }

  return SDValue();
}

/// Try to rewrite a brcond condition \p N as an explicit SETCC so the
/// branch lowering can use a simpler comparison.
SDValue DAGCombiner::rebuildSetCC(SDValue N) {
  if (N.getOpcode() == ISD::SRL ||
      (N.getOpcode() == ISD::TRUNCATE &&
       (N.getOperand(0).hasOneUse() &&
        N.getOperand(0).getOpcode() == ISD::SRL))) {
    // Look past the truncate.
    if (N.getOpcode() == ISD::TRUNCATE)
      N = N.getOperand(0);

    // Match this pattern so that we can generate simpler code:
    //
    //   %a = ...
    //   %b = and i32 %a, 2
    //   %c = srl i32 %b, 1
    //   brcond i32 %c ...
    //
    // into
    //
    //   %a = ...
    //   %b = and i32 %a, 2
    //   %c = setcc eq %b, 0
    //   brcond %c ...
    //
    // This applies only when the AND constant value has one bit set and the
    // SRL constant is equal to the log2 of the AND constant. The back-end is
    // smart enough to convert the result into a TEST/JMP sequence.
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);

    if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
      SDValue AndOp1 = Op0.getOperand(1);

      if (AndOp1.getOpcode() == ISD::Constant) {
        const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();

        if (AndConst.isPowerOf2() &&
            cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
          SDLoc DL(N);
          return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
                              Op0, DAG.getConstant(0, DL, Op0.getValueType()),
                              ISD::SETNE);
        }
      }
    }
  }

  // Transform br(xor(x, y)) -> br(x != y)
  // Transform br(xor(xor(x,y), 1)) -> br (x == y)
  if (N.getOpcode() == ISD::XOR) {
    // Because we may call this on a speculatively constructed
    // SimplifiedSetCC Node, we need to simplify this node first.
    // Ideally this should be folded into SimplifySetCC and not
    // here. For now, grab a handle to N so we don't lose it from
    // replacements internal to the visit.
    HandleSDNode XORHandle(N);
    while (N.getOpcode() == ISD::XOR) {
      SDValue Tmp = visitXOR(N.getNode());
      // No simplification done.
      if (!Tmp.getNode())
        break;
      // Returning N is a form of in-visit replacement that may invalidate
      // N.
Grab value from Handle. 13634 if (Tmp.getNode() == N.getNode()) 13635 N = XORHandle.getValue(); 13636 else // Node simplified. Try simplifying again. 13637 N = Tmp; 13638 } 13639 13640 if (N.getOpcode() != ISD::XOR) 13641 return N; 13642 13643 SDNode *TheXor = N.getNode(); 13644 13645 SDValue Op0 = TheXor->getOperand(0); 13646 SDValue Op1 = TheXor->getOperand(1); 13647 13648 if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) { 13649 bool Equal = false; 13650 if (isOneConstant(Op0) && Op0.hasOneUse() && 13651 Op0.getOpcode() == ISD::XOR) { 13652 TheXor = Op0.getNode(); 13653 Equal = true; 13654 } 13655 13656 EVT SetCCVT = N.getValueType(); 13657 if (LegalTypes) 13658 SetCCVT = getSetCCResultType(SetCCVT); 13659 // Replace the uses of XOR with SETCC 13660 return DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1, 13661 Equal ? ISD::SETEQ : ISD::SETNE); 13662 } 13663 } 13664 13665 return SDValue(); 13666 } 13667 13668 // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB. 13669 // 13670 SDValue DAGCombiner::visitBR_CC(SDNode *N) { 13671 CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1)); 13672 SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3); 13673 13674 // If N is a constant we could fold this into a fallthrough or unconditional 13675 // branch. However that doesn't happen very often in normal code, because 13676 // Instcombine/SimplifyCFG should have handled the available opportunities. 13677 // If we did this folding here, it would be necessary to update the 13678 // MachineBasicBlock CFG, which is awkward. 13679 13680 // Use SimplifySetCC to simplify SETCC's. 
  SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
                               CondLHS, CondRHS, CC->get(), SDLoc(N),
                               false);
  if (Simp.getNode()) AddToWorklist(Simp.getNode());

  // fold to a simpler setcc
  if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
                       N->getOperand(0), Simp.getOperand(2),
                       Simp.getOperand(0), Simp.getOperand(1),
                       N->getOperand(4));

  return SDValue();
}

/// Return true if 'Use' is a load or a store that uses N as its base pointer
/// and that N may be folded in the load / store addressing mode.
static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
                                    SelectionDAG &DAG,
                                    const TargetLowering &TLI) {
  EVT VT;
  unsigned AS;

  // Only (masked) loads/stores that are unindexed and use N directly as the
  // base pointer are candidates.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
    if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
      return false;
    VT = LD->getMemoryVT();
    AS = LD->getAddressSpace();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
    if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
      return false;
    VT = ST->getMemoryVT();
    AS = ST->getAddressSpace();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
    if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
      return false;
    VT = LD->getMemoryVT();
    AS = LD->getAddressSpace();
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
    if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
      return false;
    VT = ST->getMemoryVT();
    AS = ST->getAddressSpace();
  } else
    return false;

  // Translate N (an ADD/SUB producing the address) into an addressing-mode
  // query for the target.
  TargetLowering::AddrMode AM;
  if (N->getOpcode() == ISD::ADD) {
    AM.HasBaseReg = true;
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (Offset)
      // [reg +/- imm]
      AM.BaseOffs = Offset->getSExtValue();
    else
      // [reg +/- reg]
      AM.Scale = 1;
  } else if (N->getOpcode() == ISD::SUB) {
    AM.HasBaseReg = true;
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (Offset)
      // [reg +/- imm]
      AM.BaseOffs = -Offset->getSExtValue();
    else
      // [reg +/- reg]
      AM.Scale = 1;
  } else
    return false;

  return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
                                   VT.getTypeForEVT(*DAG.getContext()), AS);
}

/// Decompose a (masked) load/store \p N for indexed-form combining: checks
/// the target supports the \p Inc or \p Dec indexed mode for the memory VT,
/// and reports the base pointer plus whether N is a load and/or masked.
static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
                                     bool &IsLoad, bool &IsMasked, SDValue &Ptr,
                                     const TargetLowering &TLI) {
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    if (LD->isIndexed())
      return false;
    EVT VT = LD->getMemoryVT();
    if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
      return false;
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    if (ST->isIndexed())
      return false;
    EVT VT = ST->getMemoryVT();
    if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
      return false;
    Ptr = ST->getBasePtr();
    IsLoad = false;
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    if (LD->isIndexed())
      return false;
    EVT VT = LD->getMemoryVT();
    if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
        !TLI.isIndexedMaskedLoadLegal(Dec, VT))
      return false;
    Ptr = LD->getBasePtr();
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    if (ST->isIndexed())
      return false;
    EVT VT = ST->getMemoryVT();
    if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
        !TLI.isIndexedMaskedStoreLegal(Dec, VT))
      return false;
    Ptr = ST->getBasePtr();
    IsLoad = false;
    IsMasked = true;
  } else {
    return false;
  }
  return true;
}

/// Try
/// turning a load/store into a pre-indexed load/store when the base
/// pointer is an add or subtract and it has other uses besides the load/store.
/// After the transformation, the new indexed load/store has effectively folded
/// the add/subtract in and all of its other uses are redirected to the
/// new load/store.
bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
  if (Level < AfterLegalizeDAG)
    return false;

  bool IsLoad = true;
  bool IsMasked = false;
  SDValue Ptr;
  if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
                                Ptr, TLI))
    return false;

  // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
  // out. There is no reason to make this a preinc/predec.
  if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
      Ptr.getNode()->hasOneUse())
    return false;

  // Ask the target to do addressing mode selection.
  SDValue BasePtr;
  SDValue Offset;
  ISD::MemIndexedMode AM = ISD::UNINDEXED;
  if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
    return false;

  // Backends without true r+i pre-indexed forms may need to pass a
  // constant base with a variable offset so that constant coercion
  // will work with the patterns in canonical form.
  bool Swapped = false;
  if (isa<ConstantSDNode>(BasePtr)) {
    std::swap(BasePtr, Offset);
    Swapped = true;
  }

  // Don't create a indexed load / store with zero offset.
  if (isNullConstant(Offset))
    return false;

  // Try turning it into a pre-indexed load / store except when:
  // 1) The new base ptr is a frame index.
  // 2) If N is a store and the new base ptr is either the same as or is a
  //    predecessor of the value being stored.
  // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
  //    that would create a cycle.
  // 4) All uses are load / store ops that use it as old base ptr.

  // Check #1.  Preinc'ing a frame index would require copying the stack pointer
  // (plus the implicit offset) to a register to preinc anyway.
  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
    return false;

  // Check #2.
  if (!IsLoad) {
    SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
                           : cast<StoreSDNode>(N)->getValue();

    // Would require a copy.
    if (Val == BasePtr)
      return false;

    // Would create a cycle.
    if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
      return false;
  }

  // Caches for hasPredecessorHelper.
  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(N);

  // If the offset is a constant, there may be other adds of constants that
  // can be folded with this one. We should do this to avoid having to keep
  // a copy of the original base pointer.
  SmallVector<SDNode *, 16> OtherUses;
  if (isa<ConstantSDNode>(Offset))
    for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
                              UE = BasePtr.getNode()->use_end();
         UI != UE; ++UI) {
      SDUse &Use = UI.getUse();
      // Skip the use that is Ptr and uses of other results from BasePtr's
      // node (important for nodes that return multiple results).
      if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
        continue;

      // A use that is a predecessor of N cannot be rewritten (see check #3).
      if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
        continue;

      if (Use.getUser()->getOpcode() != ISD::ADD &&
          Use.getUser()->getOpcode() != ISD::SUB) {
        OtherUses.clear();
        break;
      }

      // The other operand of the add/sub must be a constant of the same type
      // as Offset for the rewrite below to be valid.
      SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
      if (!isa<ConstantSDNode>(Op1)) {
        OtherUses.clear();
        break;
      }

      // FIXME: In some cases, we can be smarter about this.
      if (Op1.getValueType() != Offset.getValueType()) {
        OtherUses.clear();
        break;
      }

      OtherUses.push_back(Use.getUser());
    }

  if (Swapped)
    std::swap(BasePtr, Offset);

  // Now check for #3 and #4.
  bool RealUse = false;

  for (SDNode *Use : Ptr.getNode()->uses()) {
    if (Use == N)
      continue;
    if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
      return false;

    // If Ptr may be folded in addressing mode of other use, then it's
    // not profitable to do this transformation.
    if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
      RealUse = true;
  }

  if (!RealUse)
    return false;

  // Build the indexed form of the load/store.
  SDValue Result;
  if (!IsMasked) {
    if (IsLoad)
      Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
    else
      Result =
          DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
  } else {
    if (IsLoad)
      Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
                                        Offset, AM);
    else
      Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
                                         Offset, AM);
  }
  ++PreIndexedNodes;
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
             Result.getNode()->dump(&DAG); dbgs() << '\n');
  WorklistRemover DeadNodes(*this);
  if (IsLoad) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
  } else {
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
  }

  // Finally, since the node is now dead, remove it from the graph.
  deleteAndRecombine(N);

  if (Swapped)
    std::swap(BasePtr, Offset);

  // Replace other uses of BasePtr that can be updated to use Ptr
  for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
    unsigned OffsetIdx = 1;
    if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
      OffsetIdx = 0;
    assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
           BasePtr.getNode() && "Expected BasePtr operand");

    // We need to replace ptr0 in the following expression:
    //   x0 * offset0 + y0 * ptr0 = t0
    // knowing that
    //   x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
    //
    // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
    // indexed load/store and the expression that needs to be re-written.
    //
    // Therefore, we have:
    //   t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1

    ConstantSDNode *CN =
        cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
    int X0, X1, Y0, Y1;
    const APInt &Offset0 = CN->getAPIntValue();
    APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();

    X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
    Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
    X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
    Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;

    unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;

    APInt CNV = Offset0;
    if (X0 < 0) CNV = -CNV;
    if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
    else CNV = CNV - Offset1;

    SDLoc DL(OtherUses[i]);

    // We can now generate the new expression.
    SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
    SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);

    SDValue NewUse = DAG.getNode(Opcode,
                                 DL,
                                 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
    DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
    deleteAndRecombine(OtherUses[i]);
  }

  // Replace the uses of Ptr with uses of the updated base value.
  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
  deleteAndRecombine(Ptr.getNode());
  AddToWorklist(Result.getNode());

  return true;
}

/// Try to combine a load/store with a add/sub of the base pointer node into a
/// post-indexed load/store. The transformation folded the add/subtract into the
/// new indexed load/store effectively and all of its uses are redirected to the
/// new load/store.
bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
  if (Level < AfterLegalizeDAG)
    return false;

  bool IsLoad = true;
  bool IsMasked = false;
  SDValue Ptr;
  if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked,
                                Ptr, TLI))
    return false;

  if (Ptr.getNode()->hasOneUse())
    return false;

  // Look for an add/sub of the base pointer we could fold in as the
  // post-increment/decrement.
  for (SDNode *Op : Ptr.getNode()->uses()) {
    if (Op == N ||
        (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
      continue;

    SDValue BasePtr;
    SDValue Offset;
    ISD::MemIndexedMode AM = ISD::UNINDEXED;
    if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
      // Don't create a indexed load / store with zero offset.
      if (isNullConstant(Offset))
        continue;

      // Try turning it into a post-indexed load / store except when
      // 1) All uses are load / store ops that use it as base ptr (and
      //    it may be folded as addressing mmode).
      // 2) Op must be independent of N, i.e. Op is neither a predecessor
      //    nor a successor of N. Otherwise, if Op is folded that would
      //    create a cycle.

      // Post-inc'ing a frame index or register base would require a copy;
      // skip those (mirrors the pre-indexed check).
      if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
        continue;

      // Check for #1.
      bool TryNext = false;
      for (SDNode *Use : BasePtr.getNode()->uses()) {
        if (Use == Ptr.getNode())
          continue;

        // If all the uses are load / store addresses, then don't do the
        // transformation.
        if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
          bool RealUse = false;
          for (SDNode *UseUse : Use->uses()) {
            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
              RealUse = true;
          }

          if (!RealUse) {
            TryNext = true;
            break;
          }
        }
      }

      if (TryNext)
        continue;

      // Check for #2.
      SmallPtrSet<const SDNode *, 32> Visited;
      SmallVector<const SDNode *, 8> Worklist;
      // Ptr is predecessor to both N and Op.
      Visited.insert(Ptr.getNode());
      Worklist.push_back(N);
      Worklist.push_back(Op);
      if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
          !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) {
        SDValue Result;
        if (!IsMasked)
          Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
                                               Offset, AM)
                          : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
                                                BasePtr, Offset, AM);
        else
          Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
                                                     BasePtr, Offset, AM)
                          : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
                                                      BasePtr, Offset, AM);
        ++PostIndexedNodes;
        ++NodesCombined;
        LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
                   dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
                   dbgs() << '\n');
        WorklistRemover DeadNodes(*this);
        if (IsLoad) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
        } else {
          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
        }

        // Finally, since the node is now dead, remove it from the graph.
        deleteAndRecombine(N);

        // Replace the uses of Use with uses of the updated base value.
        DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
                                      Result.getValue(IsLoad ? 1 : 0));
        deleteAndRecombine(Op);
        return true;
      }
    }
  }

  return false;
}

/// Return the base-pointer arithmetic from an indexed \p LD.
SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
  ISD::MemIndexedMode AM = LD->getAddressingMode();
  assert(AM != ISD::UNINDEXED);
  SDValue BP = LD->getOperand(1);   // base pointer
  SDValue Inc = LD->getOperand(2);  // increment/offset

  // Some backends use TargetConstants for load offsets, but don't expect
  // TargetConstants in general ADD nodes. We can convert these constants into
  // regular Constants (if the constant is not opaque).
  assert((Inc.getOpcode() != ISD::TargetConstant ||
          !cast<ConstantSDNode>(Inc)->isOpaque()) &&
         "Cannot split out indexing using opaque target constants");
  if (Inc.getOpcode() == ISD::TargetConstant) {
    ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
    Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
                          ConstInc->getValueType(0));
  }

  unsigned Opc =
      (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
  return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
}

// Number of vector elements, or 0 for a scalar type.
static inline int numVectorEltsOrZero(EVT T) {
  return T.isVector() ? T.getVectorNumElements() : 0;
}

/// Compute the value that \p ST actually writes to memory (i.e. the stored
/// value truncated to the memory VT), placing it in \p Val. Returns false if
/// no suitable truncation can be modeled.
bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
  Val = ST->getValue();
  EVT STType = Val.getValueType();
  EVT STMemType = ST->getMemoryVT();
  if (STType == STMemType)
    return true;
  if (isTypeLegal(STMemType))
    return false; // fail.
  if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
      TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
    Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
    return true;
  }
  if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
      STType.isInteger() && STMemType.isInteger()) {
    Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
    return true;
  }
  if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
    Val = DAG.getBitcast(STMemType, Val);
    return true;
  }
  return false; // fail.
}

/// Extend \p Val (of \p LD's memory VT) to \p LD's result VT according to the
/// load's extension kind. Returns false if the extension cannot be modeled.
bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
  EVT LDMemType = LD->getMemoryVT();
  EVT LDType = LD->getValueType(0);
  assert(Val.getValueType() == LDMemType &&
         "Attempting to extend value of non-matching type");
  if (LDType == LDMemType)
    return true;
  if (LDMemType.isInteger() && LDType.isInteger()) {
    switch (LD->getExtensionType()) {
    case ISD::NON_EXTLOAD:
      Val = DAG.getBitcast(LDType, Val);
      return true;
    case ISD::EXTLOAD:
      Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
      return true;
    case ISD::SEXTLOAD:
      Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
      return true;
    case ISD::ZEXTLOAD:
      Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
      return true;
    }
  }
  return false;
}

/// If \p LD loads exactly the bytes that the store immediately preceding it on
/// the chain wrote, forward the stored value to the load (store-to-load
/// forwarding), avoiding the memory round-trip.
SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
  if (OptLevel == CodeGenOpt::None || !LD->isSimple())
    return SDValue();
  SDValue Chain = LD->getOperand(0);
  StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
  // TODO: Relax this restriction for unordered atomics (see D66309)
  if (!ST || !ST->isSimple())
    return SDValue();

  EVT LDType = LD->getValueType(0);
  EVT LDMemType = LD->getMemoryVT();
  EVT STMemType = ST->getMemoryVT();
  EVT STType = ST->getValue().getValueType();

  BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
  BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
  int64_t Offset;
  if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
    return SDValue();

  // Normalize for Endianness. After this Offset=0 will denote that the least
  // significant bit in the loaded value maps to the least significant bit in
  // the stored value).
  // With Offset=n (for n > 0) the loaded value starts at the
  // n:th least significant byte of the stored value.
  if (DAG.getDataLayout().isBigEndian())
    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;

  // Check that the stored value cover all bits that are loaded.
  bool STCoversLD =
      (Offset >= 0) &&
      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());

  // Helper that replaces LD with (Val, [written-back pointer,] Chain). For an
  // indexed load, the pointer update is re-materialized as an explicit
  // add/sub so users of the written-back pointer keep working.
  auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
    if (LD->isIndexed()) {
      bool IsSub = (LD->getAddressingMode() == ISD::PRE_DEC ||
                    LD->getAddressingMode() == ISD::POST_DEC);
      unsigned Opc = IsSub ? ISD::SUB : ISD::ADD;
      SDValue Idx = DAG.getNode(Opc, SDLoc(LD), LD->getOperand(1).getValueType(),
                                LD->getOperand(1), LD->getOperand(2));
      SDValue Ops[] = {Val, Idx, Chain};
      return CombineTo(LD, Ops, 3);
    }
    return CombineTo(LD, Val, Chain);
  };

  if (!STCoversLD)
    return SDValue();

  // Memory as copy space (potentially masked).
  if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
    // Simple case: Direct non-truncating forwarding
    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
      return ReplaceLd(LD, ST->getValue(), Chain);
    // Can we model the truncate and extension with an and mask?
    if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
        !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
      // Mask to size of LDMemType
      auto Mask =
          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
                                               STMemType.getSizeInBits()),
                          SDLoc(ST), STType);
      auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
      return ReplaceLd(LD, Val, Chain);
    }
  }

  // TODO: Deal with nonzero offset.
  if (LD->getBasePtr().isUndef() || Offset != 0)
    return SDValue();
  // Model necessary truncations / extenstions.
  SDValue Val;
  // Truncate Value To Stored Memory Size.
  // Note: "continue" inside do { } while (false) jumps to the (false)
  // condition, i.e. acts as a structured early exit to the cleanup below.
  do {
    if (!getTruncatedStoreValue(ST, Val))
      continue;
    if (!isTypeLegal(LDMemType))
      continue;
    if (STMemType != LDMemType) {
      // TODO: Support vectors? This requires extract_subvector/bitcast.
      if (!STMemType.isVector() && !LDMemType.isVector() &&
          STMemType.isInteger() && LDMemType.isInteger())
        Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
      else
        continue;
    }
    if (!extendLoadedValueToExtension(LD, Val))
      continue;
    return ReplaceLd(LD, Val, Chain);
  } while (false);

  // On failure, cleanup dead nodes we may have created.
  if (Val->use_empty())
    deleteAndRecombine(Val.getNode());
  return SDValue();
}

SDValue DAGCombiner::visitLOAD(SDNode *N) {
  LoadSDNode *LD = cast<LoadSDNode>(N);
  SDValue Chain = LD->getChain();
  SDValue Ptr = LD->getBasePtr();

  // If load is not volatile and there are no uses of the loaded value (and
  // the updated indexed value in case of indexed loads), change uses of the
  // chain value into uses of the chain input (i.e. delete the dead load).
  // TODO: Allow this for unordered atomics (see D66309)
  if (LD->isSimple()) {
    if (N->getValueType(1) == MVT::Other) {
      // Unindexed loads.
      if (!N->hasAnyUseOfValue(0)) {
        // It's not safe to use the two value CombineTo variant here. e.g.
        // v1, chain2 = load chain1, loc
        // v2, chain3 = load chain2, loc
        // v3 = add v2, c
        // Now we replace use of chain2 with chain1. This makes the second load
        // isomorphic to the one we are deleting, and thus makes this load live.
        LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
                   dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG);
                   dbgs() << "\n");
        WorklistRemover DeadNodes(*this);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
        AddUsersToWorklist(Chain.getNode());
        if (N->use_empty())
          deleteAndRecombine(N);

        return SDValue(N, 0); // Return N so it doesn't get rechecked!
      }
    } else {
      // Indexed loads.
      assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");

      // If this load has an opaque TargetConstant offset, then we cannot split
      // the indexing into an add/sub directly (that TargetConstant may not be
      // valid for a different type of node, and we cannot convert an opaque
      // target constant into a regular constant).
      bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant &&
                       cast<ConstantSDNode>(LD->getOperand(2))->isOpaque();

      if (!N->hasAnyUseOfValue(0) &&
          ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) {
        SDValue Undef = DAG.getUNDEF(N->getValueType(0));
        SDValue Index;
        if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) {
          Index = SplitIndexingFromLoad(LD);
          // Try to fold the base pointer arithmetic into subsequent loads and
          // stores.
          AddUsersToWorklist(N);
        } else
          Index = DAG.getUNDEF(N->getValueType(1));
        LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
                   dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG);
                   dbgs() << " and 2 other values\n");
        WorklistRemover DeadNodes(*this);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
        deleteAndRecombine(N);
        return SDValue(N, 0); // Return N so it doesn't get rechecked!
14371 } 14372 } 14373 } 14374 14375 // If this load is directly stored, replace the load value with the stored 14376 // value. 14377 if (auto V = ForwardStoreValueToDirectLoad(LD)) 14378 return V; 14379 14380 // Try to infer better alignment information than the load already has. 14381 if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) { 14382 if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { 14383 if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) { 14384 SDValue NewLoad = DAG.getExtLoad( 14385 LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr, 14386 LD->getPointerInfo(), LD->getMemoryVT(), Align, 14387 LD->getMemOperand()->getFlags(), LD->getAAInfo()); 14388 // NewLoad will always be N as we are only refining the alignment 14389 assert(NewLoad.getNode() == N); 14390 (void)NewLoad; 14391 } 14392 } 14393 } 14394 14395 if (LD->isUnindexed()) { 14396 // Walk up chain skipping non-aliasing memory nodes. 14397 SDValue BetterChain = FindBetterChain(LD, Chain); 14398 14399 // If there is a better chain. 14400 if (Chain != BetterChain) { 14401 SDValue ReplLoad; 14402 14403 // Replace the chain to void dependency. 14404 if (LD->getExtensionType() == ISD::NON_EXTLOAD) { 14405 ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD), 14406 BetterChain, Ptr, LD->getMemOperand()); 14407 } else { 14408 ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), 14409 LD->getValueType(0), 14410 BetterChain, Ptr, LD->getMemoryVT(), 14411 LD->getMemOperand()); 14412 } 14413 14414 // Create token factor to keep old chain connected. 14415 SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), 14416 MVT::Other, Chain, ReplLoad.getValue(1)); 14417 14418 // Replace uses with load result and token factor 14419 return CombineTo(N, ReplLoad.getValue(0), Token); 14420 } 14421 } 14422 14423 // Try transforming N to an indexed load. 
14424 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) 14425 return SDValue(N, 0); 14426 14427 // Try to slice up N to more direct loads if the slices are mapped to 14428 // different register banks or pairing can take place. 14429 if (SliceUpLoad(N)) 14430 return SDValue(N, 0); 14431 14432 return SDValue(); 14433 } 14434 14435 namespace { 14436 14437 /// Helper structure used to slice a load in smaller loads. 14438 /// Basically a slice is obtained from the following sequence: 14439 /// Origin = load Ty1, Base 14440 /// Shift = srl Ty1 Origin, CstTy Amount 14441 /// Inst = trunc Shift to Ty2 14442 /// 14443 /// Then, it will be rewritten into: 14444 /// Slice = load SliceTy, Base + SliceOffset 14445 /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2 14446 /// 14447 /// SliceTy is deduced from the number of bits that are actually used to 14448 /// build Inst. 14449 struct LoadedSlice { 14450 /// Helper structure used to compute the cost of a slice. 14451 struct Cost { 14452 /// Are we optimizing for code size. 14453 bool ForCodeSize = false; 14454 14455 /// Various cost. 14456 unsigned Loads = 0; 14457 unsigned Truncates = 0; 14458 unsigned CrossRegisterBanksCopies = 0; 14459 unsigned ZExts = 0; 14460 unsigned Shift = 0; 14461 14462 explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {} 14463 14464 /// Get the cost of one isolated slice. 14465 Cost(const LoadedSlice &LS, bool ForCodeSize) 14466 : ForCodeSize(ForCodeSize), Loads(1) { 14467 EVT TruncType = LS.Inst->getValueType(0); 14468 EVT LoadedType = LS.getLoadedType(); 14469 if (TruncType != LoadedType && 14470 !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType)) 14471 ZExts = 1; 14472 } 14473 14474 /// Account for slicing gain in the current cost. 14475 /// Slicing provide a few gains like removing a shift or a 14476 /// truncate. This method allows to grow the cost of the original 14477 /// load with the gain from this slice. 
14478 void addSliceGain(const LoadedSlice &LS) { 14479 // Each slice saves a truncate. 14480 const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); 14481 if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(), 14482 LS.Inst->getValueType(0))) 14483 ++Truncates; 14484 // If there is a shift amount, this slice gets rid of it. 14485 if (LS.Shift) 14486 ++Shift; 14487 // If this slice can merge a cross register bank copy, account for it. 14488 if (LS.canMergeExpensiveCrossRegisterBankCopy()) 14489 ++CrossRegisterBanksCopies; 14490 } 14491 14492 Cost &operator+=(const Cost &RHS) { 14493 Loads += RHS.Loads; 14494 Truncates += RHS.Truncates; 14495 CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies; 14496 ZExts += RHS.ZExts; 14497 Shift += RHS.Shift; 14498 return *this; 14499 } 14500 14501 bool operator==(const Cost &RHS) const { 14502 return Loads == RHS.Loads && Truncates == RHS.Truncates && 14503 CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies && 14504 ZExts == RHS.ZExts && Shift == RHS.Shift; 14505 } 14506 14507 bool operator!=(const Cost &RHS) const { return !(*this == RHS); } 14508 14509 bool operator<(const Cost &RHS) const { 14510 // Assume cross register banks copies are as expensive as loads. 14511 // FIXME: Do we want some more target hooks? 14512 unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies; 14513 unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies; 14514 // Unless we are optimizing for code size, consider the 14515 // expensive operation first. 
14516 if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS) 14517 return ExpensiveOpsLHS < ExpensiveOpsRHS; 14518 return (Truncates + ZExts + Shift + ExpensiveOpsLHS) < 14519 (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS); 14520 } 14521 14522 bool operator>(const Cost &RHS) const { return RHS < *this; } 14523 14524 bool operator<=(const Cost &RHS) const { return !(RHS < *this); } 14525 14526 bool operator>=(const Cost &RHS) const { return !(*this < RHS); } 14527 }; 14528 14529 // The last instruction that represent the slice. This should be a 14530 // truncate instruction. 14531 SDNode *Inst; 14532 14533 // The original load instruction. 14534 LoadSDNode *Origin; 14535 14536 // The right shift amount in bits from the original load. 14537 unsigned Shift; 14538 14539 // The DAG from which Origin came from. 14540 // This is used to get some contextual information about legal types, etc. 14541 SelectionDAG *DAG; 14542 14543 LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr, 14544 unsigned Shift = 0, SelectionDAG *DAG = nullptr) 14545 : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {} 14546 14547 /// Get the bits used in a chunk of bits \p BitWidth large. 14548 /// \return Result is \p BitWidth and has used bits set to 1 and 14549 /// not used bits set to 0. 14550 APInt getUsedBits() const { 14551 // Reproduce the trunc(lshr) sequence: 14552 // - Start from the truncated value. 14553 // - Zero extend to the desired bit width. 14554 // - Shift left. 
14555 assert(Origin && "No original load to compare against."); 14556 unsigned BitWidth = Origin->getValueSizeInBits(0); 14557 assert(Inst && "This slice is not bound to an instruction"); 14558 assert(Inst->getValueSizeInBits(0) <= BitWidth && 14559 "Extracted slice is bigger than the whole type!"); 14560 APInt UsedBits(Inst->getValueSizeInBits(0), 0); 14561 UsedBits.setAllBits(); 14562 UsedBits = UsedBits.zext(BitWidth); 14563 UsedBits <<= Shift; 14564 return UsedBits; 14565 } 14566 14567 /// Get the size of the slice to be loaded in bytes. 14568 unsigned getLoadedSize() const { 14569 unsigned SliceSize = getUsedBits().countPopulation(); 14570 assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); 14571 return SliceSize / 8; 14572 } 14573 14574 /// Get the type that will be loaded for this slice. 14575 /// Note: This may not be the final type for the slice. 14576 EVT getLoadedType() const { 14577 assert(DAG && "Missing context"); 14578 LLVMContext &Ctxt = *DAG->getContext(); 14579 return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8); 14580 } 14581 14582 /// Get the alignment of the load used for this slice. 14583 unsigned getAlignment() const { 14584 unsigned Alignment = Origin->getAlignment(); 14585 uint64_t Offset = getOffsetFromBase(); 14586 if (Offset != 0) 14587 Alignment = MinAlign(Alignment, Alignment + Offset); 14588 return Alignment; 14589 } 14590 14591 /// Check if this slice can be rewritten with legal operations. 14592 bool isLegal() const { 14593 // An invalid slice is not legal. 14594 if (!Origin || !Inst || !DAG) 14595 return false; 14596 14597 // Offsets are for indexed load only, we do not handle that. 14598 if (!Origin->getOffset().isUndef()) 14599 return false; 14600 14601 const TargetLowering &TLI = DAG->getTargetLoweringInfo(); 14602 14603 // Check that the type is legal. 14604 EVT SliceType = getLoadedType(); 14605 if (!TLI.isTypeLegal(SliceType)) 14606 return false; 14607 14608 // Check that the load is legal for this type. 
14609 if (!TLI.isOperationLegal(ISD::LOAD, SliceType)) 14610 return false; 14611 14612 // Check that the offset can be computed. 14613 // 1. Check its type. 14614 EVT PtrType = Origin->getBasePtr().getValueType(); 14615 if (PtrType == MVT::Untyped || PtrType.isExtended()) 14616 return false; 14617 14618 // 2. Check that it fits in the immediate. 14619 if (!TLI.isLegalAddImmediate(getOffsetFromBase())) 14620 return false; 14621 14622 // 3. Check that the computation is legal. 14623 if (!TLI.isOperationLegal(ISD::ADD, PtrType)) 14624 return false; 14625 14626 // Check that the zext is legal if it needs one. 14627 EVT TruncateType = Inst->getValueType(0); 14628 if (TruncateType != SliceType && 14629 !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType)) 14630 return false; 14631 14632 return true; 14633 } 14634 14635 /// Get the offset in bytes of this slice in the original chunk of 14636 /// bits. 14637 /// \pre DAG != nullptr. 14638 uint64_t getOffsetFromBase() const { 14639 assert(DAG && "Missing context."); 14640 bool IsBigEndian = DAG->getDataLayout().isBigEndian(); 14641 assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported."); 14642 uint64_t Offset = Shift / 8; 14643 unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8; 14644 assert(!(Origin->getValueSizeInBits(0) & 0x7) && 14645 "The size of the original loaded type is not a multiple of a" 14646 " byte."); 14647 // If Offset is bigger than TySizeInBytes, it means we are loading all 14648 // zeros. This should have been optimized before in the process. 14649 assert(TySizeInBytes > Offset && 14650 "Invalid shift amount for given loaded size"); 14651 if (IsBigEndian) 14652 Offset = TySizeInBytes - Offset - getLoadedSize(); 14653 return Offset; 14654 } 14655 14656 /// Generate the sequence of instructions to load the slice 14657 /// represented by this object and redirect the uses of this slice to 14658 /// this new sequence of instructions. 
14659 /// \pre this->Inst && this->Origin are valid Instructions and this 14660 /// object passed the legal check: LoadedSlice::isLegal returned true. 14661 /// \return The last instruction of the sequence used to load the slice. 14662 SDValue loadSlice() const { 14663 assert(Inst && Origin && "Unable to replace a non-existing slice."); 14664 const SDValue &OldBaseAddr = Origin->getBasePtr(); 14665 SDValue BaseAddr = OldBaseAddr; 14666 // Get the offset in that chunk of bytes w.r.t. the endianness. 14667 int64_t Offset = static_cast<int64_t>(getOffsetFromBase()); 14668 assert(Offset >= 0 && "Offset too big to fit in int64_t!"); 14669 if (Offset) { 14670 // BaseAddr = BaseAddr + Offset. 14671 EVT ArithType = BaseAddr.getValueType(); 14672 SDLoc DL(Origin); 14673 BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr, 14674 DAG->getConstant(Offset, DL, ArithType)); 14675 } 14676 14677 // Create the type of the loaded slice according to its size. 14678 EVT SliceType = getLoadedType(); 14679 14680 // Create the load for the slice. 14681 SDValue LastInst = 14682 DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, 14683 Origin->getPointerInfo().getWithOffset(Offset), 14684 getAlignment(), Origin->getMemOperand()->getFlags()); 14685 // If the final type is not the same as the loaded type, this means that 14686 // we have to pad with zero. Create a zero extend for that. 14687 EVT FinalType = Inst->getValueType(0); 14688 if (SliceType != FinalType) 14689 LastInst = 14690 DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst); 14691 return LastInst; 14692 } 14693 14694 /// Check if this slice can be merged with an expensive cross register 14695 /// bank copy. 
E.g., 14696 /// i = load i32 14697 /// f = bitcast i32 i to float 14698 bool canMergeExpensiveCrossRegisterBankCopy() const { 14699 if (!Inst || !Inst->hasOneUse()) 14700 return false; 14701 SDNode *Use = *Inst->use_begin(); 14702 if (Use->getOpcode() != ISD::BITCAST) 14703 return false; 14704 assert(DAG && "Missing context"); 14705 const TargetLowering &TLI = DAG->getTargetLoweringInfo(); 14706 EVT ResVT = Use->getValueType(0); 14707 const TargetRegisterClass *ResRC = 14708 TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent()); 14709 const TargetRegisterClass *ArgRC = 14710 TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(), 14711 Use->getOperand(0)->isDivergent()); 14712 if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT)) 14713 return false; 14714 14715 // At this point, we know that we perform a cross-register-bank copy. 14716 // Check if it is expensive. 14717 const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); 14718 // Assume bitcasts are cheap, unless both register classes do not 14719 // explicitly share a common sub class. 14720 if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) 14721 return false; 14722 14723 // Check if it will be merged with the load. 14724 // 1. Check the alignment constraint. 14725 unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment( 14726 ResVT.getTypeForEVT(*DAG->getContext())); 14727 14728 if (RequiredAlignment > getAlignment()) 14729 return false; 14730 14731 // 2. Check that the load is a legal operation for that type. 14732 if (!TLI.isOperationLegal(ISD::LOAD, ResVT)) 14733 return false; 14734 14735 // 3. Check that we do not have a zext in the way. 14736 if (Inst->getValueType(0) != getLoadedType()) 14737 return false; 14738 14739 return true; 14740 } 14741 }; 14742 14743 } // end anonymous namespace 14744 14745 /// Check that all bits set in \p UsedBits form a dense region, i.e., 14746 /// \p UsedBits looks like 0..0 1..1 0..0. 
static bool areUsedBitsDense(const APInt &UsedBits) {
  // If all the bits are one, this is dense!
  if (UsedBits.isAllOnesValue())
    return true;

  // Get rid of the unused bits on the right.
  APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
  // Get rid of the unused bits on the left.
  if (NarrowedUsedBits.countLeadingZeros())
    NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
  // Check that the chunk of bits is completely used.
  return NarrowedUsedBits.isAllOnesValue();
}

/// Check whether or not \p First and \p Second are next to each other
/// in memory. This means that there is no hole between the bits loaded
/// by \p First and the bits loaded by \p Second.
static bool areSlicesNextToEachOther(const LoadedSlice &First,
                                     const LoadedSlice &Second) {
  assert(First.Origin == Second.Origin && First.Origin &&
         "Unable to match different memory origins.");
  APInt UsedBits = First.getUsedBits();
  assert((UsedBits & Second.getUsedBits()) == 0 &&
         "Slices are not supposed to overlap.");
  UsedBits |= Second.getUsedBits();
  return areUsedBitsDense(UsedBits);
}

/// Adjust the \p GlobalLSCost according to the target
/// pairing capabilities and the layout of the slices.
/// \pre \p GlobalLSCost should account for at least as many loads as
/// there are in the slices in \p LoadedSlices.
static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                 LoadedSlice::Cost &GlobalLSCost) {
  unsigned NumberOfSlices = LoadedSlices.size();
  // If there are fewer than 2 elements, no pairing is possible.
  if (NumberOfSlices < 2)
    return;

  // Sort the slices so that elements that are likely to be next to each
  // other in memory are next to each other in the list.
  llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
  });
  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
  // First (resp. Second) is the first (resp. second) potential candidate
  // to be placed in a paired load.
  const LoadedSlice *First = nullptr;
  const LoadedSlice *Second = nullptr;
  // Note: the increment clause also slides the pair window forward
  // (First = Second) on every iteration.
  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
                // Set the beginning of the pair.
                                                           First = Second) {
    Second = &LoadedSlices[CurrSlice];

    // If First is NULL, it means we start a new pair.
    // Get to the next slice.
    if (!First)
      continue;

    EVT LoadedType = First->getLoadedType();

    // If the types of the slices are different, we cannot pair them.
    if (LoadedType != Second->getLoadedType())
      continue;

    // Check if the target supplies paired loads for this type.
    unsigned RequiredAlignment = 0;
    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
      // move to the next pair, this type is hopeless.
      Second = nullptr;
      continue;
    }
    // Check if we meet the alignment requirement.
    if (RequiredAlignment > First->getAlignment())
      continue;

    // Check that both loads are next to each other in memory.
    if (!areSlicesNextToEachOther(*First, *Second))
      continue;

    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
    // A paired load folds two loads into one: credit one load back.
    --GlobalLSCost.Loads;
    // Move to the next pair.
    Second = nullptr;
  }
}

/// Check the profitability of all involved LoadedSlice.
/// Currently, it is considered profitable if there are exactly two
/// involved slices (1) which are (2) next to each other in memory, and
/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
///
/// Note: The order of the elements in \p LoadedSlices may be modified, but not
/// the elements themselves.
///
/// FIXME: When the cost model will be mature enough, we can relax
/// constraints (1) and (2).
static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                const APInt &UsedBits, bool ForCodeSize) {
  unsigned NumberOfSlices = LoadedSlices.size();
  // Under -combiner-stress-load-slicing, slice whenever more than one slice
  // exists, regardless of cost.
  if (StressLoadSlicing)
    return NumberOfSlices > 1;

  // Check (1).
  if (NumberOfSlices != 2)
    return false;

  // Check (2).
  if (!areUsedBitsDense(UsedBits))
    return false;

  // Check (3).
  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
  // The original code has one big load.
  OrigCost.Loads = 1;
  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
    const LoadedSlice &LS = LoadedSlices[CurrSlice];
    // Accumulate the cost of all the slices.
    LoadedSlice::Cost SliceCost(LS, ForCodeSize);
    GlobalSlicingCost += SliceCost;

    // Account as cost in the original configuration the gain obtained
    // with the current slices.
    OrigCost.addSliceGain(LS);
  }

  // If the target supports paired load, adjust the cost accordingly.
  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
  return OrigCost > GlobalSlicingCost;
}

/// If the given load, \p LI, is used only by trunc or trunc(lshr)
/// operations, split it in the various pieces being extracted.
///
/// This sort of thing is introduced by SROA.
/// This slicing takes care not to insert overlapping loads.
/// \pre LI is a simple load (i.e., not an atomic or volatile load).
bool DAGCombiner::SliceUpLoad(SDNode *N) {
  if (Level < AfterLegalizeDAG)
    return false;

  LoadSDNode *LD = cast<LoadSDNode>(N);
  if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
      !LD->getValueType(0).isInteger())
    return false;

  // Keep track of already used bits to detect overlapping values.
  // In that case, we will just abort the transformation.
  APInt UsedBits(LD->getValueSizeInBits(0), 0);

  SmallVector<LoadedSlice, 4> LoadedSlices;

  // Check if this load is used as several smaller chunks of bits.
  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
  // of computation for each trunc.
  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
       UI != UIEnd; ++UI) {
    // Skip the uses of the chain.
    if (UI.getUse().getResNo() != 0)
      continue;

    SDNode *User = *UI;
    unsigned Shift = 0;

    // Check if this is a trunc(lshr).
    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
        isa<ConstantSDNode>(User->getOperand(1))) {
      Shift = User->getConstantOperandVal(1);
      // Peek through the shift to its single user, which must be the trunc.
      User = *User->use_begin();
    }

    // At this point, User is a TRUNCATE, iff we encountered trunc or
    // trunc(lshr).
    if (User->getOpcode() != ISD::TRUNCATE)
      return false;

    // The width of the type must be a power of 2 and greater than 8-bits.
    // Otherwise the load cannot be represented in LLVM IR.
    // Moreover, if we shifted with a non-8-bits multiple, the slice
    // will be across several bytes. We do not support that.
    unsigned Width = User->getValueSizeInBits(0);
    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
      return false;

    // Build the slice for this chain of computations.
    LoadedSlice LS(User, LD, Shift, &DAG);
    APInt CurrentUsedBits = LS.getUsedBits();

    // Check if this slice overlaps with another.
    if ((CurrentUsedBits & UsedBits) != 0)
      return false;
    // Update the bits used globally.
    UsedBits |= CurrentUsedBits;

    // Check if the new slice would be legal.
    if (!LS.isLegal())
      return false;

    // Record the slice.
    LoadedSlices.push_back(LS);
  }

  // Abort slicing if it does not seem to be profitable.
  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
    return false;

  ++SlicedLoads;

  // Rewrite each chain to use an independent load.
  // By construction, each chain can be represented by a unique load.

  // Prepare the argument for the new token factor for all the slices.
  SmallVector<SDValue, 8> ArgChains;
  for (SmallVectorImpl<LoadedSlice>::const_iterator
           LSIt = LoadedSlices.begin(),
           LSItEnd = LoadedSlices.end();
       LSIt != LSItEnd; ++LSIt) {
    SDValue SliceInst = LSIt->loadSlice();
    CombineTo(LSIt->Inst, SliceInst, true);
    // loadSlice() may wrap the load in a zext; step through it to reach the
    // load node whose chain result we need.
    if (SliceInst.getOpcode() != ISD::LOAD)
      SliceInst = SliceInst.getOperand(0);
    assert(SliceInst->getOpcode() == ISD::LOAD &&
           "It takes more than a zext to get to the loaded slice!!");
    ArgChains.push_back(SliceInst.getValue(1));
  }

  // Tie all slice chains together so the original load's chain users see
  // every new load.
  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
                              ArgChains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
  AddToWorklist(Chain.getNode());
  return true;
}

/// Check to see if V is (and load (ptr), imm), where the load is having
/// specific bytes cleared out. If so, return the byte size being masked out
/// and the shift amount.
static std::pair<unsigned, unsigned>
CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
  // (0, 0) means "no match".
  std::pair<unsigned, unsigned> Result(0, 0);

  // Check for the structure we're looking for.
  if (V->getOpcode() != ISD::AND ||
      !isa<ConstantSDNode>(V->getOperand(1)) ||
      !ISD::isNormalLoad(V->getOperand(0).getNode()))
    return Result;

  // Check the chain and pointer.
  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
  if (LD->getBasePtr() != Ptr) return Result;  // Not from same pointer.

  // This only handles simple types.
  if (V.getValueType() != MVT::i16 &&
      V.getValueType() != MVT::i32 &&
      V.getValueType() != MVT::i64)
    return Result;

  // Check the constant mask. Invert it so that the bits being masked out are
  // 0 and the bits being kept are 1. Use getSExtValue so that leading bits
  // follow the sign bit for uniformity.
  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
  unsigned NotMaskLZ = countLeadingZeros(NotMask);
  if (NotMaskLZ & 7) return Result;  // Must be multiple of a byte.
  unsigned NotMaskTZ = countTrailingZeros(NotMask);
  if (NotMaskTZ & 7) return Result;  // Must be multiple of a byte.
  if (NotMaskLZ == 64) return Result;  // All zero mask.

  // See if we have a continuous run of bits.  If so, we have 0*1+0*
  if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
    return Result;

  // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
  if (V.getValueType() != MVT::i64 && NotMaskLZ)
    NotMaskLZ -= 64-V.getValueSizeInBits();

  unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
  switch (MaskedBytes) {
  case 1:
  case 2:
  case 4: break;
  default: return Result;  // All one mask, or 5-byte mask.
  }

  // Verify that the first bit starts at a multiple of mask so that the access
  // is aligned the same as the access width.
  if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;

  // For narrowing to be valid, it must be the case that the load is the
  // immediately preceding memory operation before the store.
  if (LD == Chain.getNode())
    ; // ok.
  else if (Chain->getOpcode() == ISD::TokenFactor &&
           SDValue(LD, 1).hasOneUse()) {
    // LD has only 1 chain use so there are no indirect dependencies.
    if (!LD->isOperandOf(Chain.getNode()))
      return Result;
  } else
    return Result; // Fail.

  Result.first = MaskedBytes;
  Result.second = NotMaskTZ/8;
  return Result;
}

/// Check to see if IVal is something that provides a value as specified by
/// MaskInfo. If so, replace the specified store with a narrower store of
/// truncated IVal.
static SDValue
ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
                                SDValue IVal, StoreSDNode *St,
                                DAGCombiner *DC) {
  unsigned NumBytes = MaskInfo.first;
  unsigned ByteShift = MaskInfo.second;
  SelectionDAG &DAG = DC->getDAG();

  // Check to see if IVal is all zeros in the part being masked in by the 'or'
  // that uses this. If not, this is not a replacement.
  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
                                  ByteShift*8, (ByteShift+NumBytes)*8);
  if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();

  // Check that it is legal on the target to do this. It is legal if the new
  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
  // legalization (and the target doesn't explicitly think this is a bad idea).
  MVT VT = MVT::getIntegerVT(NumBytes * 8);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!DC->isTypeLegal(VT))
    return SDValue();
  if (St->getMemOperand() &&
      !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                              *St->getMemOperand()))
    return SDValue();

  // Okay, we can do this!  Replace the 'St' store with a store of IVal that is
  // shifted by ByteShift and truncated down to NumBytes.
  if (ByteShift) {
    SDLoc DL(IVal);
    IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
                       DAG.getConstant(ByteShift*8, DL,
                                       DC->getShiftAmountTy(IVal.getValueType())));
  }

  // Figure out the offset for the store and the alignment of the access.
  unsigned StOffset;
  unsigned NewAlign = St->getAlignment();

  if (DAG.getDataLayout().isLittleEndian())
    StOffset = ByteShift;
  else
    // Big endian: the masked bytes sit that much earlier in memory.
    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;

  SDValue Ptr = St->getBasePtr();
  if (StOffset) {
    SDLoc DL(IVal);
    Ptr = DAG.getMemBasePlusOffset(Ptr, StOffset, DL);
    NewAlign = MinAlign(NewAlign, StOffset);
  }

  // Truncate down to the new size.
  IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);

  ++OpsNarrowed;
  return DAG
      .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
                St->getPointerInfo().getWithOffset(StOffset), NewAlign);
}

/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
/// narrowing the load and store if it would end up being a win for performance
/// or code size.
SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  if (!ST->isSimple())
    return SDValue();

  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();
  EVT VT = Value.getValueType();

  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
    return SDValue();

  unsigned Opc = Value.getOpcode();

  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
  // is a byte mask indicating a consecutive number of bytes, check to see if
  // Y is known to provide just those bytes. If so, we try to replace the
  // load + replace + store sequence with a single (narrower) store, which makes
  // the load dead.
  if (Opc == ISD::OR) {
    std::pair<unsigned, unsigned> MaskedLoad;
    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
    if (MaskedLoad.first)
      if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
                                                          Value.getOperand(1),
                                                          ST, this))
        return NewST;

    // Or is commutative, so try swapping X and Y.
    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
    if (MaskedLoad.first)
      if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
                                                          Value.getOperand(0),
                                                          ST, this))
        return NewST;
  }

  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
      Value.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N0 = Value.getOperand(0);
  // Require the exact load/op/store shape: the load feeds only this op and
  // the store's chain comes directly from the load.
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      Chain == SDValue(N0.getNode(), 1)) {
    LoadSDNode *LD = cast<LoadSDNode>(N0);
    if (LD->getBasePtr() != Ptr ||
        LD->getPointerInfo().getAddrSpace() !=
        ST->getPointerInfo().getAddrSpace())
      return SDValue();

    // Find the type to narrow the load / op / store to.
    SDValue N1 = Value.getOperand(1);
    unsigned BitWidth = N1.getValueSizeInBits();
    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
    // For AND, the interesting (changed) bits are where the mask clears bits,
    // so work on the complement.
    if (Opc == ISD::AND)
      Imm ^= APInt::getAllOnesValue(BitWidth);
    if (Imm == 0 || Imm.isAllOnesValue())
      return SDValue();
    // [ShAmt, MSB] is the bit range actually affected by the operation; the
    // narrowed type must cover it, rounded up to a power-of-2 width.
    unsigned ShAmt = Imm.countTrailingZeros();
    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
    // The narrowing should be profitable, the load/store operation should be
    // legal (or custom) and the store size should be equal to the NewVT width.
    while (NewBW < BitWidth &&
           (NewVT.getStoreSizeInBits() != NewBW ||
            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
            !TLI.isNarrowingProfitable(VT, NewVT))) {
      NewBW = NextPowerOf2(NewBW);
      NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
    }
    if (NewBW >= BitWidth)
      return SDValue();

    // If the changed lsb does not start at a NewBW-sized type boundary,
    // start at the previous one.
    if (ShAmt % NewBW)
      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
    // Only proceed if all the changed bits fit in a single NewBW-wide window
    // starting at ShAmt.
    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
                                   std::min(BitWidth, ShAmt + NewBW));
    if ((Imm & Mask) == Imm) {
      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
      // Undo the earlier complement for AND so the narrow op uses the real
      // mask bits.
      if (Opc == ISD::AND)
        NewImm ^= APInt::getAllOnesValue(NewBW);
      uint64_t PtrOff = ShAmt / 8;
      // For big endian targets, we need to adjust the offset to the pointer to
      // load the correct bytes.
      if (DAG.getDataLayout().isBigEndian())
        PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;

      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
      Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
      if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy))
        return SDValue();

      SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD));
      SDValue NewLD =
          DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
                      LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                      LD->getMemOperand()->getFlags(), LD->getAAInfo());
      SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
                                   DAG.getConstant(NewImm, SDLoc(Value),
                                                   NewVT));
      SDValue NewST =
          DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
                       ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);

      AddToWorklist(NewPtr.getNode());
      AddToWorklist(NewLD.getNode());
      AddToWorklist(NewVal.getNode());
      WorklistRemover DeadNodes(*this);
      // Reroute the old load's chain users to the new load's chain so the old
      // load becomes dead.
      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
      ++OpsNarrowed;
      return NewST;
    }
  }

  return SDValue();
}

/// For a given floating point load / store pair, if the load value isn't used
/// by any other operations, then consider transforming the pair to integer
/// load / store operations if the target deems the transformation profitable.
15240 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { 15241 StoreSDNode *ST = cast<StoreSDNode>(N); 15242 SDValue Value = ST->getValue(); 15243 if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) && 15244 Value.hasOneUse()) { 15245 LoadSDNode *LD = cast<LoadSDNode>(Value); 15246 EVT VT = LD->getMemoryVT(); 15247 if (!VT.isFloatingPoint() || 15248 VT != ST->getMemoryVT() || 15249 LD->isNonTemporal() || 15250 ST->isNonTemporal() || 15251 LD->getPointerInfo().getAddrSpace() != 0 || 15252 ST->getPointerInfo().getAddrSpace() != 0) 15253 return SDValue(); 15254 15255 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); 15256 if (!TLI.isOperationLegal(ISD::LOAD, IntVT) || 15257 !TLI.isOperationLegal(ISD::STORE, IntVT) || 15258 !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) || 15259 !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT)) 15260 return SDValue(); 15261 15262 unsigned LDAlign = LD->getAlignment(); 15263 unsigned STAlign = ST->getAlignment(); 15264 Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext()); 15265 unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy); 15266 if (LDAlign < ABIAlign || STAlign < ABIAlign) 15267 return SDValue(); 15268 15269 SDValue NewLD = 15270 DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(), 15271 LD->getPointerInfo(), LDAlign); 15272 15273 SDValue NewST = 15274 DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(), 15275 ST->getPointerInfo(), STAlign); 15276 15277 AddToWorklist(NewLD.getNode()); 15278 AddToWorklist(NewST.getNode()); 15279 WorklistRemover DeadNodes(*this); 15280 DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1)); 15281 ++LdStFP2Int; 15282 return NewST; 15283 } 15284 15285 return SDValue(); 15286 } 15287 15288 // This is a helper function for visitMUL to check the profitability 15289 // of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). 
15290 // MulNode is the original multiply, AddNode is (add x, c1), 15291 // and ConstNode is c2. 15292 // 15293 // If the (add x, c1) has multiple uses, we could increase 15294 // the number of adds if we make this transformation. 15295 // It would only be worth doing this if we can remove a 15296 // multiply in the process. Check for that here. 15297 // To illustrate: 15298 // (A + c1) * c3 15299 // (A + c2) * c3 15300 // We're checking for cases where we have common "c3 * A" expressions. 15301 bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, 15302 SDValue &AddNode, 15303 SDValue &ConstNode) { 15304 APInt Val; 15305 15306 // If the add only has one use, this would be OK to do. 15307 if (AddNode.getNode()->hasOneUse()) 15308 return true; 15309 15310 // Walk all the users of the constant with which we're multiplying. 15311 for (SDNode *Use : ConstNode->uses()) { 15312 if (Use == MulNode) // This use is the one we're on right now. Skip it. 15313 continue; 15314 15315 if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. 15316 SDNode *OtherOp; 15317 SDNode *MulVar = AddNode.getOperand(0).getNode(); 15318 15319 // OtherOp is what we're multiplying against the constant. 15320 if (Use->getOperand(0) == ConstNode) 15321 OtherOp = Use->getOperand(1).getNode(); 15322 else 15323 OtherOp = Use->getOperand(0).getNode(); 15324 15325 // Check to see if multiply is with the same operand of our "add". 15326 // 15327 // ConstNode = CONST 15328 // Use = ConstNode * A <-- visiting Use. OtherOp is A. 15329 // ... 15330 // AddNode = (A + c1) <-- MulVar is A. 15331 // = AddNode * ConstNode <-- current visiting instruction. 15332 // 15333 // If we make this transformation, we will have a common 15334 // multiply (ConstNode * A) that we can save. 15335 if (OtherOp == MulVar) 15336 return true; 15337 15338 // Now check to see if a future expansion will give us a common 15339 // multiply. 
15340 // 15341 // ConstNode = CONST 15342 // AddNode = (A + c1) 15343 // ... = AddNode * ConstNode <-- current visiting instruction. 15344 // ... 15345 // OtherOp = (A + c2) 15346 // Use = OtherOp * ConstNode <-- visiting Use. 15347 // 15348 // If we make this transformation, we will have a common 15349 // multiply (CONST * A) after we also do the same transformation 15350 // to the "t2" instruction. 15351 if (OtherOp->getOpcode() == ISD::ADD && 15352 DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) && 15353 OtherOp->getOperand(0).getNode() == MulVar) 15354 return true; 15355 } 15356 } 15357 15358 // Didn't find a case where this would be profitable. 15359 return false; 15360 } 15361 15362 SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, 15363 unsigned NumStores) { 15364 SmallVector<SDValue, 8> Chains; 15365 SmallPtrSet<const SDNode *, 8> Visited; 15366 SDLoc StoreDL(StoreNodes[0].MemNode); 15367 15368 for (unsigned i = 0; i < NumStores; ++i) { 15369 Visited.insert(StoreNodes[i].MemNode); 15370 } 15371 15372 // don't include nodes that are children or repeated nodes. 15373 for (unsigned i = 0; i < NumStores; ++i) { 15374 if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second) 15375 Chains.push_back(StoreNodes[i].MemNode->getChain()); 15376 } 15377 15378 assert(Chains.size() > 0 && "Chain should have generated a chain"); 15379 return DAG.getTokenFactor(StoreDL, Chains); 15380 } 15381 15382 bool DAGCombiner::MergeStoresOfConstantsOrVecElts( 15383 SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, 15384 bool IsConstantSrc, bool UseVector, bool UseTrunc) { 15385 // Make sure we have something to merge. 15386 if (NumStores < 2) 15387 return false; 15388 15389 // The latest Node in the DAG. 
  SDLoc DL(StoreNodes[0].MemNode);

  TypeSize ElementSizeBits = MemVT.getStoreSizeInBits();
  unsigned SizeInBits = NumStores * ElementSizeBits;
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

  // Pick the merged value type: a wide vector when storing as a vector,
  // otherwise one integer covering all the merged bytes.
  EVT StoreTy;
  if (UseVector) {
    unsigned Elts = NumStores * NumMemElts;
    // Get the type for the merged vector store.
    StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
  } else
    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);

  SDValue StoredVal;
  if (UseVector) {
    if (IsConstantSrc) {
      // Gather the per-store constants into a BUILD_VECTOR / CONCAT_VECTORS.
      SmallVector<SDValue, 8> BuildVector;
      for (unsigned I = 0; I != NumStores; ++I) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
        SDValue Val = St->getValue();
        // If constant is of the wrong type, convert it now.
        if (MemVT != Val.getValueType()) {
          Val = peekThroughBitcasts(Val);
          // Deal with constants of wrong size.
          if (ElementSizeBits != Val.getValueSizeInBits()) {
            EVT IntMemVT =
                EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
            if (isa<ConstantFPSDNode>(Val)) {
              // Not clear how to truncate FP values.
              return false;
            } else if (auto *C = dyn_cast<ConstantSDNode>(Val))
              Val = DAG.getConstant(C->getAPIntValue()
                                        .zextOrTrunc(Val.getValueSizeInBits())
                                        .zextOrTrunc(ElementSizeBits),
                                    SDLoc(C), IntMemVT);
          }
          // Bitcast the correctly-sized value to the element type MemVT.
          Val = DAG.getBitcast(MemVT, Val);
        }
        BuildVector.push_back(Val);
      }
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, BuildVector);
    } else {
      SmallVector<SDValue, 8> Ops;
      for (unsigned i = 0; i < NumStores; ++i) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
        SDValue Val = peekThroughBitcasts(St->getValue());
        // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
        // type MemVT. If the underlying value is not the correct
        // type, but it is an extraction of an appropriate vector we
        // can recast Val to be of the correct type. This may require
        // converting between EXTRACT_VECTOR_ELT and
        // EXTRACT_SUBVECTOR.
        if ((MemVT != Val.getValueType()) &&
            (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
             Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
          EVT MemVTScalarTy = MemVT.getScalarType();
          // We may need to add a bitcast here to get types to line up.
          if (MemVTScalarTy != Val.getValueType().getScalarType()) {
            Val = DAG.getBitcast(MemVT, Val);
          } else {
            unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR
                                            : ISD::EXTRACT_VECTOR_ELT;
            SDValue Vec = Val.getOperand(0);
            SDValue Idx = Val.getOperand(1);
            Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
          }
        }
        Ops.push_back(Val);
      }

      // Build the extracted vector elements back into a vector.
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, Ops);
    }
  } else {
    // We should always use a vector store when merging extracted vector
    // elements, so this path implies a store of constants.
    assert(IsConstantSrc && "Merged vector elements should use vector store");

    APInt StoreInt(SizeInBits, 0);

    // Construct a single integer constant which is made of the smaller
    // constant inputs.
    bool IsLE = DAG.getDataLayout().isLittleEndian();
    for (unsigned i = 0; i < NumStores; ++i) {
      // On little-endian, the lowest-addressed element must end up in the low
      // bits, so visit the stores in reverse while shifting left.
      unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);

      SDValue Val = St->getValue();
      Val = peekThroughBitcasts(Val);
      StoreInt <<= ElementSizeBits;
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
        StoreInt |= C->getAPIntValue()
                        .zextOrTrunc(ElementSizeBits)
                        .zextOrTrunc(SizeInBits);
      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
        StoreInt |= C->getValueAPF()
                        .bitcastToAPInt()
                        .zextOrTrunc(ElementSizeBits)
                        .zextOrTrunc(SizeInBits);
        // If fp truncation is necessary give up for now.
        if (MemVT.getSizeInBits() != ElementSizeBits)
          return false;
      } else {
        llvm_unreachable("Invalid constant element type");
      }
    }

    // Create the new Load and Store operations.
    StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
  }

  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);

  // Make sure we use a trunc store if it's necessary to be legal.
  SDValue NewStore;
  if (!UseTrunc) {
    NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
                            FirstInChain->getPointerInfo(),
                            FirstInChain->getAlignment());
  } else { // Must be realized as a trunc store
    // Widen the constant to the legalized type and store it truncated back to
    // the original memory width.
    EVT LegalizedStoredValTy =
        TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
    unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
    ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
    SDValue ExtendedStoreVal =
        DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
                        LegalizedStoredValTy);
    NewStore = DAG.getTruncStore(
        NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
        FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
        FirstInChain->getAlignment(),
        FirstInChain->getMemOperand()->getFlags());
  }

  // Replace all merged stores with the new store.
  for (unsigned i = 0; i < NumStores; ++i)
    CombineTo(StoreNodes[i].MemNode, NewStore);

  AddToWorklist(NewChain.getNode());
  return true;
}

/// Collect into StoreNodes all stores that could be merged with St, and set
/// RootNode to the common chain ancestor the search started from.
void DAGCombiner::getStoreMergeCandidates(
    StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
    SDNode *&RootNode) {
  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
  EVT MemVT = St->getMemoryVT();

  SDValue Val = peekThroughBitcasts(St->getValue());
  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return;

  // Classify the stored value: merging is only supported for constants,
  // vector extractions, and loads.
  bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val);
  bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
                          Val.getOpcode() == ISD::EXTRACT_SUBVECTOR);
  bool IsLoadSrc = isa<LoadSDNode>(Val);
  BaseIndexOffset LBasePtr;
  // Match on loadbaseptr if relevant.
  EVT LoadVT;
  if (IsLoadSrc) {
    auto *Ld = cast<LoadSDNode>(Val);
    LBasePtr = BaseIndexOffset::match(Ld, DAG);
    LoadVT = Ld->getMemoryVT();
    // Load and store should be the same type.
    if (MemVT != LoadVT)
      return;
    // Loads must only have one use.
    if (!Ld->hasNUsesOfValue(1, 0))
      return;
    // The memory operands must not be volatile/indexed/atomic.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (!Ld->isSimple() || Ld->isIndexed())
      return;
  }
  // Returns true if Other is a store mergeable with St; on success Ptr/Offset
  // describe Other's address relative to BasePtr.
  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                            int64_t &Offset) -> bool {
    // The memory operands must not be volatile/indexed/atomic.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (!Other->isSimple() || Other->isIndexed())
      return false;
    // Don't mix temporal stores with non-temporal stores.
    if (St->isNonTemporal() != Other->isNonTemporal())
      return false;
    SDValue OtherBC = peekThroughBitcasts(Other->getValue());
    // Allow merging constants of different types as integers.
    bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
                                           : Other->getMemoryVT() != MemVT;
    if (IsLoadSrc) {
      if (NoTypeMatch)
        return false;
      // The Load's Base Ptr must also match.
      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherBC)) {
        BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
        if (LoadVT != OtherLd->getMemoryVT())
          return false;
        // Loads must only have one use.
        if (!OtherLd->hasNUsesOfValue(1, 0))
          return false;
        // The memory operands must not be volatile/indexed/atomic.
        // TODO: May be able to relax for unordered atomics (see D66309)
        if (!OtherLd->isSimple() ||
            OtherLd->isIndexed())
          return false;
        // Don't mix temporal loads with non-temporal loads.
        if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
          return false;
        if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
          return false;
      } else
        return false;
    }
    if (IsConstantSrc) {
      if (NoTypeMatch)
        return false;
      if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC)))
        return false;
    }
    if (IsExtractVecSrc) {
      // Do not merge truncated stores here.
      if (Other->isTruncatingStore())
        return false;
      if (!MemVT.bitsEq(OtherBC.getValueType()))
        return false;
      if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
          OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
        return false;
    }
    Ptr = BaseIndexOffset::match(Other, DAG);
    return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
  };

  // Check if the pair of StoreNode and the RootNode already bail out many
  // times which is over the limit in dependence check.
  auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
                                        SDNode *RootNode) -> bool {
    auto RootCount = StoreRootCountMap.find(StoreNode);
    if (RootCount != StoreRootCountMap.end() &&
        RootCount->second.first == RootNode &&
        RootCount->second.second > StoreMergeDependenceLimit)
      return true;
    return false;
  };

  // We are looking for a root node which is an ancestor to all mergable
  // stores. We search up through a load, to our root and then down
  // through all children. For instance we will find Store{1,2,3} if
  // St is Store1, Store2, or Store3 where the root is not a load
  // which is always true for nonvolatile ops. TODO: Expand
  // the search to find all valid candidates through multiple layers of loads.
  //
  // Root
  // |-------|-------|
  // Load    Load    Store3
  // |       |
  // Store1  Store2
  //
  // FIXME: We should be able to climb and
  // descend TokenFactors to find candidates as well.

  RootNode = St->getChain().getNode();

  // Cap the traversal so pathological chain fan-out stays cheap.
  unsigned NumNodesExplored = 0;
  if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
    RootNode = Ldn->getChain().getNode();
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
        for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
          if (I2.getOperandNo() == 0)
            if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
              BaseIndexOffset Ptr;
              int64_t PtrDiff;
              if (CandidateMatch(OtherST, Ptr, PtrDiff) &&
                  !OverLimitInDependenceCheck(OtherST, RootNode))
                StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
            }
  } else
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
      if (I.getOperandNo() == 0)
        if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
          BaseIndexOffset Ptr;
          int64_t PtrDiff;
          if (CandidateMatch(OtherST, Ptr, PtrDiff) &&
              !OverLimitInDependenceCheck(OtherST, RootNode))
            StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
        }
}

// We need to check that merging these stores does not cause a loop in
// the DAG. Any store candidate may depend on another candidate
// indirectly through its operand (we already consider dependencies
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
    SDNode *RootNode) {
  // FIXME: We should be able to truncate a full search of
  // predecessors by doing a BFS and keeping tabs on the originating
  // stores from which worklist nodes come from in a similar way to
  // TokenFactor simplification.

  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 8> Worklist;

  // RootNode is a predecessor to all candidates so we need not search
  // past it. Add RootNode (peeking through TokenFactors). Do not count
  // these towards size check.

  Worklist.push_back(RootNode);
  while (!Worklist.empty()) {
    auto N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue; // Already present in Visited.
    if (N->getOpcode() == ISD::TokenFactor) {
      for (SDValue Op : N->ops())
        Worklist.push_back(Op.getNode());
    }
  }

  // Don't count pruning nodes towards max.
  unsigned int Max = 1024 + Visited.size();
  // Search Ops of store candidates.
  for (unsigned i = 0; i < NumStores; ++i) {
    SDNode *N = StoreNodes[i].MemNode;
    // Of the 4 Store Operands:
    // * Chain (Op 0) -> We have already considered these
    //                   in candidate selection and can be
    //                   safely ignored
    // * Value (Op 1) -> Cycles may happen (e.g. through load chains)
    // * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
    //                     but aren't necessarily from the same base node, so
    //                     cycles possible (e.g. via indexed store).
    // * (Op 3) -> Represents the pre or post-indexing offset (or undef for
    //             non-indexed stores). Not constant on all targets (e.g. ARM)
    //             and so can participate in a cycle.
    for (unsigned j = 1; j < N->getNumOperands(); ++j)
      Worklist.push_back(N->getOperand(j).getNode());
  }
  // Search through DAG. We can stop early if we find a store node.
  for (unsigned i = 0; i < NumStores; ++i)
    if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
                                     Max)) {
      // If the search bailed out, record the StoreNode and RootNode in the
      // StoreRootCountMap. If we have seen the pair many times over a limit,
      // we won't add the StoreNode into StoreNodes set again.
      if (Visited.size() >= Max) {
        auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
        if (RootCount.first == RootNode)
          RootCount.second++;
        else
          RootCount = {RootNode, 1};
      }
      return false;
    }
  return true;
}

bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
  if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
    return false;

  // TODO: Extend this function to merge stores of scalable vectors.
  // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8>
  // store since we know <vscale x 16 x i8> is exactly twice as large as
  // <vscale x 8 x i8>). Until then, bail out for scalable vectors.
  EVT MemVT = St->getMemoryVT();
  if (MemVT.isScalableVector())
    return false;

  int64_t ElementSizeBytes = MemVT.getStoreSize();
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

  if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
    return false;

  bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute(
      Attribute::NoImplicitFloat);

  // This function cannot currently deal with non-byte-sized memory sizes.
  if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
    return false;

  if (!MemVT.isSimple())
    return false;

  // Perform an early exit check. Do not bother looking at stored values that
  // are not constants, loads, or extracted vector elements.
  SDValue StoredVal = peekThroughBitcasts(St->getValue());
  bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
  bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
                       isa<ConstantFPSDNode>(StoredVal);
  bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
                          StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
  // NOTE(review): the non-temporal flags are consumed further down in this
  // function (past this excerpt) when merging load/store pairs.
  bool IsNonTemporalStore = St->isNonTemporal();
  bool IsNonTemporalLoad =
      IsLoadSrc && cast<LoadSDNode>(StoredVal)->isNonTemporal();

  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
    return false;

  SmallVector<MemOpLink, 8> StoreNodes;
  SDNode *RootNode;
  // Find potential store merge candidates by searching through chain sub-DAG.
  getStoreMergeCandidates(St, StoreNodes, RootNode);

  // Check if there is anything to merge.
  if (StoreNodes.size() < 2)
    return false;

  // Sort the memory operands according to their distance from the
  // base pointer.
  llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
    return LHS.OffsetFromBase < RHS.OffsetFromBase;
  });

  // Store Merge attempts to merge the lowest stores. This generally
  // works out as if successful, as the remaining stores are checked
  // after the first collection of stores is merged. However, in the
  // case that a non-mergeable store is found first, e.g., {p[-2],
  // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
  // mergeable cases. To prevent this, we prune such stores from the
  // front of StoreNodes here.

  bool RV = false;
  while (StoreNodes.size() > 1) {
    // Skip leading candidates that have no consecutive successor.
    size_t StartIdx = 0;
    while ((StartIdx + 1 < StoreNodes.size()) &&
           StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
               StoreNodes[StartIdx + 1].OffsetFromBase)
      ++StartIdx;

    // Bail if we don't have enough candidates to merge.
    if (StartIdx + 1 >= StoreNodes.size())
      return RV;

    if (StartIdx)
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);

    // Scan the memory operations on the chain and find the first
    // non-consecutive store memory address.
    unsigned NumConsecutiveStores = 1;
    int64_t StartAddress = StoreNodes[0].OffsetFromBase;
    // Check that the addresses are consecutive starting from the second
    // element in the list of stores.
    for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
      int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
      if (CurrAddress - StartAddress != (ElementSizeBytes * i))
        break;
      NumConsecutiveStores = i + 1;
    }

    if (NumConsecutiveStores < 2) {
      StoreNodes.erase(StoreNodes.begin(),
                       StoreNodes.begin() + NumConsecutiveStores);
      continue;
    }

    // The node with the lowest store address.
    LLVMContext &Context = *DAG.getContext();
    const DataLayout &DL = DAG.getDataLayout();

    // Store the constants into memory as one consecutive store.
    if (IsConstantSrc) {
      while (NumConsecutiveStores >= 2) {
        LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
        unsigned FirstStoreAS = FirstInChain->getAddressSpace();
        unsigned FirstStoreAlign = FirstInChain->getAlignment();
        unsigned LastLegalType = 1;
        unsigned LastLegalVectorType = 1;
        bool LastIntegerTrunc = false;
        bool NonZero = false;
        unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
        // Grow the candidate prefix one store at a time, tracking the largest
        // legal integer merge, truncating-integer merge, and vector merge.
        for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
          StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
          SDValue StoredVal = ST->getValue();
          bool IsElementZero = false;
          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
            IsElementZero = C->isNullValue();
          else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
            IsElementZero = C->getConstantFPValue()->isNullValue();
          if (IsElementZero) {
            if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
              FirstZeroAfterNonZero = i;
          }
          NonZero |= !IsElementZero;

          // Find a legal type for the constant store.
          unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
          EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
          bool IsFast = false;

          // Break early when size is too large to be legal.
          if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
            break;

          if (TLI.isTypeLegal(StoreTy) &&
              TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
              TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                     *FirstInChain->getMemOperand(), &IsFast) &&
              IsFast) {
            LastIntegerTrunc = false;
            LastLegalType = i + 1;
            // Or check whether a truncstore is legal.
          } else if (TLI.getTypeAction(Context, StoreTy) ==
                     TargetLowering::TypePromoteInteger) {
            EVT LegalizedStoredValTy =
                TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
            if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
                TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
                TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                       *FirstInChain->getMemOperand(),
                                       &IsFast) &&
                IsFast) {
              LastIntegerTrunc = true;
              LastLegalType = i + 1;
            }
          }

          // We only use vectors if the constant is known to be zero or the
          // target allows it and the function is not marked with the
          // noimplicitfloat attribute.
          if ((!NonZero ||
               TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
              !NoVectors) {
            // Find a legal type for the vector store.
            unsigned Elts = (i + 1) * NumMemElts;
            EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
            if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
                TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
                TLI.allowsMemoryAccess(
                    Context, DL, Ty, *FirstInChain->getMemOperand(), &IsFast) &&
                IsFast)
              LastLegalVectorType = i + 1;
          }
        }

        bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
        unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;

        // Check if we found a legal integer type that creates a meaningful
        // merge.
        if (NumElem < 2) {
          // We know that candidate stores are in order and of correct
          // shape. While there is no mergeable sequence from the
          // beginning one may start later in the sequence. The only
          // reason a merge of size N could have failed where another of
          // the same size would not have, is if the alignment has
          // improved or we've dropped a non-zero value. Drop as many
          // candidates as we can here.
          unsigned NumSkip = 1;
          while (
              (NumSkip < NumConsecutiveStores) &&
              (NumSkip < FirstZeroAfterNonZero) &&
              (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
            NumSkip++;

          StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
          NumConsecutiveStores -= NumSkip;
          continue;
        }

        // Check that we can merge these candidates without causing a cycle.
        if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
                                                      RootNode)) {
          StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
          NumConsecutiveStores -= NumElem;
          continue;
        }

        RV |= MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, true,
                                              UseVector, LastIntegerTrunc);

        // Remove merged stores for next iteration.
        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
        NumConsecutiveStores -= NumElem;
      }
      continue;
    }

    // When extracting multiple vector elements, try to store them
    // in one vector store rather than a sequence of scalar stores.
    if (IsExtractVecSrc) {
      // Loop on Consecutive Stores on success.
      while (NumConsecutiveStores >= 2) {
        LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
        unsigned FirstStoreAS = FirstInChain->getAddressSpace();
        unsigned FirstStoreAlign = FirstInChain->getAlignment();
        unsigned NumStoresToMerge = 1;
        for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
          // Find a legal type for the vector store.
          unsigned Elts = (i + 1) * NumMemElts;
          EVT Ty =
              EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
          bool IsFast;

          // Break early when size is too large to be legal.
16001 if (Ty.getSizeInBits() > MaximumLegalStoreInBits) 16002 break; 16003 16004 if (TLI.isTypeLegal(Ty) && 16005 TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && 16006 TLI.allowsMemoryAccess(Context, DL, Ty, 16007 *FirstInChain->getMemOperand(), &IsFast) && 16008 IsFast) 16009 NumStoresToMerge = i + 1; 16010 } 16011 16012 // Check if we found a legal integer type creating a meaningful 16013 // merge. 16014 if (NumStoresToMerge < 2) { 16015 // We know that candidate stores are in order and of correct 16016 // shape. While there is no mergeable sequence from the 16017 // beginning one may start later in the sequence. The only 16018 // reason a merge of size N could have failed where another of 16019 // the same size would not have, is if the alignment has 16020 // improved. Drop as many candidates as we can here. 16021 unsigned NumSkip = 1; 16022 while ( 16023 (NumSkip < NumConsecutiveStores) && 16024 (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) 16025 NumSkip++; 16026 16027 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); 16028 NumConsecutiveStores -= NumSkip; 16029 continue; 16030 } 16031 16032 // Check that we can merge these candidates without causing a cycle. 16033 if (!checkMergeStoreCandidatesForDependencies( 16034 StoreNodes, NumStoresToMerge, RootNode)) { 16035 StoreNodes.erase(StoreNodes.begin(), 16036 StoreNodes.begin() + NumStoresToMerge); 16037 NumConsecutiveStores -= NumStoresToMerge; 16038 continue; 16039 } 16040 16041 RV |= MergeStoresOfConstantsOrVecElts( 16042 StoreNodes, MemVT, NumStoresToMerge, false, true, false); 16043 16044 StoreNodes.erase(StoreNodes.begin(), 16045 StoreNodes.begin() + NumStoresToMerge); 16046 NumConsecutiveStores -= NumStoresToMerge; 16047 } 16048 continue; 16049 } 16050 16051 // Below we handle the case of multiple consecutive stores that 16052 // come from multiple consecutive loads. We merge them into a single 16053 // wide load and a single wide store. 
16054 16055 // Look for load nodes which are used by the stored values. 16056 SmallVector<MemOpLink, 8> LoadNodes; 16057 16058 // Find acceptable loads. Loads need to have the same chain (token factor), 16059 // must not be zext, volatile, indexed, and they must be consecutive. 16060 BaseIndexOffset LdBasePtr; 16061 16062 for (unsigned i = 0; i < NumConsecutiveStores; ++i) { 16063 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); 16064 SDValue Val = peekThroughBitcasts(St->getValue()); 16065 LoadSDNode *Ld = cast<LoadSDNode>(Val); 16066 16067 BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG); 16068 // If this is not the first ptr that we check. 16069 int64_t LdOffset = 0; 16070 if (LdBasePtr.getBase().getNode()) { 16071 // The base ptr must be the same. 16072 if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) 16073 break; 16074 } else { 16075 // Check that all other base pointers are the same as this one. 16076 LdBasePtr = LdPtr; 16077 } 16078 16079 // We found a potential memory operand to merge. 16080 LoadNodes.push_back(MemOpLink(Ld, LdOffset)); 16081 } 16082 16083 while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) { 16084 // If we have load/store pair instructions and we only have two values, 16085 // don't bother merging. 
16086 unsigned RequiredAlignment; 16087 if (LoadNodes.size() == 2 && 16088 TLI.hasPairedLoad(MemVT, RequiredAlignment) && 16089 StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) { 16090 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); 16091 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2); 16092 break; 16093 } 16094 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; 16095 unsigned FirstStoreAS = FirstInChain->getAddressSpace(); 16096 unsigned FirstStoreAlign = FirstInChain->getAlignment(); 16097 LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); 16098 unsigned FirstLoadAlign = FirstLoad->getAlignment(); 16099 16100 // Scan the memory operations on the chain and find the first 16101 // non-consecutive load memory address. These variables hold the index in 16102 // the store node array. 16103 16104 unsigned LastConsecutiveLoad = 1; 16105 16106 // This variable refers to the size and not index in the array. 16107 unsigned LastLegalVectorType = 1; 16108 unsigned LastLegalIntegerType = 1; 16109 bool isDereferenceable = true; 16110 bool DoIntegerTruncate = false; 16111 StartAddress = LoadNodes[0].OffsetFromBase; 16112 SDValue FirstChain = FirstLoad->getChain(); 16113 for (unsigned i = 1; i < LoadNodes.size(); ++i) { 16114 // All loads must share the same chain. 16115 if (LoadNodes[i].MemNode->getChain() != FirstChain) 16116 break; 16117 16118 int64_t CurrAddress = LoadNodes[i].OffsetFromBase; 16119 if (CurrAddress - StartAddress != (ElementSizeBytes * i)) 16120 break; 16121 LastConsecutiveLoad = i; 16122 16123 if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) 16124 isDereferenceable = false; 16125 16126 // Find a legal type for the vector store. 16127 unsigned Elts = (i + 1) * NumMemElts; 16128 EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); 16129 16130 // Break early when size is too large to be legal. 
16131 if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) 16132 break; 16133 16134 bool IsFastSt, IsFastLd; 16135 if (TLI.isTypeLegal(StoreTy) && 16136 TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && 16137 TLI.allowsMemoryAccess(Context, DL, StoreTy, 16138 *FirstInChain->getMemOperand(), &IsFastSt) && 16139 IsFastSt && 16140 TLI.allowsMemoryAccess(Context, DL, StoreTy, 16141 *FirstLoad->getMemOperand(), &IsFastLd) && 16142 IsFastLd) { 16143 LastLegalVectorType = i + 1; 16144 } 16145 16146 // Find a legal type for the integer store. 16147 unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; 16148 StoreTy = EVT::getIntegerVT(Context, SizeInBits); 16149 if (TLI.isTypeLegal(StoreTy) && 16150 TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && 16151 TLI.allowsMemoryAccess(Context, DL, StoreTy, 16152 *FirstInChain->getMemOperand(), &IsFastSt) && 16153 IsFastSt && 16154 TLI.allowsMemoryAccess(Context, DL, StoreTy, 16155 *FirstLoad->getMemOperand(), &IsFastLd) && 16156 IsFastLd) { 16157 LastLegalIntegerType = i + 1; 16158 DoIntegerTruncate = false; 16159 // Or check whether a truncstore and extload is legal. 
16160 } else if (TLI.getTypeAction(Context, StoreTy) == 16161 TargetLowering::TypePromoteInteger) { 16162 EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy); 16163 if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && 16164 TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && 16165 TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, 16166 StoreTy) && 16167 TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, 16168 StoreTy) && 16169 TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) && 16170 TLI.allowsMemoryAccess(Context, DL, StoreTy, 16171 *FirstInChain->getMemOperand(), 16172 &IsFastSt) && 16173 IsFastSt && 16174 TLI.allowsMemoryAccess(Context, DL, StoreTy, 16175 *FirstLoad->getMemOperand(), &IsFastLd) && 16176 IsFastLd) { 16177 LastLegalIntegerType = i + 1; 16178 DoIntegerTruncate = true; 16179 } 16180 } 16181 } 16182 16183 // Only use vector types if the vector type is larger than the integer 16184 // type. If they are the same, use integers. 16185 bool UseVectorTy = 16186 LastLegalVectorType > LastLegalIntegerType && !NoVectors; 16187 unsigned LastLegalType = 16188 std::max(LastLegalVectorType, LastLegalIntegerType); 16189 16190 // We add +1 here because the LastXXX variables refer to location while 16191 // the NumElem refers to array/index size. 16192 unsigned NumElem = 16193 std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); 16194 NumElem = std::min(LastLegalType, NumElem); 16195 16196 if (NumElem < 2) { 16197 // We know that candidate stores are in order and of correct 16198 // shape. While there is no mergeable sequence from the 16199 // beginning one may start later in the sequence. The only 16200 // reason a merge of size N could have failed where another of 16201 // the same size would not have is if the alignment or either 16202 // the load or store has improved. Drop as many candidates as we 16203 // can here. 
16204 unsigned NumSkip = 1; 16205 while ((NumSkip < LoadNodes.size()) && 16206 (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && 16207 (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) 16208 NumSkip++; 16209 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); 16210 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip); 16211 NumConsecutiveStores -= NumSkip; 16212 continue; 16213 } 16214 16215 // Check that we can merge these candidates without causing a cycle. 16216 if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, 16217 RootNode)) { 16218 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); 16219 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); 16220 NumConsecutiveStores -= NumElem; 16221 continue; 16222 } 16223 16224 // Find if it is better to use vectors or integers to load and store 16225 // to memory. 16226 EVT JointMemOpVT; 16227 if (UseVectorTy) { 16228 // Find a legal type for the vector store. 16229 unsigned Elts = NumElem * NumMemElts; 16230 JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); 16231 } else { 16232 unsigned SizeInBits = NumElem * ElementSizeBytes * 8; 16233 JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); 16234 } 16235 16236 SDLoc LoadDL(LoadNodes[0].MemNode); 16237 SDLoc StoreDL(StoreNodes[0].MemNode); 16238 16239 // The merged loads are required to have the same incoming chain, so 16240 // using the first's chain is acceptable. 16241 16242 SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); 16243 AddToWorklist(NewStoreChain.getNode()); 16244 16245 MachineMemOperand::Flags LdMMOFlags = 16246 isDereferenceable ? MachineMemOperand::MODereferenceable 16247 : MachineMemOperand::MONone; 16248 if (IsNonTemporalLoad) 16249 LdMMOFlags |= MachineMemOperand::MONonTemporal; 16250 16251 MachineMemOperand::Flags StMMOFlags = 16252 IsNonTemporalStore ? 
MachineMemOperand::MONonTemporal 16253 : MachineMemOperand::MONone; 16254 16255 SDValue NewLoad, NewStore; 16256 if (UseVectorTy || !DoIntegerTruncate) { 16257 NewLoad = 16258 DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), 16259 FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), 16260 FirstLoadAlign, LdMMOFlags); 16261 NewStore = DAG.getStore( 16262 NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), 16263 FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); 16264 } else { // This must be the truncstore/extload case 16265 EVT ExtendedTy = 16266 TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); 16267 NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, 16268 FirstLoad->getChain(), FirstLoad->getBasePtr(), 16269 FirstLoad->getPointerInfo(), JointMemOpVT, 16270 FirstLoadAlign, LdMMOFlags); 16271 NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, 16272 FirstInChain->getBasePtr(), 16273 FirstInChain->getPointerInfo(), 16274 JointMemOpVT, FirstInChain->getAlignment(), 16275 FirstInChain->getMemOperand()->getFlags()); 16276 } 16277 16278 // Transfer chain users from old loads to the new load. 16279 for (unsigned i = 0; i < NumElem; ++i) { 16280 LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); 16281 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), 16282 SDValue(NewLoad.getNode(), 1)); 16283 } 16284 16285 // Replace the all stores with the new store. Recursively remove 16286 // corresponding value if its no longer used. 
      for (unsigned i = 0; i < NumElem; ++i) {
        SDValue Val = StoreNodes[i].MemNode->getOperand(1);
        CombineTo(StoreNodes[i].MemNode, NewStore);
        // Replacing a merged store may leave its stored value dead; delete
        // it (and any operands that become dead in turn) eagerly.
        if (Val.getNode()->use_empty())
          recursivelyDeleteUnusedNodes(Val.getNode());
      }

      RV = true;
      // Drop the just-merged stores and their matching loads, then keep
      // scanning the remaining consecutive candidates.
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
      NumConsecutiveStores -= NumElem;
    }
  }
  return RV;
}

/// Replace the chain operand of \p ST with \p BetterChain while keeping the
/// old chain reachable through a TokenFactor, so no chain dependency is lost.
SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
  SDLoc SL(ST);
  SDValue ReplStore;

  // Replace the chain to avoid dependency.
  if (ST->isTruncatingStore()) {
    ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(),
                                  ST->getBasePtr(), ST->getMemoryVT(),
                                  ST->getMemOperand());
  } else {
    ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
                             ST->getMemOperand());
  }

  // Create token to keep both nodes around.
  SDValue Token = DAG.getNode(ISD::TokenFactor, SL,
                              MVT::Other, ST->getChain(), ReplStore);

  // Make sure the new and old chains are cleaned up.
  AddToWorklist(Token.getNode());

  // Don't add users to work list.
  return CombineTo(ST, Token, false);
}

/// Try to rewrite a store of a floating-point constant as a store of the
/// equivalent integer bit pattern. Returns the replacement store (or chain of
/// stores), or an empty SDValue when the transform does not apply.
SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
  SDValue Value = ST->getValue();
  // Target-specific FP constants must be left untouched.
  if (Value.getOpcode() == ISD::TargetConstantFP)
    return SDValue();

  // Only plain (non-truncating, non-indexed) stores are handled.
  if (!ISD::isNormalStore(ST))
    return SDValue();

  SDLoc DL(ST);

  SDValue Chain = ST->getChain();
  SDValue Ptr = ST->getBasePtr();

  const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);

  // NOTE: If the original store is volatile, this transform must not increase
For example, on x86-32 an f64 can be stored in one 16345 // processor operation but an i64 (which is not legal) requires two. So the 16346 // transform should not be done in this case. 16347 16348 SDValue Tmp; 16349 switch (CFP->getSimpleValueType(0).SimpleTy) { 16350 default: 16351 llvm_unreachable("Unknown FP type"); 16352 case MVT::f16: // We don't do this for these yet. 16353 case MVT::f80: 16354 case MVT::f128: 16355 case MVT::ppcf128: 16356 return SDValue(); 16357 case MVT::f32: 16358 if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) || 16359 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { 16360 ; 16361 Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). 16362 bitcastToAPInt().getZExtValue(), SDLoc(CFP), 16363 MVT::i32); 16364 return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); 16365 } 16366 16367 return SDValue(); 16368 case MVT::f64: 16369 if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && 16370 ST->isSimple()) || 16371 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { 16372 ; 16373 Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). 16374 getZExtValue(), SDLoc(CFP), MVT::i64); 16375 return DAG.getStore(Chain, DL, Tmp, 16376 Ptr, ST->getMemOperand()); 16377 } 16378 16379 if (ST->isSimple() && 16380 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { 16381 // Many FP stores are not made apparent until after legalize, e.g. for 16382 // argument passing. Since this is so common, custom legalize the 16383 // 64-bit integer store into two 32-bit stores. 
16384 uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); 16385 SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); 16386 SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); 16387 if (DAG.getDataLayout().isBigEndian()) 16388 std::swap(Lo, Hi); 16389 16390 unsigned Alignment = ST->getAlignment(); 16391 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); 16392 AAMDNodes AAInfo = ST->getAAInfo(); 16393 16394 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), 16395 ST->getAlignment(), MMOFlags, AAInfo); 16396 Ptr = DAG.getMemBasePlusOffset(Ptr, 4, DL); 16397 Alignment = MinAlign(Alignment, 4U); 16398 SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, 16399 ST->getPointerInfo().getWithOffset(4), 16400 Alignment, MMOFlags, AAInfo); 16401 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 16402 St0, St1); 16403 } 16404 16405 return SDValue(); 16406 } 16407 } 16408 16409 SDValue DAGCombiner::visitSTORE(SDNode *N) { 16410 StoreSDNode *ST = cast<StoreSDNode>(N); 16411 SDValue Chain = ST->getChain(); 16412 SDValue Value = ST->getValue(); 16413 SDValue Ptr = ST->getBasePtr(); 16414 16415 // If this is a store of a bit convert, store the input value if the 16416 // resultant store does not need a higher alignment than the original. 16417 if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() && 16418 ST->isUnindexed()) { 16419 EVT SVT = Value.getOperand(0).getValueType(); 16420 // If the store is volatile, we only want to change the store type if the 16421 // resulting store is legal. Otherwise we might increase the number of 16422 // memory accesses. We don't care if the original type was legal or not 16423 // as we assume software couldn't rely on the number of accesses of an 16424 // illegal type. 
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (((!LegalOperations && ST->isSimple()) ||
         TLI.isOperationLegal(ISD::STORE, SVT)) &&
        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
                                     DAG, *ST->getMemOperand())) {
      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                          ST->getMemOperand());
    }
  }

  // Turn 'store undef, Ptr' -> nothing.
  if (Value.isUndef() && ST->isUnindexed())
    return Chain;

  // Try to infer better alignment information than the store already has.
  if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
      if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) {
        SDValue NewStore =
            DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
                              ST->getMemoryVT(), Align,
                              ST->getMemOperand()->getFlags(), ST->getAAInfo());
        // NewStore will always be N as we are only refining the alignment
        assert(NewStore.getNode() == N);
        (void)NewStore;
      }
    }
  }

  // Try transforming a pair floating point load / store ops to integer
  // load / store ops.
  if (SDValue NewST = TransformFPLoadStorePair(N))
    return NewST;

  // Try transforming several stores into STORE (BSWAP).
  if (SDValue Store = MatchStoreCombine(ST))
    return Store;

  if (ST->isUnindexed()) {
    // Walk up chain skipping non-aliasing memory nodes, on this store and any
    // adjacent stores.
    if (findBetterNeighborChains(ST)) {
      // replaceStoreChain uses CombineTo, which handled all of the worklist
      // manipulation. Return the original node to not do anything else.
      return SDValue(ST, 0);
    }
    // findBetterNeighborChains may have updated the chain; re-read it.
    Chain = ST->getChain();
  }

  // FIXME: is there such a thing as a truncating indexed store?
  if (ST->isTruncatingStore() && ST->isUnindexed() &&
      Value.getValueType().isInteger() &&
      (!isa<ConstantSDNode>(Value) ||
       !cast<ConstantSDNode>(Value)->isOpaque())) {
    // Mask of the bits that actually reach memory through the truncstore.
    APInt TruncDemandedBits =
        APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
                             ST->getMemoryVT().getScalarSizeInBits());

    // See if we can simplify the input to this truncstore with knowledge that
    // only the low bits are being used.  For example:
    // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
    AddToWorklist(Value.getNode());
    if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
      return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
                               ST->getMemOperand());

    // Otherwise, see if we can simplify the operation with
    // SimplifyDemandedBits, which only works if the value has a single use.
    if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
      // Re-visit the store if anything changed and the store hasn't been merged
      // with another node (N is deleted) SimplifyDemandedBits will add Value's
      // node back to the worklist if necessary, but we also need to re-visit
      // the Store node itself.
      if (N->getOpcode() != ISD::DELETED_NODE)
        AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // If this is a load followed by a store to the same location, then the store
  // is dead/noop.
  // TODO: Can relax for unordered atomics (see D66309)
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
    if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
        ST->isUnindexed() && ST->isSimple() &&
        // There can't be any side effects between the load and store, such as
        // a call or store.
        Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
      // The store is dead, remove it.
      return Chain;
    }
  }

  // TODO: Can relax for unordered atomics (see D66309)
  if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
    if (ST->isUnindexed() && ST->isSimple() &&
        ST1->isUnindexed() && ST1->isSimple()) {
      if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
          ST->getMemoryVT() == ST1->getMemoryVT()) {
        // If this is a store followed by a store with the same value to the
        // same location, then the store is dead/noop.
        return Chain;
      }

      if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
          !ST1->getBasePtr().isUndef()) {
        const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
        const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
        unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
        unsigned ChainBitSize = ST1->getMemoryVT().getSizeInBits();
        // If this is a store who's preceding store to a subset of the current
        // location and no one other node is chained to that store we can
        // effectively drop the store. Do not remove stores to undef as they may
        // be used as data sinks.
        if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
          CombineTo(ST1, ST1->getChain());
          return SDValue();
        }

        // If ST stores to a subset of preceding store's write set, we may be
        // able to fold ST's value into the preceding stored value. As we know
        // the other uses of ST1's chain are unconcerned with ST, this folding
        // will not affect those nodes.
        int64_t BitOffset;
        if (ChainBase.contains(DAG, ChainBitSize, STBase, STBitSize,
                               BitOffset)) {
          SDValue ChainValue = ST1->getValue();
          if (auto *C1 = dyn_cast<ConstantSDNode>(ChainValue)) {
            if (auto *C = dyn_cast<ConstantSDNode>(Value)) {
              APInt Val = C1->getAPIntValue();
              APInt InsertVal = C->getAPIntValue().zextOrTrunc(STBitSize);
              // FIXME: Handle Big-endian mode.
              if (!DAG.getDataLayout().isBigEndian()) {
                // Splice ST's constant bits into ST1's constant and let ST1
                // perform the combined store; ST itself becomes redundant.
                Val.insertBits(InsertVal, BitOffset);
                SDValue NewSDVal =
                    DAG.getConstant(Val, SDLoc(C), ChainValue.getValueType(),
                                    C1->isTargetOpcode(), C1->isOpaque());
                SDNode *NewST1 = DAG.UpdateNodeOperands(
                    ST1, ST1->getChain(), NewSDVal, ST1->getOperand(2),
                    ST1->getOperand(3));
                return CombineTo(ST, SDValue(NewST1, 0));
              }
            }
          }
        } // End ST subset of ST1 case.
      }
    }
  }

  // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
  // truncating store.  We can do this even if this is already a truncstore.
  if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
      && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                            ST->getMemoryVT())) {
    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
                             Ptr, ST->getMemoryVT(), ST->getMemOperand());
  }

  // Always perform this optimization before types are legal. If the target
  // prefers, also try this after legalization to catch stores that were created
  // by intrinsics or other nodes.
  if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
    while (true) {
      // There can be multiple store sequences on the same chain.
      // Keep trying to merge store sequences until we are unable to do so
      // or until we merge the last store on the chain.
      bool Changed = MergeConsecutiveStores(ST);
      if (!Changed) break;
      // Return N as merge only uses CombineTo and no worklist clean
      // up is necessary.
      if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
        return SDValue(N, 0);
    }
  }

  // Try transforming N to an indexed store.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
  //
  // Make sure to do this only after attempting to merge stores in order to
  // avoid changing the types of some subset of stores due to visit order,
  // preventing their merging.
  if (isa<ConstantFPSDNode>(ST->getValue())) {
    if (SDValue NewSt = replaceStoreOfFPConstant(ST))
      return NewSt;
  }

  if (SDValue NewSt = splitMergedValStore(ST))
    return NewSt;

  return ReduceLoadOpStoreWidth(N);
}

/// Delete a store that is provably dead because the stored object's lifetime
/// ends (LIFETIME_END) immediately after it, with no intervening use of the
/// chain.
SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
  const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
  if (!LifetimeEnd->hasOffset())
    return SDValue();

  const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
                                        LifetimeEnd->getOffset(), false);

  // We walk up the chains to find stores.
  SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
  while (!Chains.empty()) {
    SDValue Chain = Chains.back();
    Chains.pop_back();
    // Only single-use chains can be bypassed safely.
    if (!Chain.hasOneUse())
      continue;
    switch (Chain.getOpcode()) {
    case ISD::TokenFactor:
      for (unsigned Nops = Chain.getNumOperands(); Nops;)
        Chains.push_back(Chain.getOperand(--Nops));
      break;
    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END:
      // We can forward past any lifetime start/end that can be proven not to
      // alias the node.
      if (!isAlias(Chain.getNode(), N))
        Chains.push_back(Chain.getOperand(0));
      break;
    case ISD::STORE: {
      // NOTE(review): the dyn_cast result is used unchecked; inside
      // `case ISD::STORE` it cannot fail, so `cast<>` would state the
      // intent more clearly.
      StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
      // TODO: Can relax for unordered atomics (see D66309)
      if (!ST->isSimple() || ST->isIndexed())
        continue;
      const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
      // If we store purely within object bounds just before its lifetime ends,
      // we can remove the store.
      if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
                                   ST->getMemoryVT().getStoreSizeInBits())) {
        LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
                   dbgs() << "\nwithin LIFETIME_END of : ";
                   LifetimeEndBase.dump(); dbgs() << "\n");
        CombineTo(ST, ST->getChain());
        return SDValue(N, 0);
      }
    }
    // Any other chain producer terminates this branch of the walk.
    }
  }
  return SDValue();
}

/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
/// (store (or (zext (bitcast F to i32) to i64),
///            (shl (zext I to i64), 32)), addr)  -->
/// (store F, addr) and (store I, addr+4)
///
/// Similarly, splitting for other merged store can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8},  i32 store --> two i16 stores.
/// For pair of {i8, i8},   i16 store --> two i8 stores.
///
/// We allow each target to determine specifically which kind of splitting is
/// supported.
///
/// The store patterns are commonly seen from the simple code snippet below
/// if only std::make_pair(...) is SROA-transformed before being inlined into
/// hoo:
///   void goo(const std::pair<int, float> &);
///   hoo() {
///     ...
///     goo(std::make_pair(tmp, ftmp));
///     ...
///   }
///
SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
  // Splitting one store into two is a size/throughput trade-off; only do it
  // when optimizing.
  if (OptLevel == CodeGenOpt::None)
    return SDValue();

  // Can't change the number of memory accesses for a volatile store or break
  // atomicity for an atomic one.
  if (!ST->isSimple())
    return SDValue();

  SDValue Val = ST->getValue();
  SDLoc DL(ST);

  // Match OR operand.
  if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
    return SDValue();

  // Match SHL operand and get Lower and Higher parts of Val.
  // After this, Op1 is the shl (producing the high half) and Op2 the low half.
  SDValue Op1 = Val.getOperand(0);
  SDValue Op2 = Val.getOperand(1);
  SDValue Lo, Hi;
  if (Op1.getOpcode() != ISD::SHL) {
    std::swap(Op1, Op2);
    if (Op1.getOpcode() != ISD::SHL)
      return SDValue();
  }
  Lo = Op2;
  Hi = Op1.getOperand(0);
  // The shl must have no other users; otherwise the merged (or/shl) form
  // stays live anyway and splitting gains nothing.
  if (!Op1.hasOneUse())
    return SDValue();

  // Match shift amount to HalfValBitSize.
  unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
  if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
    return SDValue();

  // Lo and Hi are zero-extended from int with size less equal than 32
  // to i64.
  if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
      !Lo.getOperand(0).getValueType().isScalarInteger() ||
      Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
      Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
      !Hi.getOperand(0).getValueType().isScalarInteger() ||
      Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
    return SDValue();

  // Use the EVT of low and high parts before bitcast as the input
  // of target query.
  EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
                  ? Lo.getOperand(0).getValueType()
                  : Lo.getValueType();
  EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
                   ? Hi.getOperand(0).getValueType()
                   : Hi.getValueType();
  if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
    return SDValue();

  // Start to split store.
  unsigned Alignment = ST->getAlignment();
  MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
  AAMDNodes AAInfo = ST->getAAInfo();

  // Change the sizes of Lo and Hi's value types to HalfValBitSize.
  EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
  Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));

  SDValue Chain = ST->getChain();
  SDValue Ptr = ST->getBasePtr();
  // Lower value store.
  SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
                             ST->getAlignment(), MMOFlags, AAInfo);
  // Advance the pointer by half the original store size for the upper half.
  Ptr = DAG.getMemBasePlusOffset(Ptr, HalfValBitSize / 8, DL);
  // Higher value store.
16772 SDValue St1 = 16773 DAG.getStore(St0, DL, Hi, Ptr, 16774 ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), 16775 Alignment / 2, MMOFlags, AAInfo); 16776 return St1; 16777 } 16778 16779 /// Convert a disguised subvector insertion into a shuffle: 16780 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { 16781 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && 16782 "Expected extract_vector_elt"); 16783 SDValue InsertVal = N->getOperand(1); 16784 SDValue Vec = N->getOperand(0); 16785 16786 // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), 16787 // InsIndex) 16788 // --> (vector_shuffle X, Y) and variations where shuffle operands may be 16789 // CONCAT_VECTORS. 16790 if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() && 16791 InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 16792 isa<ConstantSDNode>(InsertVal.getOperand(1))) { 16793 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode()); 16794 ArrayRef<int> Mask = SVN->getMask(); 16795 16796 SDValue X = Vec.getOperand(0); 16797 SDValue Y = Vec.getOperand(1); 16798 16799 // Vec's operand 0 is using indices from 0 to N-1 and 16800 // operand 1 from N to 2N - 1, where N is the number of 16801 // elements in the vectors. 16802 SDValue InsertVal0 = InsertVal.getOperand(0); 16803 int ElementOffset = -1; 16804 16805 // We explore the inputs of the shuffle in order to see if we find the 16806 // source of the extract_vector_elt. If so, we can use it to modify the 16807 // shuffle rather than perform an insert_vector_elt. 
    // Worklist of (lane offset, shuffle input) pairs; Y's lanes start at
    // Mask.size(), X's at 0. Processed LIFO via pop_back_val.
    SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
    ArgWorkList.emplace_back(Mask.size(), Y);
    ArgWorkList.emplace_back(0, X);

    while (!ArgWorkList.empty()) {
      int ArgOffset;
      SDValue ArgVal;
      std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();

      if (ArgVal == InsertVal0) {
        ElementOffset = ArgOffset;
        break;
      }

      // Peek through concat_vector.
      if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
        // Walk the concat operands in reverse so each one is pushed with the
        // lane offset at which it begins inside the concatenation.
        int CurrentArgOffset =
            ArgOffset + ArgVal.getValueType().getVectorNumElements();
        int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
        for (SDValue Op : reverse(ArgVal->ops())) {
          CurrentArgOffset -= Step;
          ArgWorkList.emplace_back(CurrentArgOffset, Op);
        }

        // Make sure we went through all the elements and did not screw up index
        // computation.
        assert(CurrentArgOffset == ArgOffset);
      }
    }

    if (ElementOffset != -1) {
      SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());

      // Redirect the inserted lane of the mask at the element we found.
      auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
      NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
      assert(NewMask[InsIndex] <
                 (int)(2 * Vec.getValueType().getVectorNumElements()) &&
             NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound");

      SDValue LegalShuffle =
              TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X,
                                          Y, NewMask, DAG);
      if (LegalShuffle)
        return LegalShuffle;
    }
  }

  // insert_vector_elt V, (bitcast X from vector type), IdxC -->
  // bitcast(shuffle (bitcast V), (extended X), Mask)
  // Note: We do not use an insert_subvector node because that requires a
  // legal subvector type.
  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
      !InsertVal.getOperand(0).getValueType().isVector())
    return SDValue();

  SDValue SubVec = InsertVal.getOperand(0);
  SDValue DestVec = N->getOperand(0);
  EVT SubVecVT = SubVec.getValueType();
  EVT VT = DestVec.getValueType();
  unsigned NumSrcElts = SubVecVT.getVectorNumElements();
  unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
  unsigned NumMaskVals = ExtendRatio * NumSrcElts;

  // Step 1: Create a shuffle mask that implements this insert operation. The
  // vector that we are inserting into will be operand 0 of the shuffle, so
  // those elements are just 'i'. The inserted subvector is in the first
  // positions of operand 1 of the shuffle. Example:
  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
  SmallVector<int, 16> Mask(NumMaskVals);
  for (unsigned i = 0; i != NumMaskVals; ++i) {
    if (i / NumSrcElts == InsIndex)
      Mask[i] = (i % NumSrcElts) + NumMaskVals;
    else
      Mask[i] = i;
  }

  // Bail out if the target can not handle the shuffle we want to create.
  EVT SubVecEltVT = SubVecVT.getVectorElementType();
  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
  if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
    return SDValue();

  // Step 2: Create a wide vector from the inserted source vector by appending
  // undefined elements. This is the same size as our destination vector.
  SDLoc DL(N);
  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
  ConcatOps[0] = SubVec;
  SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);

  // Step 3: Shuffle in the padded subvector.
  SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
  SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
  AddToWorklist(PaddedSubV.getNode());
  AddToWorklist(DestVecBC.getNode());
  AddToWorklist(Shuf.getNode());
  // Cast the shuffled result back to the original destination type.
  return DAG.getBitcast(VT, Shuf);
}

// Combine an INSERT_VECTOR_ELT node: fold redundant/undef inserts, splat
// variable-index inserts into undef, canonicalize insert chains, and fold
// inserts into BUILD_VECTOR/shuffles where possible.
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
  SDValue InVec = N->getOperand(0);
  SDValue InVal = N->getOperand(1);
  SDValue EltNo = N->getOperand(2);
  SDLoc DL(N);

  EVT VT = InVec.getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);

  // Insert into out-of-bounds element is undefined.
  if (IndexC && IndexC->getZExtValue() >= VT.getVectorNumElements())
    return DAG.getUNDEF(VT);

  // Remove redundant insertions:
  // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
    return InVec;

  if (!IndexC) {
    // If this is variable insert to undef vector, it might be better to splat:
    // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
    if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
      SmallVector<SDValue, 8> Ops(NumElts, InVal);
      return DAG.getBuildVector(VT, DL, Ops);
    }
    // No other folds apply to a variable index.
    return SDValue();
  }

  // We must know which element is being inserted for folds below here.
  unsigned Elt = IndexC->getZExtValue();
  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
    return Shuf;

  // Canonicalize insert_vector_elt dag nodes.
  // Example:
  // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
  // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
  //
  // Do this only if the child insert_vector node has one use; also
  // do this only if indices are both constants and Idx1 < Idx0.
  if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
      && isa<ConstantSDNode>(InVec.getOperand(2))) {
    unsigned OtherElt = InVec.getConstantOperandVal(2);
    if (Elt < OtherElt) {
      // Swap nodes.
      SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
                                  InVec.getOperand(0), InVal, EltNo);
      AddToWorklist(NewOp.getNode());
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
                         VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
    }
  }

  // If we can't generate a legal BUILD_VECTOR, exit
  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
    return SDValue();

  // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
  // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
  // vector elements.
  SmallVector<SDValue, 8> Ops;
  // Do not combine these two vectors if the output vector will not replace
  // the input vector.
  if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
    Ops.append(InVec.getNode()->op_begin(),
               InVec.getNode()->op_end());
  } else if (InVec.isUndef()) {
    // An undef vector acts as a BUILD_VECTOR of all-undef elements.
    Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
  } else {
    return SDValue();
  }
  assert(Ops.size() == NumElts && "Unexpected vector size");

  // Insert the element
  if (Elt < Ops.size()) {
    // All the operands of BUILD_VECTOR must have the same type;
    // we enforce that here.
    EVT OpVT = Ops[0].getValueType();
    Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
  }

  // Return the new vector
  return DAG.getBuildVector(VT, DL, Ops);
}

// Replace (extract_vector_elt (load)) with a narrow scalar load of just the
// extracted element. Caller guarantees the load is simple (non-volatile,
// non-atomic) — see the assert.
SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
                                                 SDValue EltNo,
                                                 LoadSDNode *OriginalLoad) {
  assert(OriginalLoad->isSimple());

  EVT ResultVT = EVE->getValueType(0);
  EVT VecEltVT = InVecVT.getVectorElementType();
  unsigned Align = OriginalLoad->getAlignment();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      VecEltVT.getTypeForEVT(*DAG.getContext()));

  // Bail if the element load would require more alignment than the original
  // load provides, or scalar loads of this type aren't supported.
  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
    return SDValue();

  ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
    ISD::NON_EXTLOAD : ISD::EXTLOAD;
  if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
    return SDValue();

  Align = NewAlign;

  SDValue NewPtr = OriginalLoad->getBasePtr();
  SDValue Offset;
  EVT PtrType = NewPtr.getValueType();
  MachinePointerInfo MPI;
  SDLoc DL(EVE);
  if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
    // Constant index: byte offset is Elt * element-size.
    int Elt = ConstEltNo->getZExtValue();
    unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
    Offset = DAG.getConstant(PtrOff, DL, PtrType);
    MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
  } else {
    // Variable index: compute the offset as EltNo * store-size in the DAG.
    Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType);
    Offset = DAG.getNode(
        ISD::MUL, DL, PtrType, Offset,
        DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
    // Discard the pointer info except the address space because the memory
    // operand can't represent this new access since the offset is variable.
    MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
  }
  NewPtr = DAG.getMemBasePlusOffset(NewPtr, Offset, DL);

  // The replacement we need to do here is a little tricky: we need to
  // replace an extractelement of a load with a load.
  // Use ReplaceAllUsesOfValuesWith to do the replacement.
  // Note that this replacement assumes that the extractvalue is the only
  // use of the load; that's okay because we don't want to perform this
  // transformation in other cases anyway.
  SDValue Load;
  SDValue Chain;
  if (ResultVT.bitsGT(VecEltVT)) {
    // If the result type of vextract is wider than the load, then issue an
    // extending load instead. Prefer ZEXTLOAD when legal, otherwise EXTLOAD
    // (any-extend).
    ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
                                                  VecEltVT)
                                   ? ISD::ZEXTLOAD
                                   : ISD::EXTLOAD;
    Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
                          OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
                          Align, OriginalLoad->getMemOperand()->getFlags(),
                          OriginalLoad->getAAInfo());
    Chain = Load.getValue(1);
  } else {
    Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr,
                       MPI, Align, OriginalLoad->getMemOperand()->getFlags(),
                       OriginalLoad->getAAInfo());
    Chain = Load.getValue(1);
    // Narrow or bit-convert the loaded element to the extract's result type.
    if (ResultVT.bitsLT(VecEltVT))
      Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
    else
      Load = DAG.getBitcast(ResultVT, Load);
  }
  WorklistRemover DeadNodes(*this);
  // Replace both the extract's value and the original load's chain in one go.
  SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
  SDValue To[] = { Load, Chain };
  DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
  // Make sure to revisit this node to clean it up; it will usually be dead.
  AddToWorklist(EVE);
  // Since we're explicitly calling ReplaceAllUses, add the new node to the
  // worklist explicitly as well.
  AddToWorklistWithUsers(Load.getNode());
  ++OpsNarrowed;
  return SDValue(EVE, 0);
}

/// Transform a vector binary operation into a scalar binary operation by moving
/// the math/logic after an extract element of a vector.
static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
                                       bool LegalOperations) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Vec = ExtElt->getOperand(0);
  SDValue Index = ExtElt->getOperand(1);
  auto *IndexC = dyn_cast<ConstantSDNode>(Index);
  // Require a constant index, a single-result binop vector, and a sole user
  // (this extract); otherwise the vector op must stay anyway.
  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
      Vec.getNode()->getNumValues() != 1)
    return SDValue();

  // Targets may want to avoid this to prevent an expensive register transfer.
  if (!TLI.shouldScalarizeBinop(Vec))
    return SDValue();

  // Extracting an element of a vector constant is constant-folded, so this
  // transform is just replacing a vector op with a scalar op while moving the
  // extract.
  SDValue Op0 = Vec.getOperand(0);
  SDValue Op1 = Vec.getOperand(1);
  // At least one binop operand must be a (possibly opaque) constant vector so
  // its extracted element folds away.
  if (isAnyConstantBuildVector(Op0, true) ||
      isAnyConstantBuildVector(Op1, true)) {
    // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
    // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
    SDLoc DL(ExtElt);
    EVT VT = ExtElt->getValueType(0);
    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
  }

  return SDValue();
}

// Combine an EXTRACT_VECTOR_ELT node: fold extracts of inserts, of
// scalar_to_vector, of build_vector, of bitcasts and shuffles, and finally
// try to turn an extract of a loaded vector into a narrow scalar load.
SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
  SDValue VecOp = N->getOperand(0);
  SDValue Index = N->getOperand(1);
  EVT ScalarVT = N->getValueType(0);
  EVT VecVT = VecOp.getValueType();
  if (VecOp.isUndef())
    return DAG.getUNDEF(ScalarVT);

  // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
  //
  // This only really matters if the index is non-constant since other combines
  // on the constant elements already work.
  SDLoc DL(N);
  if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
      Index == VecOp.getOperand(2)) {
    SDValue Elt = VecOp.getOperand(1);
    return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
  }

  // (vextract (scalar_to_vector val, 0) -> val
  if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    // Only 0'th element of SCALAR_TO_VECTOR is defined.
    if (DAG.isKnownNeverZero(Index))
      return DAG.getUNDEF(ScalarVT);

    // Check if the result type doesn't match the inserted element type. A
    // SCALAR_TO_VECTOR may truncate the inserted element and the
    // EXTRACT_VECTOR_ELT may widen the extracted vector.
    SDValue InOp = VecOp.getOperand(0);
    if (InOp.getValueType() != ScalarVT) {
      assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
      return DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
    }
    return InOp;
  }

  // extract_vector_elt of out-of-bounds element -> UNDEF
  auto *IndexC = dyn_cast<ConstantSDNode>(Index);
  unsigned NumElts = VecVT.getVectorNumElements();
  if (IndexC && IndexC->getAPIntValue().uge(NumElts))
    return DAG.getUNDEF(ScalarVT);

  // extract_vector_elt (build_vector x, y), 1 -> y
  if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR &&
      TLI.isTypeLegal(VecVT) &&
      (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
    SDValue Elt = VecOp.getOperand(IndexC->getZExtValue());
    EVT InEltVT = Elt.getValueType();

    // Sometimes build_vector's scalar input types do not match result type.
    if (ScalarVT == InEltVT)
      return Elt;

    // TODO: It may be useful to truncate if free if the build_vector implicitly
    // converts.
  }

  // TODO: These transforms should not require the 'hasOneUse' restriction, but
  // there are regressions on multiple targets without it. We can end up with a
  // mess of scalar and vector code if we reduce only part of the DAG to scalar.
  if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() &&
      VecOp.hasOneUse()) {
    // The vector index of the LSBs of the source depend on the endian-ness.
    bool IsLE = DAG.getDataLayout().isLittleEndian();
    unsigned ExtractIndex = IndexC->getZExtValue();
    // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
    unsigned BCTruncElt = IsLE ? 0 : NumElts - 1;
    SDValue BCSrc = VecOp.getOperand(0);
    if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
      return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc);

    if (LegalTypes && BCSrc.getValueType().isInteger() &&
        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
      // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
      // trunc i64 X to i32
      SDValue X = BCSrc.getOperand(0);
      assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
             "Extract element and scalar to vector can't change element type "
             "from FP to integer.");
      unsigned XBitWidth = X.getValueSizeInBits();
      unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
      // Recompute the LSB element index in terms of the wider scalar source.
      BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;

      // An extract element return value type can be wider than its vector
      // operand element type. In that case, the high bits are undefined, so
      // it's possible that we may need to extend rather than truncate.
      if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
        assert(XBitWidth % VecEltBitWidth == 0 &&
               "Scalar bitwidth must be a multiple of vector element bitwidth");
        return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
      }
    }
  }

  if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
    return BO;

  // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
  // We only perform this optimization before the op legalization phase because
  // we may introduce new vector instructions which are not backed by TD
  // patterns. For example on AVX, extracting elements from a wide vector
  // without using extract_subvector. However, if we can find an underlying
  // scalar value, then we can always use that.
  if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
    auto *Shuf = cast<ShuffleVectorSDNode>(VecOp);
    // Find the new index to extract from.
    int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue());

    // Extracting an undef index is undef.
    if (OrigElt == -1)
      return DAG.getUNDEF(ScalarVT);

    // Select the right vector half to extract from.
    // Mask indices [0, NumElts) refer to operand 0, [NumElts, 2*NumElts) to
    // operand 1.
    SDValue SVInVec;
    if (OrigElt < (int)NumElts) {
      SVInVec = VecOp.getOperand(0);
    } else {
      SVInVec = VecOp.getOperand(1);
      OrigElt -= NumElts;
    }

    if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue InOp = SVInVec.getOperand(OrigElt);
      if (InOp.getValueType() != ScalarVT) {
        assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
        InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
      }

      return InOp;
    }

    // FIXME: We should handle recursing on other vector shuffles and
    // scalar_to_vector here as well.

    if (!LegalOperations ||
        // FIXME: Should really be just isOperationLegalOrCustom.
        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
        TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
                         DAG.getVectorIdxConstant(OrigElt, DL));
    }
  }

  // If only EXTRACT_VECTOR_ELT nodes use the source vector we can
  // simplify it based on the (valid) extraction indices.
  if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) {
        return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Use->getOperand(0) == VecOp &&
               isa<ConstantSDNode>(Use->getOperand(1));
      })) {
    // Collect the union of all in-range extracted lanes.
    APInt DemandedElts = APInt::getNullValue(NumElts);
    for (SDNode *Use : VecOp->uses()) {
      auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
      if (CstElt->getAPIntValue().ult(NumElts))
        DemandedElts.setBit(CstElt->getZExtValue());
    }
    if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) {
      // We simplified the vector operand of this extract element. If this
      // extract is not dead, visit it again so it is folded properly.
      if (N->getOpcode() != ISD::DELETED_NODE)
        AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // Everything under here is trying to match an extract of a loaded value.
  // If the result of load has to be truncated, then it's not necessarily
  // profitable.
  bool BCNumEltsChanged = false;
  EVT ExtVT = VecVT.getVectorElementType();
  EVT LVT = ExtVT;
  if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT))
    return SDValue();

  if (VecOp.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!VecOp.hasOneUse())
      return SDValue();

    EVT BCVT = VecOp.getOperand(0).getValueType();
    if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
      return SDValue();
    // Remember if the bitcast changed the element count; if so, shuffle masks
    // below can't be interpreted against the pre-bitcast vector.
    if (NumElts != BCVT.getVectorNumElements())
      BCNumEltsChanged = true;
    VecOp = VecOp.getOperand(0);
    ExtVT = BCVT.getVectorElementType();
  }

  // extract (vector load $addr), i --> load $addr + i * size
  if (!LegalOperations && !IndexC && VecOp.hasOneUse() &&
      ISD::isNormalLoad(VecOp.getNode()) &&
      !Index->hasPredecessor(VecOp.getNode())) {
    auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
    if (VecLoad && VecLoad->isSimple())
      return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
  }

  // Perform only after legalization to ensure build_vector / vector_shuffle
  // optimizations have already been done.
  if (!LegalOperations || !IndexC)
    return SDValue();

  // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
  // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
  int Elt = IndexC->getZExtValue();
  LoadSDNode *LN0 = nullptr;
  if (ISD::isNormalLoad(VecOp.getNode())) {
    LN0 = cast<LoadSDNode>(VecOp);
  } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
             VecOp.getOperand(0).getValueType() == ExtVT &&
             ISD::isNormalLoad(VecOp.getOperand(0).getNode())) {
    // Don't duplicate a load with other uses.
    if (!VecOp.hasOneUse())
      return SDValue();

    LN0 = cast<LoadSDNode>(VecOp.getOperand(0));
  }
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) {
    // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
    // =>
    // (load $addr+1*size)

    // Don't duplicate a load with other uses.
    if (!VecOp.hasOneUse())
      return SDValue();

    // If the bit convert changed the number of elements, it is unsafe
    // to examine the mask.
    if (BCNumEltsChanged)
      return SDValue();

    // Select the input vector, guarding against out of range extract vector.
    int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt);
    VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1);

    if (VecOp.getOpcode() == ISD::BITCAST) {
      // Don't duplicate a load with other uses.
      if (!VecOp.hasOneUse())
        return SDValue();

      VecOp = VecOp.getOperand(0);
    }
    if (ISD::isNormalLoad(VecOp.getNode())) {
      LN0 = cast<LoadSDNode>(VecOp);
      // Re-map the extract index into the selected shuffle input.
      Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
      Index = DAG.getConstant(Elt, DL, Index.getValueType());
    }
  }

  // Make sure we found a non-volatile load and the extractelement is
  // the only use.
  if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple())
    return SDValue();

  // If Idx was -1 above, Elt is going to be -1, so just return undef.
  if (Elt == -1)
    return DAG.getUNDEF(LVT);

  return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
}

// Simplify (build_vec (ext )) to (bitcast (build_vec ))
SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
  // We perform this optimization post type-legalization because
  // the type-legalizer often scalarizes integer-promoted vectors.
  // Performing this optimization before may create bit-casts which
  // will be type-legalized to complex code sequences.
  // We perform this optimization only before the operation legalizer because we
  // may introduce illegal operations.
  if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
    return SDValue();

  unsigned NumInScalars = N->getNumOperands();
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // Check to see if this is a BUILD_VECTOR of a bunch of values
  // which come from any_extend or zero_extend nodes. If so, we can create
  // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
  // optimizations. We do not handle sign-extend because we can't fill the sign
  // using shuffles.
  EVT SourceType = MVT::Other;
  bool AllAnyExt = true;

  for (unsigned i = 0; i != NumInScalars; ++i) {
    SDValue In = N->getOperand(i);
    // Ignore undef inputs.
    if (In.isUndef()) continue;

    bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
    bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;

    // Abort if the element is not an extension.
    if (!ZeroExt && !AnyExt) {
      SourceType = MVT::Other;
      break;
    }

    // The input is a ZeroExt or AnyExt. Check the original type.
    EVT InTy = In.getOperand(0).getValueType();

    // Check that all of the widened source types are the same.
    if (SourceType == MVT::Other)
      // First time.
      SourceType = InTy;
    else if (InTy != SourceType) {
      // Multiple income types. Abort.
      SourceType = MVT::Other;
      break;
    }

    // Check if all of the extends are ANY_EXTENDs.
    AllAnyExt &= AnyExt;
  }

  // In order to have valid types, all of the inputs must be extended from the
  // same source type and all of the inputs must be any or zero extend.
  // Scalar sizes must be a power of two.
  EVT OutScalarTy = VT.getScalarType();
  bool ValidTypes = SourceType != MVT::Other &&
                    isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
                    isPowerOf2_32(SourceType.getSizeInBits());

  // Create a new simpler BUILD_VECTOR sequence which other optimizations can
  // turn into a single shuffle instruction.
  if (!ValidTypes)
    return SDValue();

  bool isLE = DAG.getDataLayout().isLittleEndian();
  unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
  assert(ElemRatio > 1 && "Invalid element size ratio");
  // Filler for the lanes not covered by an original scalar: undef when every
  // input was an any_extend, zero otherwise (to preserve zero-extension).
  SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
                               DAG.getConstant(0, DL, SourceType);

  unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
  SmallVector<SDValue, 8> Ops(NewBVElems, Filler);

  // Populate the new build_vector
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDValue Cast = N->getOperand(i);
    assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
            Cast.getOpcode() == ISD::ZERO_EXTEND ||
            Cast.isUndef()) && "Invalid cast opcode");
    SDValue In;
    if (Cast.isUndef())
      In = DAG.getUNDEF(SourceType);
    else
      In = Cast->getOperand(0);
    // Place the narrow value in the lane holding the low (LE) or high (BE)
    // part of the original wide element.
    unsigned Index = isLE ? (i * ElemRatio) :
                            (i * ElemRatio + (ElemRatio - 1));

    assert(Index < Ops.size() && "Invalid index");
    Ops[Index] = In;
  }

  // The type of the new BUILD_VECTOR node.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
  assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
         "Invalid vector size");
  // Check if the new vector type is legal.
  if (!isTypeLegal(VecVT) ||
      (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
       TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
    return SDValue();

  // Make the new BUILD_VECTOR.
  SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);

  // The new BUILD_VECTOR node has the potential to be further optimized.
  AddToWorklist(BV.getNode());
  // Bitcast to the desired type.
  return DAG.getBitcast(VT, BV);
}

// Simplify (build_vec (trunc $1)
//                     (trunc (srl $1 half-width))
//                     (trunc (srl $1 (2 * half-width))) …)
// to (bitcast $1)
SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");

  // Only for little endian: the pattern below assumes element i holds bits
  // [i*size, (i+1)*size) of the source, which is LE lane order.
  if (!DAG.getDataLayout().isLittleEndian())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT OutScalarTy = VT.getScalarType();
  uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits();

  // Only for power of two types to be sure that bitcast works well
  if (!isPowerOf2_64(ScalarTypeBitsize))
    return SDValue();

  unsigned NumInScalars = N->getNumOperands();

  // Look through bitcasts
  auto PeekThroughBitcast = [](SDValue Op) {
    if (Op.getOpcode() == ISD::BITCAST)
      return Op.getOperand(0);
    return Op;
  };

  // The source value where all the parts are extracted.
  SDValue Src;
  for (unsigned i = 0; i != NumInScalars; ++i) {
    SDValue In = PeekThroughBitcast(N->getOperand(i));
    // Ignore undef inputs.
    if (In.isUndef()) continue;

    if (In.getOpcode() != ISD::TRUNCATE)
      return SDValue();

    In = PeekThroughBitcast(In.getOperand(0));

    if (In.getOpcode() != ISD::SRL) {
      // For now only build_vec without shuffling, handle shifts here in the
      // future.
      if (i != 0)
        return SDValue();

      // Element 0 may be an unshifted truncate of the source.
      Src = In;
    } else {
      // In is SRL
      SDValue part = PeekThroughBitcast(In.getOperand(0));

      if (!Src) {
        Src = part;
      } else if (Src != part) {
        // Vector parts do not stem from the same variable
        return SDValue();
      }

      SDValue ShiftAmtVal = In.getOperand(1);
      if (!isa<ConstantSDNode>(ShiftAmtVal))
        return SDValue();

      uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1);

      // The extracted value is not extracted at the right position
      if (ShiftAmt != i * ScalarTypeBitsize)
        return SDValue();
    }
  }

  // Only cast if the size is the same
  if (Src.getValueType().getSizeInBits() != VT.getSizeInBits())
    return SDValue();

  return DAG.getBitcast(VT, Src);
}

SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                           ArrayRef<int> VectorMask,
                                           SDValue VecIn1, SDValue VecIn2,
                                           unsigned LeftIdx, bool DidSplitVec) {
  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);

  EVT VT = N->getValueType(0);
  EVT InVT1 = VecIn1.getValueType();
  EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;

  unsigned NumElems = VT.getVectorNumElements();
  unsigned ShuffleNumElems = NumElems;

  // If we artificially split a vector in two already, then the offsets in the
  // operands will all be based off of VecIn1, even those in VecIn2.
  unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();

  // We can't generate a shuffle node with mismatched input and output types.
  // Try to make the types match the type of the output.
17585 if (InVT1 != VT || InVT2 != VT) { 17586 if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) { 17587 // If the output vector length is a multiple of both input lengths, 17588 // we can concatenate them and pad the rest with undefs. 17589 unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits(); 17590 assert(NumConcats >= 2 && "Concat needs at least two inputs!"); 17591 SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1)); 17592 ConcatOps[0] = VecIn1; 17593 ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1); 17594 VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); 17595 VecIn2 = SDValue(); 17596 } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) { 17597 if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems)) 17598 return SDValue(); 17599 17600 if (!VecIn2.getNode()) { 17601 // If we only have one input vector, and it's twice the size of the 17602 // output, split it in two. 17603 VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, 17604 DAG.getVectorIdxConstant(NumElems, DL)); 17605 VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx); 17606 // Since we now have shorter input vectors, adjust the offset of the 17607 // second vector's start. 17608 Vec2Offset = NumElems; 17609 } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) { 17610 // VecIn1 is wider than the output, and we have another, possibly 17611 // smaller input. Pad the smaller input with undefs, shuffle at the 17612 // input vector width, and extract the output. 17613 // The shuffle type is different than VT, so check legality again. 17614 if (LegalOperations && 17615 !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1)) 17616 return SDValue(); 17617 17618 // Legalizing INSERT_SUBVECTOR is tricky - you basically have to 17619 // lower it back into a BUILD_VECTOR. So if the inserted type is 17620 // illegal, don't even try. 
17621 if (InVT1 != InVT2) { 17622 if (!TLI.isTypeLegal(InVT2)) 17623 return SDValue(); 17624 VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1, 17625 DAG.getUNDEF(InVT1), VecIn2, ZeroIdx); 17626 } 17627 ShuffleNumElems = NumElems * 2; 17628 } else { 17629 // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider 17630 // than VecIn1. We can't handle this for now - this case will disappear 17631 // when we start sorting the vectors by type. 17632 return SDValue(); 17633 } 17634 } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && 17635 InVT1.getSizeInBits() == VT.getSizeInBits()) { 17636 SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2)); 17637 ConcatOps[0] = VecIn2; 17638 VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); 17639 } else { 17640 // TODO: Support cases where the length mismatch isn't exactly by a 17641 // factor of 2. 17642 // TODO: Move this check upwards, so that if we have bad type 17643 // mismatches, we don't create any DAG nodes. 17644 return SDValue(); 17645 } 17646 } 17647 17648 // Initialize mask to undef. 17649 SmallVector<int, 8> Mask(ShuffleNumElems, -1); 17650 17651 // Only need to run up to the number of elements actually used, not the 17652 // total number of elements in the shuffle - if we are shuffling a wider 17653 // vector, the high lanes should be set to undef. 17654 for (unsigned i = 0; i != NumElems; ++i) { 17655 if (VectorMask[i] <= 0) 17656 continue; 17657 17658 unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1); 17659 if (VectorMask[i] == (int)LeftIdx) { 17660 Mask[i] = ExtIndex; 17661 } else if (VectorMask[i] == (int)LeftIdx + 1) { 17662 Mask[i] = Vec2Offset + ExtIndex; 17663 } 17664 } 17665 17666 // The type the input vectors may have changed above. 17667 InVT1 = VecIn1.getValueType(); 17668 17669 // If we already have a VecIn2, it should have the same type as VecIn1. 17670 // If we don't, get an undef/zero vector of the appropriate type. 
17671 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1); 17672 assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type."); 17673 17674 SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask); 17675 if (ShuffleNumElems > NumElems) 17676 Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx); 17677 17678 return Shuffle; 17679 } 17680 17681 static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) { 17682 assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector"); 17683 17684 // First, determine where the build vector is not undef. 17685 // TODO: We could extend this to handle zero elements as well as undefs. 17686 int NumBVOps = BV->getNumOperands(); 17687 int ZextElt = -1; 17688 for (int i = 0; i != NumBVOps; ++i) { 17689 SDValue Op = BV->getOperand(i); 17690 if (Op.isUndef()) 17691 continue; 17692 if (ZextElt == -1) 17693 ZextElt = i; 17694 else 17695 return SDValue(); 17696 } 17697 // Bail out if there's no non-undef element. 17698 if (ZextElt == -1) 17699 return SDValue(); 17700 17701 // The build vector contains some number of undef elements and exactly 17702 // one other element. That other element must be a zero-extended scalar 17703 // extracted from a vector at a constant index to turn this into a shuffle. 17704 // Also, require that the build vector does not implicitly truncate/extend 17705 // its elements. 17706 // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND. 
17707 EVT VT = BV->getValueType(0); 17708 SDValue Zext = BV->getOperand(ZextElt); 17709 if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() || 17710 Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 17711 !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) || 17712 Zext.getValueSizeInBits() != VT.getScalarSizeInBits()) 17713 return SDValue(); 17714 17715 // The zero-extend must be a multiple of the source size, and we must be 17716 // building a vector of the same size as the source of the extract element. 17717 SDValue Extract = Zext.getOperand(0); 17718 unsigned DestSize = Zext.getValueSizeInBits(); 17719 unsigned SrcSize = Extract.getValueSizeInBits(); 17720 if (DestSize % SrcSize != 0 || 17721 Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits()) 17722 return SDValue(); 17723 17724 // Create a shuffle mask that will combine the extracted element with zeros 17725 // and undefs. 17726 int ZextRatio = DestSize / SrcSize; 17727 int NumMaskElts = NumBVOps * ZextRatio; 17728 SmallVector<int, 32> ShufMask(NumMaskElts, -1); 17729 for (int i = 0; i != NumMaskElts; ++i) { 17730 if (i / ZextRatio == ZextElt) { 17731 // The low bits of the (potentially translated) extracted element map to 17732 // the source vector. The high bits map to zero. We will use a zero vector 17733 // as the 2nd source operand of the shuffle, so use the 1st element of 17734 // that vector (mask value is number-of-elements) for the high bits. 17735 if (i % ZextRatio == 0) 17736 ShufMask[i] = Extract.getConstantOperandVal(1); 17737 else 17738 ShufMask[i] = NumMaskElts; 17739 } 17740 17741 // Undef elements of the build vector remain undef because we initialize 17742 // the shuffle mask with -1. 17743 } 17744 17745 // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... 
--> 17746 // bitcast (shuffle V, ZeroVec, VectorMask) 17747 SDLoc DL(BV); 17748 EVT VecVT = Extract.getOperand(0).getValueType(); 17749 SDValue ZeroVec = DAG.getConstant(0, DL, VecVT); 17750 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17751 SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0), 17752 ZeroVec, ShufMask, DAG); 17753 if (!Shuf) 17754 return SDValue(); 17755 return DAG.getBitcast(VT, Shuf); 17756 } 17757 17758 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT 17759 // operations. If the types of the vectors we're extracting from allow it, 17760 // turn this into a vector_shuffle node. 17761 SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { 17762 SDLoc DL(N); 17763 EVT VT = N->getValueType(0); 17764 17765 // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. 17766 if (!isTypeLegal(VT)) 17767 return SDValue(); 17768 17769 if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG)) 17770 return V; 17771 17772 // May only combine to shuffle after legalize if shuffle is legal. 17773 if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT)) 17774 return SDValue(); 17775 17776 bool UsesZeroVector = false; 17777 unsigned NumElems = N->getNumOperands(); 17778 17779 // Record, for each element of the newly built vector, which input vector 17780 // that element comes from. -1 stands for undef, 0 for the zero vector, 17781 // and positive values for the input vectors. 17782 // VectorMask maps each element to its vector number, and VecIn maps vector 17783 // numbers to their initial SDValues. 17784 17785 SmallVector<int, 8> VectorMask(NumElems, -1); 17786 SmallVector<SDValue, 8> VecIn; 17787 VecIn.push_back(SDValue()); 17788 17789 for (unsigned i = 0; i != NumElems; ++i) { 17790 SDValue Op = N->getOperand(i); 17791 17792 if (Op.isUndef()) 17793 continue; 17794 17795 // See if we can use a blend with a zero vector. 
    // TODO: Should we generalize this to a blend with an arbitrary constant
    // vector?
    if (isNullConstant(Op) || isNullFPConstant(Op)) {
      UsesZeroVector = true;
      VectorMask[i] = 0;
      continue;
    }

    // Not an undef or zero. If the input is something other than an
    // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op.getOperand(1)))
      return SDValue();
    SDValue ExtractedFromVec = Op.getOperand(0);

    const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
    if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
      return SDValue();

    // All inputs must have the same element type as the output.
    if (VT.getVectorElementType() !=
        ExtractedFromVec.getValueType().getVectorElementType())
      return SDValue();

    // Have we seen this input vector before?
    // The vectors are expected to be tiny (usually 1 or 2 elements), so using
    // a map back from SDValues to numbers isn't worth it.
    unsigned Idx = std::distance(
        VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec));
    if (Idx == VecIn.size())
      VecIn.push_back(ExtractedFromVec);

    VectorMask[i] = Idx;
  }

  // If we didn't find at least one input vector, bail out.
  if (VecIn.size() < 2)
    return SDValue();

  // If all the Operands of BUILD_VECTOR extract from same
  // vector, then split the vector efficiently based on the maximum
  // vector access index and adjust the VectorMask and
  // VecIn accordingly.
  bool DidSplitVec = false;
  if (VecIn.size() == 2) {
    // Exactly one real input vector (slot 0 is the dummy/zero entry).
    unsigned MaxIndex = 0;
    unsigned NearestPow2 = 0;
    SDValue Vec = VecIn.back();
    EVT InVT = Vec.getValueType();
    SmallVector<unsigned, 8> IndexVec(NumElems, 0);

    // Gather the extraction indices actually used, and their maximum.
    for (unsigned i = 0; i < NumElems; i++) {
      if (VectorMask[i] <= 0)
        continue;
      unsigned Index = N->getOperand(i).getConstantOperandVal(1);
      IndexVec[i] = Index;
      MaxIndex = std::max(MaxIndex, Index);
    }

    // Heuristic: only split when the input is much wider than the output
    // (NumElems * 2 < NearestPow2) and the split halves cover all used lanes.
    NearestPow2 = PowerOf2Ceil(MaxIndex);
    if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
        NumElems * 2 < NearestPow2) {
      unsigned SplitSize = NearestPow2 / 2;
      EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
                                     InVT.getVectorElementType(), SplitSize);
      if (TLI.isTypeLegal(SplitVT)) {
        SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
                                     DAG.getVectorIdxConstant(SplitSize, DL));
        SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
                                     DAG.getVectorIdxConstant(0, DL));
        VecIn.pop_back();
        VecIn.push_back(VecIn1);
        VecIn.push_back(VecIn2);
        DidSplitVec = true;

        // Re-point each element at the half it now lives in.
        for (unsigned i = 0; i < NumElems; i++) {
          if (VectorMask[i] <= 0)
            continue;
          VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
        }
      }
    }
  }

  // TODO: We want to sort the vectors by descending length, so that adjacent
  // pairs have similar length, and the longer vector is always first in the
  // pair.

  // TODO: Should this fire if some of the input vectors has illegal type (like
  // it does now), or should we let legalization run its course first?

  // Shuffle phase:
  // Take pairs of vectors, and shuffle them so that the result has elements
  // from these vectors in the correct places.
  // For example, given:
  // t10: i32 = extract_vector_elt t1, Constant:i64<0>
  // t11: i32 = extract_vector_elt t2, Constant:i64<0>
  // t12: i32 = extract_vector_elt t3, Constant:i64<0>
  // t13: i32 = extract_vector_elt t1, Constant:i64<1>
  // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
  // We will generate:
  // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
  // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
  SmallVector<SDValue, 4> Shuffles;
  for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
    unsigned LeftIdx = 2 * In + 1;
    SDValue VecLeft = VecIn[LeftIdx];
    SDValue VecRight =
        (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();

    if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
                                                VecRight, LeftIdx, DidSplitVec))
      Shuffles.push_back(Shuffle);
    else
      return SDValue();
  }

  // If we need the zero vector as an "ingredient" in the blend tree, add it
  // to the list of shuffles.
  if (UsesZeroVector)
    Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                      : DAG.getConstantFP(0.0, DL, VT));

  // If we only have one shuffle, we're done.
  if (Shuffles.size() == 1)
    return Shuffles[0];

  // Update the vector mask to point to the post-shuffle vectors.
  for (int &Vec : VectorMask)
    if (Vec == 0)
      Vec = Shuffles.size() - 1;
    else
      Vec = (Vec - 1) / 2;

  // More than one shuffle. Generate a binary tree of blends, e.g. if from
  // the previous step we got the set of shuffles t10, t11, t12, t13, we will
  // generate:
  // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
  // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
  // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
  // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
  // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
  // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
  // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21

  // Make sure the initial size of the shuffle list is even.
  if (Shuffles.size() % 2)
    Shuffles.push_back(DAG.getUNDEF(VT));

  for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
    // Pad odd levels with an undef so every node has a blend partner.
    if (CurSize % 2) {
      Shuffles[CurSize] = DAG.getUNDEF(VT);
      CurSize++;
    }
    for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
      int Left = 2 * In;
      int Right = 2 * In + 1;
      SmallVector<int, 8> Mask(NumElems, -1);
      for (unsigned i = 0; i != NumElems; ++i) {
        if (VectorMask[i] == Left) {
          Mask[i] = i;
          VectorMask[i] = In;
        } else if (VectorMask[i] == Right) {
          Mask[i] = i + NumElems;
          VectorMask[i] = In;
        }
      }

      Shuffles[In] =
          DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
    }
  }
  return Shuffles[0];
}

// Try to turn a build vector of zero extends of extract vector elts into a
// a vector zero extend and possibly an extract subvector.
// TODO: Support sign extend?
// TODO: Allow undef elements?
SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
  if (LegalOperations)
    return SDValue();

  EVT VT = N->getValueType(0);

  bool FoundZeroExtend = false;
  SDValue Op0 = N->getOperand(0);
  // Returns the constant extraction index of Op if it is a
  // (zext|aext (extract_vector_elt X, C)) from the same X as Op0, or -1.
  // Side effect: records in FoundZeroExtend whether any element used
  // zero_extend (vs. any_extend), which picks the final opcode below.
  auto checkElem = [&](SDValue Op) -> int64_t {
    unsigned Opc = Op.getOpcode();
    FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
    if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
        Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
      if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
        return C->getZExtValue();
    return -1;
  };

  // Make sure the first element matches
  // (zext (extract_vector_elt X, C))
  int64_t Offset = checkElem(Op0);
  if (Offset < 0)
    return SDValue();

  unsigned NumElems = N->getNumOperands();
  SDValue In = Op0.getOperand(0).getOperand(0);
  EVT InSVT = In.getValueType().getScalarType();
  EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);

  // Don't create an illegal input type after type legalization.
  if (LegalTypes && !TLI.isTypeLegal(InVT))
    return SDValue();

  // Ensure all the elements come from the same vector and are adjacent.
  for (unsigned i = 1; i != NumElems; ++i) {
    if ((Offset + i) != checkElem(N->getOperand(i)))
      return SDValue();
  }

  SDLoc DL(N);
  In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
                   Op0.getOperand(0).getOperand(1));
  return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
                     VT, In);
}

// Main combine entry point for BUILD_VECTOR nodes: tries a sequence of
// independent simplifications and returns the first that succeeds, or an
// empty SDValue if none apply.
SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
  EVT VT = N->getValueType(0);

  // A vector built entirely of undefs is undef.
  if (ISD::allOperandsUndef(N))
    return DAG.getUNDEF(VT);

  // If this is a splat of a bitcast from another vector, change to a
  // concat_vector.
  // For example:
  // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
  //   (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
  //
  // If X is a build_vector itself, the concat can become a larger build_vector.
  // TODO: Maybe this is useful for non-splat too?
  if (!LegalOperations) {
    if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
      Splat = peekThroughBitcasts(Splat);
      EVT SrcVT = Splat.getValueType();
      if (SrcVT.isVector()) {
        unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
        EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
                                     SrcVT.getVectorElementType(), NumElts);
        if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
          SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
          SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
                                       NewVT, Ops);
          return DAG.getBitcast(VT, Concat);
        }
      }
    }
  }

  // A splat of a single element is a SPLAT_VECTOR if supported on the target.
  if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand)
    if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
      assert(!V.isUndef() && "Splat of undef should have been handled earlier");
      return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V);
    }

  // Check if we can express BUILD VECTOR via subvector extract.
  if (!LegalTypes && (N->getNumOperands() > 1)) {
    SDValue Op0 = N->getOperand(0);
    // Returns the constant extraction index of Op if it is an
    // extract_vector_elt from the same vector as Op0; -1 (which wraps to
    // UINT64_MAX in the uint64_t return) acts as the "no match" sentinel.
    auto checkElem = [&](SDValue Op) -> uint64_t {
      if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
          (Op0.getOperand(0) == Op.getOperand(0)))
        if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
          return CNode->getZExtValue();
      return -1;
    };

    // All elements must extract consecutive lanes starting at Offset.
    int Offset = checkElem(Op0);
    for (unsigned i = 0; i < N->getNumOperands(); ++i) {
      if (Offset + i != checkElem(N->getOperand(i))) {
        Offset = -1;
        break;
      }
    }

    if ((Offset == 0) &&
        (Op0.getOperand(0).getValueType() == N->getValueType(0)))
      return Op0.getOperand(0);
    if ((Offset != -1) &&
        ((Offset % N->getValueType(0).getVectorNumElements()) ==
         0)) // IDX must be multiple of output size.
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
                         Op0.getOperand(0), Op0.getOperand(1));
  }

  if (SDValue V = convertBuildVecZextToZext(N))
    return V;

  if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
    return V;

  if (SDValue V = reduceBuildVecTruncToBitCast(N))
    return V;

  if (SDValue V = reduceBuildVecToShuffle(N))
    return V;

  return SDValue();
}

// Fold a CONCAT_VECTORS whose operands are all bitcast-from-scalar (or
// undef) into a single BUILD_VECTOR of those scalars, bitcast to the result
// type. Only fires when the operand vector type is not already legal.
static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT OpVT = N->getOperand(0).getValueType();

  // If the operands are legal vectors, leave them alone.
  if (TLI.isTypeLegal(OpVT))
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SmallVector<SDValue, 8> Ops;

  EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
  SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);

  // Keep track of what we encounter.
  bool AnyInteger = false;
  bool AnyFP = false;
  for (const SDValue &Op : N->ops()) {
    // Only bitcast-from-scalar or undef operands are handled.
    if (ISD::BITCAST == Op.getOpcode() &&
        !Op.getOperand(0).getValueType().isVector())
      Ops.push_back(Op.getOperand(0));
    else if (ISD::UNDEF == Op.getOpcode())
      Ops.push_back(ScalarUndef);
    else
      return SDValue();

    // Note whether we encounter an integer or floating point scalar.
    // If it's neither, bail out, it could be something weird like x86mmx.
    EVT LastOpVT = Ops.back().getValueType();
    if (LastOpVT.isFloatingPoint())
      AnyFP = true;
    else if (LastOpVT.isInteger())
      AnyInteger = true;
    else
      return SDValue();
  }

  // If any of the operands is a floating point scalar bitcast to a vector,
  // use floating point types throughout, and bitcast everything.
  // Replace UNDEFs by another scalar UNDEF node, of the final desired type.
  if (AnyFP) {
    SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
    ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
    if (AnyInteger) {
      for (SDValue &Op : Ops) {
        if (Op.getValueType() == SVT)
          continue;
        if (Op.isUndef())
          Op = ScalarUndef;
        else
          Op = DAG.getBitcast(SVT, Op);
      }
    }
  }

  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
                               VT.getSizeInBits() / SVT.getSizeInBits());
  return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
}

// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
// most two distinct vectors the same size as the result, attempt to turn this
// into a legal shuffle.
static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  EVT OpVT = N->getOperand(0).getValueType();
  int NumElts = VT.getVectorNumElements();
  int NumOpElts = OpVT.getVectorNumElements();

  SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
  SmallVector<int, 8> Mask;

  for (SDValue Op : N->ops()) {
    Op = peekThroughBitcasts(Op);

    // UNDEF nodes convert to UNDEF shuffle mask values.
    if (Op.isUndef()) {
      Mask.append((unsigned)NumOpElts, -1);
      continue;
    }

    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return SDValue();

    // What vector are we extracting the subvector from and at what index?
    SDValue ExtVec = Op.getOperand(0);

    // We want the EVT of the original extraction to correctly scale the
    // extraction index.
    EVT ExtVT = ExtVec.getValueType();
    ExtVec = peekThroughBitcasts(ExtVec);

    // UNDEF nodes convert to UNDEF shuffle mask values.
    if (ExtVec.isUndef()) {
      Mask.append((unsigned)NumOpElts, -1);
      continue;
    }

    if (!isa<ConstantSDNode>(Op.getOperand(1)))
      return SDValue();
    int ExtIdx = Op.getConstantOperandVal(1);

    // Ensure that we are extracting a subvector from a vector the same
    // size as the result.
    if (ExtVT.getSizeInBits() != VT.getSizeInBits())
      return SDValue();

    // Scale the subvector index to account for any bitcast.
    int NumExtElts = ExtVT.getVectorNumElements();
    if (0 == (NumExtElts % NumElts))
      ExtIdx /= (NumExtElts / NumElts);
    else if (0 == (NumElts % NumExtElts))
      ExtIdx *= (NumElts / NumExtElts);
    else
      return SDValue();

    // At most we can reference 2 inputs in the final shuffle.
    if (SV0.isUndef() || SV0 == ExtVec) {
      SV0 = ExtVec;
      for (int i = 0; i != NumOpElts; ++i)
        Mask.push_back(i + ExtIdx);
    } else if (SV1.isUndef() || SV1 == ExtVec) {
      SV1 = ExtVec;
      for (int i = 0; i != NumOpElts; ++i)
        Mask.push_back(i + ExtIdx + NumElts);
    } else {
      return SDValue();
    }
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
                                     DAG.getBitcast(VT, SV1), Mask, DAG);
}

// Main combine entry point for CONCAT_VECTORS nodes: tries a sequence of
// independent folds and returns the first that succeeds, or an empty
// SDValue if none apply.
SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
  // If we only have one input vector, we don't need to do any concatenation.
  if (N->getNumOperands() == 1)
    return N->getOperand(0);

  // Check if all of the operands are undefs.
  EVT VT = N->getValueType(0);
  if (ISD::allOperandsUndef(N))
    return DAG.getUNDEF(VT);

  // Optimize concat_vectors where all but the first of the vectors are undef.
  if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) {
        return Op.isUndef();
      })) {
    SDValue In = N->getOperand(0);
    assert(In.getValueType().isVector() && "Must concat vectors");

    // If the input is a concat_vectors, just make a larger concat by padding
    // with smaller undefs.
    if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
      unsigned NumOps = N->getNumOperands() * In.getNumOperands();
      SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
      Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
    }

    SDValue Scalar = peekThroughOneUseBitcasts(In);

    // concat_vectors(scalar_to_vector(scalar), undef) ->
    //     scalar_to_vector(scalar)
    if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        Scalar.hasOneUse()) {
      EVT SVT = Scalar.getValueType().getVectorElementType();
      if (SVT == Scalar.getOperand(0).getValueType())
        Scalar = Scalar.getOperand(0);
    }

    // concat_vectors(scalar, undef) -> scalar_to_vector(scalar)
    if (!Scalar.getValueType().isVector()) {
      // If the bitcast type isn't legal, it might be a trunc of a legal type;
      // look through the trunc so we can still do the transform:
      //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
      if (Scalar->getOpcode() == ISD::TRUNCATE &&
          !TLI.isTypeLegal(Scalar.getValueType()) &&
          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
        Scalar = Scalar->getOperand(0);

      EVT SclTy = Scalar.getValueType();

      if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
        return SDValue();

      // Bail out if the vector size is not a multiple of the scalar size.
      if (VT.getSizeInBits() % SclTy.getSizeInBits())
        return SDValue();

      unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
      if (VNTNumElms < 2)
        return SDValue();

      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
      if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
        return SDValue();

      SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
      return DAG.getBitcast(VT, Res);
    }
  }

  // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
  // We have already tested above for an UNDEF only concatenation.
  // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
  // -> (BUILD_VECTOR A, B, ..., C, D, ...)
  auto IsBuildVectorOrUndef = [](const SDValue &Op) {
    return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
  };
  if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
    SmallVector<SDValue, 8> Opnds;
    EVT SVT = VT.getScalarType();

    EVT MinVT = SVT;
    if (!SVT.isFloatingPoint()) {
      // If BUILD_VECTOR are from built from integer, they may have different
      // operand types. Get the smallest type and truncate all operands to it.
      bool FoundMinVT = false;
      for (const SDValue &Op : N->ops())
        if (ISD::BUILD_VECTOR == Op.getOpcode()) {
          EVT OpSVT = Op.getOperand(0).getValueType();
          MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
          FoundMinVT = true;
        }
      assert(FoundMinVT && "Concat vector type mismatch");
    }

    for (const SDValue &Op : N->ops()) {
      EVT OpVT = Op.getValueType();
      unsigned NumElts = OpVT.getVectorNumElements();

      if (ISD::UNDEF == Op.getOpcode())
        Opnds.append(NumElts, DAG.getUNDEF(MinVT));

      if (ISD::BUILD_VECTOR == Op.getOpcode()) {
        if (SVT.isFloatingPoint()) {
          assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
          Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
        } else {
          for (unsigned i = 0; i != NumElts; ++i)
            Opnds.push_back(
                DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
        }
      }
    }

    assert(VT.getVectorNumElements() == Opnds.size() &&
           "Concat vector type mismatch");
    return DAG.getBuildVector(VT, SDLoc(N), Opnds);
  }

  // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
  if (SDValue V = combineConcatVectorOfScalars(N, DAG))
    return V;

  // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
    if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
      return V;

  // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
  // nodes often generate nop CONCAT_VECTOR nodes.
  // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
  // place the incoming vectors at the exact same location.
  SDValue SingleSource = SDValue();
  unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();

  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDValue Op = N->getOperand(i);

    if (Op.isUndef())
      continue;

    // Check if this is the identity extract:
    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return SDValue();

    // Find the single incoming vector for the extract_subvector.
    if (SingleSource.getNode()) {
      if (Op.getOperand(0) != SingleSource)
        return SDValue();
    } else {
      SingleSource = Op.getOperand(0);

      // Check the source type is the same as the type of the result.
      // If not, this concat may extend the vector, so we can not
      // optimize it away.
      if (SingleSource.getValueType() != N->getValueType(0))
        return SDValue();
    }

    auto *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
    // The extract index must be constant.
    if (!CS)
      return SDValue();

    // Check that we are reading from the identity index.
    unsigned IdentityIndex = i * PartNumElem;
    if (CS->getAPIntValue() != IdentityIndex)
      return SDValue();
  }

  // Every operand was the identity extract of SingleSource, so the concat
  // just reassembles it.
  if (SingleSource.getNode())
    return SingleSource;

  return SDValue();
}

// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
// if the subvector can be sourced for free.
static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) {
  // Case 1: V inserts exactly the SubVT-typed value at this Index.
  if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
      V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) {
    return V.getOperand(1);
  }
  // Case 2: V is a concat of SubVT pieces and Index lands on a piece boundary.
  auto *IndexC = dyn_cast<ConstantSDNode>(Index);
  if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
      V.getOperand(0).getValueType() == SubVT &&
      (IndexC->getZExtValue() % SubVT.getVectorNumElements()) == 0) {
    uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorNumElements();
    return V.getOperand(SubIdx);
  }
  return SDValue();
}

// Narrow an (extract_subvector (binop X, Y), Index) when both X and Y can
// provide the Index'd subvector for free (see getSubVectorSrc). Function
// continues past this chunk.
static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
                                              SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue BinOp = Extract->getOperand(0);
  unsigned BinOpcode = BinOp.getOpcode();
  // Only single-result binary ops are handled.
  if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1)
    return SDValue();

  EVT VecVT = BinOp.getValueType();
  SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
  if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType())
    return SDValue();

  SDValue Index = Extract->getOperand(1);
  EVT SubVT = Extract->getValueType(0);
  // The narrowed binop must be legal (or custom) at the subvector type.
  if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT))
    return SDValue();

  SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT);
  SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT);

  // TODO: We could handle the case where only 1 operand is being inserted by
  // creating an extract of the other operand, but that requires checking
  // number of uses and/or costs.
  if (!Sub0 || !Sub1)
    return SDValue();

  // We are inserting both operands of the wide binop only to extract back
  // to the narrow vector size.
Eliminate all of the insert/extract: 18464 // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y 18465 return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1, 18466 BinOp->getFlags()); 18467 } 18468 18469 /// If we are extracting a subvector produced by a wide binary operator try 18470 /// to use a narrow binary operator and/or avoid concatenation and extraction. 18471 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { 18472 // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share 18473 // some of these bailouts with other transforms. 18474 18475 if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG)) 18476 return V; 18477 18478 // The extract index must be a constant, so we can map it to a concat operand. 18479 auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); 18480 if (!ExtractIndexC) 18481 return SDValue(); 18482 18483 // We are looking for an optionally bitcasted wide vector binary operator 18484 // feeding an extract subvector. 18485 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18486 SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0)); 18487 unsigned BOpcode = BinOp.getOpcode(); 18488 if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1) 18489 return SDValue(); 18490 18491 // The binop must be a vector type, so we can extract some fraction of it. 18492 EVT WideBVT = BinOp.getValueType(); 18493 if (!WideBVT.isVector()) 18494 return SDValue(); 18495 18496 EVT VT = Extract->getValueType(0); 18497 unsigned ExtractIndex = ExtractIndexC->getZExtValue(); 18498 assert(ExtractIndex % VT.getVectorNumElements() == 0 && 18499 "Extract index is not a multiple of the vector length."); 18500 18501 // Bail out if this is not a proper multiple width extraction. 
18502 unsigned WideWidth = WideBVT.getSizeInBits(); 18503 unsigned NarrowWidth = VT.getSizeInBits(); 18504 if (WideWidth % NarrowWidth != 0) 18505 return SDValue(); 18506 18507 // Bail out if we are extracting a fraction of a single operation. This can 18508 // occur because we potentially looked through a bitcast of the binop. 18509 unsigned NarrowingRatio = WideWidth / NarrowWidth; 18510 unsigned WideNumElts = WideBVT.getVectorNumElements(); 18511 if (WideNumElts % NarrowingRatio != 0) 18512 return SDValue(); 18513 18514 // Bail out if the target does not support a narrower version of the binop. 18515 EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), 18516 WideNumElts / NarrowingRatio); 18517 if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) 18518 return SDValue(); 18519 18520 // If extraction is cheap, we don't need to look at the binop operands 18521 // for concat ops. The narrow binop alone makes this transform profitable. 18522 // We can't just reuse the original extract index operand because we may have 18523 // bitcasted. 18524 unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements(); 18525 unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); 18526 if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) && 18527 BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) { 18528 // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N) 18529 SDLoc DL(Extract); 18530 SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL); 18531 SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 18532 BinOp.getOperand(0), NewExtIndex); 18533 SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 18534 BinOp.getOperand(1), NewExtIndex); 18535 SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, 18536 BinOp.getNode()->getFlags()); 18537 return DAG.getBitcast(VT, NarrowBinOp); 18538 } 18539 18540 // Only handle the case where we are doubling and then halving. 
A larger ratio 18541 // may require more than two narrow binops to replace the wide binop. 18542 if (NarrowingRatio != 2) 18543 return SDValue(); 18544 18545 // TODO: The motivating case for this transform is an x86 AVX1 target. That 18546 // target has temptingly almost legal versions of bitwise logic ops in 256-bit 18547 // flavors, but no other 256-bit integer support. This could be extended to 18548 // handle any binop, but that may require fixing/adding other folds to avoid 18549 // codegen regressions. 18550 if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) 18551 return SDValue(); 18552 18553 // We need at least one concatenation operation of a binop operand to make 18554 // this transform worthwhile. The concat must double the input vector sizes. 18555 auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue { 18556 if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2) 18557 return V.getOperand(ConcatOpNum); 18558 return SDValue(); 18559 }; 18560 SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0))); 18561 SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1))); 18562 18563 if (SubVecL || SubVecR) { 18564 // If a binop operand was not the result of a concat, we must extract a 18565 // half-sized operand for our new narrow binop: 18566 // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN 18567 // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC) 18568 // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN 18569 SDLoc DL(Extract); 18570 SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL); 18571 SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL) 18572 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 18573 BinOp.getOperand(0), IndexC); 18574 18575 SDValue Y = SubVecR ? 
DAG.getBitcast(NarrowBVT, SubVecR) 18576 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 18577 BinOp.getOperand(1), IndexC); 18578 18579 SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); 18580 return DAG.getBitcast(VT, NarrowBinOp); 18581 } 18582 18583 return SDValue(); 18584 } 18585 18586 /// If we are extracting a subvector from a wide vector load, convert to a 18587 /// narrow load to eliminate the extraction: 18588 /// (extract_subvector (load wide vector)) --> (load narrow vector) 18589 static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { 18590 // TODO: Add support for big-endian. The offset calculation must be adjusted. 18591 if (DAG.getDataLayout().isBigEndian()) 18592 return SDValue(); 18593 18594 auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0)); 18595 auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); 18596 if (!Ld || Ld->getExtensionType() || !Ld->isSimple() || 18597 !ExtIdx) 18598 return SDValue(); 18599 18600 // Allow targets to opt-out. 18601 EVT VT = Extract->getValueType(0); 18602 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18603 if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT)) 18604 return SDValue(); 18605 18606 // The narrow load will be offset from the base address of the old load if 18607 // we are extracting from something besides index 0 (little-endian). 18608 SDLoc DL(Extract); 18609 SDValue BaseAddr = Ld->getOperand(1); 18610 unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); 18611 18612 // TODO: Use "BaseIndexOffset" to make this more effective. 
18613 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); 18614 MachineFunction &MF = DAG.getMachineFunction(); 18615 MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, 18616 VT.getStoreSize()); 18617 SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); 18618 DAG.makeEquivalentMemoryOrdering(Ld, NewLd); 18619 return NewLd; 18620 } 18621 18622 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { 18623 EVT NVT = N->getValueType(0); 18624 SDValue V = N->getOperand(0); 18625 18626 // Extract from UNDEF is UNDEF. 18627 if (V.isUndef()) 18628 return DAG.getUNDEF(NVT); 18629 18630 if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) 18631 if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) 18632 return NarrowLoad; 18633 18634 // Combine an extract of an extract into a single extract_subvector. 18635 // ext (ext X, C), 0 --> ext X, C 18636 SDValue Index = N->getOperand(1); 18637 if (isNullConstant(Index) && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && 18638 V.hasOneUse() && isa<ConstantSDNode>(V.getOperand(1))) { 18639 if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), 18640 V.getConstantOperandVal(1)) && 18641 TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) { 18642 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0), 18643 V.getOperand(1)); 18644 } 18645 } 18646 18647 // Try to move vector bitcast after extract_subv by scaling extraction index: 18648 // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') 18649 if (isa<ConstantSDNode>(Index) && V.getOpcode() == ISD::BITCAST && 18650 V.getOperand(0).getValueType().isVector()) { 18651 SDValue SrcOp = V.getOperand(0); 18652 EVT SrcVT = SrcOp.getValueType(); 18653 unsigned SrcNumElts = SrcVT.getVectorNumElements(); 18654 unsigned DestNumElts = V.getValueType().getVectorNumElements(); 18655 if ((SrcNumElts % DestNumElts) == 0) { 18656 unsigned SrcDestRatio = SrcNumElts / DestNumElts; 18657 
unsigned NewExtNumElts = NVT.getVectorNumElements() * SrcDestRatio; 18658 EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), 18659 NewExtNumElts); 18660 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { 18661 unsigned IndexValScaled = N->getConstantOperandVal(1) * SrcDestRatio; 18662 SDLoc DL(N); 18663 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); 18664 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, 18665 V.getOperand(0), NewIndex); 18666 return DAG.getBitcast(NVT, NewExtract); 18667 } 18668 } 18669 if ((DestNumElts % SrcNumElts) == 0) { 18670 unsigned DestSrcRatio = DestNumElts / SrcNumElts; 18671 if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) { 18672 unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio; 18673 EVT ScalarVT = SrcVT.getScalarType(); 18674 if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0) { 18675 SDLoc DL(N); 18676 unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; 18677 EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), 18678 ScalarVT, NewExtNumElts); 18679 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { 18680 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); 18681 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, 18682 V.getOperand(0), NewIndex); 18683 return DAG.getBitcast(NVT, NewExtract); 18684 } 18685 if (NewExtNumElts == 1 && 18686 TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) { 18687 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); 18688 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, 18689 V.getOperand(0), NewIndex); 18690 return DAG.getBitcast(NVT, NewExtract); 18691 } 18692 } 18693 } 18694 } 18695 } 18696 18697 if (V.getOpcode() == ISD::CONCAT_VECTORS && isa<ConstantSDNode>(Index)) { 18698 EVT ConcatSrcVT = V.getOperand(0).getValueType(); 18699 assert(ConcatSrcVT.getVectorElementType() == 
NVT.getVectorElementType() && 18700 "Concat and extract subvector do not change element type"); 18701 18702 unsigned ExtIdx = N->getConstantOperandVal(1); 18703 unsigned ExtNumElts = NVT.getVectorNumElements(); 18704 assert(ExtIdx % ExtNumElts == 0 && 18705 "Extract index is not a multiple of the input vector length."); 18706 18707 unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorNumElements(); 18708 unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts; 18709 18710 // If the concatenated source types match this extract, it's a direct 18711 // simplification: 18712 // extract_subvec (concat V1, V2, ...), i --> Vi 18713 if (ConcatSrcNumElts == ExtNumElts) 18714 return V.getOperand(ConcatOpIdx); 18715 18716 // If the concatenated source vectors are a multiple length of this extract, 18717 // then extract a fraction of one of those source vectors directly from a 18718 // concat operand. Example: 18719 // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 --> 18720 // v2i8 extract_subvec v8i8 Y, 6 18721 if (ConcatSrcNumElts % ExtNumElts == 0) { 18722 SDLoc DL(N); 18723 unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts; 18724 assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts && 18725 "Trying to extract from >1 concat operand?"); 18726 assert(NewExtIdx % ExtNumElts == 0 && 18727 "Extract index is not a multiple of the input vector length."); 18728 SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL); 18729 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, 18730 V.getOperand(ConcatOpIdx), NewIndexC); 18731 } 18732 } 18733 18734 V = peekThroughBitcasts(V); 18735 18736 // If the input is a build vector. Try to make a smaller build vector. 18737 if (V.getOpcode() == ISD::BUILD_VECTOR) { 18738 if (auto *IdxC = dyn_cast<ConstantSDNode>(Index)) { 18739 EVT InVT = V.getValueType(); 18740 unsigned ExtractSize = NVT.getSizeInBits(); 18741 unsigned EltSize = InVT.getScalarSizeInBits(); 18742 // Only do this if we won't split any elements. 
18743 if (ExtractSize % EltSize == 0) { 18744 unsigned NumElems = ExtractSize / EltSize; 18745 EVT EltVT = InVT.getVectorElementType(); 18746 EVT ExtractVT = NumElems == 1 ? EltVT 18747 : EVT::getVectorVT(*DAG.getContext(), 18748 EltVT, NumElems); 18749 if ((Level < AfterLegalizeDAG || 18750 (NumElems == 1 || 18751 TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) && 18752 (!LegalTypes || TLI.isTypeLegal(ExtractVT))) { 18753 unsigned IdxVal = IdxC->getZExtValue(); 18754 IdxVal *= NVT.getScalarSizeInBits(); 18755 IdxVal /= EltSize; 18756 18757 if (NumElems == 1) { 18758 SDValue Src = V->getOperand(IdxVal); 18759 if (EltVT != Src.getValueType()) 18760 Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src); 18761 return DAG.getBitcast(NVT, Src); 18762 } 18763 18764 // Extract the pieces from the original build_vector. 18765 SDValue BuildVec = DAG.getBuildVector( 18766 ExtractVT, SDLoc(N), V->ops().slice(IdxVal, NumElems)); 18767 return DAG.getBitcast(NVT, BuildVec); 18768 } 18769 } 18770 } 18771 } 18772 18773 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { 18774 // Handle only simple case where vector being inserted and vector 18775 // being extracted are of same size. 18776 EVT SmallVT = V.getOperand(1).getValueType(); 18777 if (!NVT.bitsEq(SmallVT)) 18778 return SDValue(); 18779 18780 // Only handle cases where both indexes are constants. 
18781 auto *ExtIdx = dyn_cast<ConstantSDNode>(Index); 18782 auto *InsIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); 18783 if (InsIdx && ExtIdx) { 18784 // Combine: 18785 // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) 18786 // Into: 18787 // indices are equal or bit offsets are equal => V1 18788 // otherwise => (extract_subvec V1, ExtIdx) 18789 if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() == 18790 ExtIdx->getZExtValue() * NVT.getScalarSizeInBits()) 18791 return DAG.getBitcast(NVT, V.getOperand(1)); 18792 return DAG.getNode( 18793 ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, 18794 DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), 18795 Index); 18796 } 18797 } 18798 18799 if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) 18800 return NarrowBOp; 18801 18802 if (SimplifyDemandedVectorElts(SDValue(N, 0))) 18803 return SDValue(N, 0); 18804 18805 return SDValue(); 18806 } 18807 18808 /// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles 18809 /// followed by concatenation. Narrow vector ops may have better performance 18810 /// than wide ops, and this can unlock further narrowing of other vector ops. 18811 /// Targets can invert this transform later if it is not profitable. 18812 static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf, 18813 SelectionDAG &DAG) { 18814 SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1); 18815 if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || 18816 N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 || 18817 !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) 18818 return SDValue(); 18819 18820 // Split the wide shuffle mask into halves. Any mask element that is accessing 18821 // operand 1 is offset down to account for narrowing of the vectors. 
18822 ArrayRef<int> Mask = Shuf->getMask(); 18823 EVT VT = Shuf->getValueType(0); 18824 unsigned NumElts = VT.getVectorNumElements(); 18825 unsigned HalfNumElts = NumElts / 2; 18826 SmallVector<int, 16> Mask0(HalfNumElts, -1); 18827 SmallVector<int, 16> Mask1(HalfNumElts, -1); 18828 for (unsigned i = 0; i != NumElts; ++i) { 18829 if (Mask[i] == -1) 18830 continue; 18831 int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts; 18832 if (i < HalfNumElts) 18833 Mask0[i] = M; 18834 else 18835 Mask1[i - HalfNumElts] = M; 18836 } 18837 18838 // Ask the target if this is a valid transform. 18839 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18840 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 18841 HalfNumElts); 18842 if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) || 18843 !TLI.isShuffleMaskLegal(Mask1, HalfVT)) 18844 return SDValue(); 18845 18846 // shuffle (concat X, undef), (concat Y, undef), Mask --> 18847 // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1) 18848 SDValue X = N0.getOperand(0), Y = N1.getOperand(0); 18849 SDLoc DL(Shuf); 18850 SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0); 18851 SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1); 18852 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1); 18853 } 18854 18855 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, 18856 // or turn a shuffle of a single concat into simpler shuffle then concat. 
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  ArrayRef<int> Mask = SVN->getMask();

  SmallVector<SDValue, 4> Ops;
  // NOTE(review): N0 is treated as a CONCAT_VECTORS here without an opcode
  // check - presumably the caller guarantees that; verify at the call site.
  EVT ConcatVT = N0.getOperand(0).getValueType();
  unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
  unsigned NumConcats = NumElts / NumElemsPerConcat;

  // Predicate for an undef (don't-care) shuffle mask element.
  auto IsUndefMaskElt = [](int i) { return i == -1; };

  // Special case: shuffle(concat(A,B)) can be more efficiently represented
  // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
  // half vector elements.
  if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
      llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat),
                   IsUndefMaskElt)) {
    N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0),
                              N0.getOperand(1),
                              Mask.slice(0, NumElemsPerConcat));
    N1 = DAG.getUNDEF(ConcatVT);
    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
  }

  // Look at every vector that's inserted. We're looking for exact
  // subvector-sized copies from a concatenated vector
  for (unsigned I = 0; I != NumConcats; ++I) {
    unsigned Begin = I * NumElemsPerConcat;
    ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat);

    // Make sure we're dealing with a copy.
    if (llvm::all_of(SubMask, IsUndefMaskElt)) {
      // Entirely-undef chunk: pass through an UNDEF concat operand.
      Ops.push_back(DAG.getUNDEF(ConcatVT));
      continue;
    }

    // Determine which single source concat operand this chunk copies from.
    // Bail out unless the chunk is an in-order, element-for-element copy of
    // exactly one concat operand (undef elements are wildcards).
    int OpIdx = -1;
    for (int i = 0; i != (int)NumElemsPerConcat; ++i) {
      if (IsUndefMaskElt(SubMask[i]))
        continue;
      if ((SubMask[i] % (int)NumElemsPerConcat) != i)
        return SDValue();
      int EltOpIdx = SubMask[i] / NumElemsPerConcat;
      if (0 <= OpIdx && EltOpIdx != OpIdx)
        return SDValue();
      OpIdx = EltOpIdx;
    }
    assert(0 <= OpIdx && "Unknown concat_vectors op");

    // Map the combined operand index back into N0's or N1's concat operands.
    if (OpIdx < (int)N0.getNumOperands())
      Ops.push_back(N0.getOperand(OpIdx));
    else
      Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands()));
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}

// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
//
// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
// a simplification in some sense, but it isn't appropriate in general: some
// BUILD_VECTORs are substantially cheaper than others. The general case
// of a BUILD_VECTOR requires inserting each element individually (or
// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
// all constants is a single constant pool load. A BUILD_VECTOR where each
// element is identical is a splat. A BUILD_VECTOR where most of the operands
// are undef lowers to a small number of element insertions.
//
// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
// We don't fold shuffles where one side is a non-zero constant, and we don't
// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
// non-constant operands. This seems to work out reasonably well in practice.
static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
                                       SelectionDAG &DAG,
                                       const TargetLowering &TLI) {
  EVT VT = SVN->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  SDValue N0 = SVN->getOperand(0);
  SDValue N1 = SVN->getOperand(1);

  // Only fold if the shuffle is the sole user of its inputs.
  if (!N0->hasOneUse())
    return SDValue();

  // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as
  // discussed above.
  if (!N1.isUndef()) {
    if (!N1->hasOneUse())
      return SDValue();

    bool N0AnyConst = isAnyConstantBuildVector(N0);
    bool N1AnyConst = isAnyConstantBuildVector(N1);
    if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
      return SDValue();
    if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
      return SDValue();
  }

  // If both inputs are splats of the same value then we can safely merge this
  // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
  bool IsSplat = false;
  auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
  auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
  if (BV0 && BV1)
    if (SDValue Splat0 = BV0->getSplatValue())
      IsSplat = (Splat0 == BV1->getSplatValue());

  // Gather one scalar operand per mask element; undef mask elements become
  // undef scalars.
  SmallVector<SDValue, 8> Ops;
  SmallSet<SDValue, 16> DuplicateOps;
  for (int M : SVN->getMask()) {
    SDValue Op = DAG.getUNDEF(VT.getScalarType());
    if (M >= 0) {
      // Mask elements >= NumElts select from the second input.
      int Idx = M < (int)NumElts ? M : M - NumElts;
      SDValue &S = (M < (int)NumElts ? N0 : N1);
      if (S.getOpcode() == ISD::BUILD_VECTOR) {
        Op = S.getOperand(Idx);
      } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        // Only lane 0 of a SCALAR_TO_VECTOR is defined.
        SDValue Op0 = S.getOperand(0);
        Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType());
      } else {
        // Operand can't be combined - bail out.
        return SDValue();
      }
    }

    // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
    // generating a splat; semantically, this is fine, but it's likely to
    // generate low-quality code if the target can't reconstruct an appropriate
    // shuffle.
    if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
      if (!IsSplat && !DuplicateOps.insert(Op).second)
        return SDValue();

    Ops.push_back(Op);
  }

  // BUILD_VECTOR requires all inputs to be of the same type, find the
  // maximum type and extend them all.
  EVT SVT = VT.getScalarType();
  if (SVT.isInteger())
    for (SDValue &Op : Ops)
      SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
  if (SVT != VT.getScalarType())
    for (SDValue &Op : Ops)
      Op = TLI.isZExtFree(Op.getValueType(), SVT)
               ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
               : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT);
  return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
}

// Match shuffles that can be converted to any_vector_extend_in_reg.
// This is often generated during legalization.
// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
                                            SelectionDAG &DAG,
                                            const TargetLowering &TLI,
                                            bool LegalOperations) {
  EVT VT = SVN->getValueType(0);
  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

  // TODO Add support for big-endian when we have a test case.
  if (!VT.isInteger() || IsBigEndian)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  ArrayRef<int> Mask = SVN->getMask();
  SDValue N0 = SVN->getOperand(0);

  // Returns true if every defined mask element i selects source element
  // i/Scale at positions that are multiples of Scale (the in-reg any-extend
  // pattern), e.g.:
  // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
  auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
    for (unsigned i = 0; i != NumElts; ++i) {
      if (Mask[i] < 0)
        continue;
      if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
        continue;
      return false;
    }
    return true;
  };

  // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
  // power-of-2 extensions as they are the most likely.
  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
    // Check for non power of 2 vector sizes
    if (NumElts % Scale != 0)
      continue;
    if (!isAnyExtend(Scale))
      continue;

    EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
    // Never create an illegal type. Only create unsupported operations if we
    // are pre-legalization.
    if (TLI.isTypeLegal(OutVT))
      if (!LegalOperations ||
          TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
        return DAG.getBitcast(VT,
                              DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG,
                                          SDLoc(SVN), OutVT, N0));
  }

  return SDValue();
}

// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
// each source element of a large type into the lowest elements of a smaller
// destination type. This is often generated during legalization.
// If the source node itself was a '*_extend_vector_inreg' node then we should
// then be able to remove it.
static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
                                        SelectionDAG &DAG) {
  EVT VT = SVN->getValueType(0);
  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

  // TODO Add support for big-endian when we have a test case.
  if (!VT.isInteger() || IsBigEndian)
    return SDValue();

  // Look through bitcasts for a '*_extend_vector_inreg' source node.
  SDValue N0 = peekThroughBitcasts(SVN->getOperand(0));

  unsigned Opcode = N0.getOpcode();
  if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
      Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
      Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  ArrayRef<int> Mask = SVN->getMask();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
  unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();

  // The extension must widen by a whole multiple of the source element size.
  if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
    return SDValue();
  unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;

  // A 'truncate_vector_inreg' shuffle takes every Scale'th element into the
  // low lanes and leaves the rest undef:
  // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
  // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
  // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
  auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
    for (unsigned i = 0; i != NumElts; ++i) {
      if (Mask[i] < 0)
        continue;
      if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
        continue;
      return false;
    }
    return true;
  };

  // At the moment we just handle the case where we've truncated back to the
  // same size as before the extension.
  // TODO: handle more extension/truncation cases as cases arise.
  if (EltSizeInBits != ExtSrcSizeInBits)
    return SDValue();

  // We can remove *extend_vector_inreg only if the truncation happens at
  // the same scale as the extension.
  if (isTruncate(ExtScale))
    return DAG.getBitcast(VT, N00);

  return SDValue();
}

// Combine shuffles of splat-shuffles of the form:
// shuffle (shuffle V, undef, splat-mask), undef, M
// If splat-mask contains undef elements, we need to be careful about
// introducing undef's in the folded mask which are not the result of composing
// the masks of the shuffles.
static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
                                        SelectionDAG &DAG) {
  // Only handle unary shuffles of an inner splat-shuffle.
  if (!Shuf->getOperand(1).isUndef())
    return SDValue();
  auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
  if (!Splat || !Splat->isSplat())
    return SDValue();

  ArrayRef<int> ShufMask = Shuf->getMask();
  ArrayRef<int> SplatMask = Splat->getMask();
  assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");

  // Prefer simplifying to the splat-shuffle, if possible. This is legal if
  // every undef mask element in the splat-shuffle has a corresponding undef
  // element in the user-shuffle's mask or if the composition of mask elements
  // would result in undef.
  // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
  // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
  //   In this case it is not legal to simplify to the splat-shuffle because we
  //   may be exposing the users of the shuffle an undef element at index 1
  //   which was not there before the combine.
  // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
  //   In this case the composition of masks yields SplatMask, so it's ok to
  //   simplify to the splat-shuffle.
  // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
  //   In this case the composed mask includes all undef elements of SplatMask
  //   and in addition sets element zero to undef. It is safe to simplify to
  //   the splat-shuffle.
  auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
                                       ArrayRef<int> SplatMask) {
    for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
      if (UserMask[i] != -1 && SplatMask[i] == -1 &&
          SplatMask[UserMask[i]] != -1)
        return false;
    return true;
  };
  if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
    return Shuf->getOperand(0);

  // Create a new shuffle with a mask that is composed of the two shuffles'
  // masks.
  SmallVector<int, 32> NewMask;
  for (int Idx : ShufMask)
    NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);

  return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
                              Splat->getOperand(0), Splat->getOperand(1),
                              NewMask);
}

/// If the shuffle mask is taking exactly one element from the first vector
/// operand and passing through all other elements from the second vector
/// operand, return the index of the mask element that is choosing an element
/// from the first operand. Otherwise, return -1.
static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
  int MaskSize = Mask.size();
  int EltFromOp0 = -1;
  // TODO: This does not match if there are undef elements in the shuffle mask.
  // Should we ignore undefs in the shuffle mask instead? The trade-off is
  // removing an instruction (a shuffle), but losing the knowledge that some
  // vector lanes are not needed.
  for (int i = 0; i != MaskSize; ++i) {
    if (Mask[i] >= 0 && Mask[i] < MaskSize) {
      // We're looking for a shuffle of exactly one element from operand 0.
      if (EltFromOp0 != -1)
        return -1;
      EltFromOp0 = i;
    } else if (Mask[i] != i + MaskSize) {
      // Nothing from operand 1 can change lanes.
      return -1;
    }
  }
  return EltFromOp0;
}

/// If a shuffle inserts exactly one element from a source vector operand into
/// another vector operand and we can access the specified element as a scalar,
/// then we can eliminate the shuffle.
static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
                                      SelectionDAG &DAG) {
  // First, check if we are taking one element of a vector and shuffling that
  // element into another vector.
  ArrayRef<int> Mask = Shuf->getMask();
  SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
  SDValue Op0 = Shuf->getOperand(0);
  SDValue Op1 = Shuf->getOperand(1);
  int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
  if (ShufOp0Index == -1) {
    // Commute mask and check again.
    ShuffleVectorSDNode::commuteMask(CommutedMask);
    ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
    if (ShufOp0Index == -1)
      return SDValue();
    // Commute operands to match the commuted shuffle mask.
    std::swap(Op0, Op1);
    Mask = CommutedMask;
  }

  // The shuffle inserts exactly one element from operand 0 into operand 1.
  // Now see if we can access that element as a scalar via a real insert element
  // instruction.
  // TODO: We can try harder to locate the element as a scalar. Examples: it
  // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
  assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
         "Shuffle mask value must be from operand 0");
  if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
    return SDValue();

  // The insert index must be constant and must equal the mask element, so the
  // insert feeds exactly the lane the shuffle selects from operand 0.
  auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
  if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
    return SDValue();

  // There's an existing insertelement with constant insertion index, so we
  // don't need to check the legality/profitability of a replacement operation
  // that differs at most in the constant value. The target should be able to
  // lower any of those in a similar way. If not, legalization will expand this
  // to a scalar-to-vector plus shuffle.
  //
  // Note that the shuffle may move the scalar from the position that the insert
  // element used. Therefore, our new insert element occurs at the shuffle's
  // mask index value, not the insert's index value.
  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
  SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
                     Op1, Op0.getOperand(1), NewInsIndex);
}

/// If we have a unary shuffle of a shuffle, see if it can be folded away
/// completely. This has the potential to lose undef knowledge because the first
/// shuffle may not have an undef mask element where the second one does. So
/// only call this after doing simplifications based on demanded elements.
19266 static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) { 19267 // shuf (shuf0 X, Y, Mask0), undef, Mask 19268 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0)); 19269 if (!Shuf0 || !Shuf->getOperand(1).isUndef()) 19270 return SDValue(); 19271 19272 ArrayRef<int> Mask = Shuf->getMask(); 19273 ArrayRef<int> Mask0 = Shuf0->getMask(); 19274 for (int i = 0, e = (int)Mask.size(); i != e; ++i) { 19275 // Ignore undef elements. 19276 if (Mask[i] == -1) 19277 continue; 19278 assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value"); 19279 19280 // Is the element of the shuffle operand chosen by this shuffle the same as 19281 // the element chosen by the shuffle operand itself? 19282 if (Mask0[Mask[i]] != Mask0[i]) 19283 return SDValue(); 19284 } 19285 // Every element of this shuffle is identical to the result of the previous 19286 // shuffle, so we can replace this value. 19287 return Shuf->getOperand(0); 19288 } 19289 19290 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { 19291 EVT VT = N->getValueType(0); 19292 unsigned NumElts = VT.getVectorNumElements(); 19293 19294 SDValue N0 = N->getOperand(0); 19295 SDValue N1 = N->getOperand(1); 19296 19297 assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG"); 19298 19299 // Canonicalize shuffle undef, undef -> undef 19300 if (N0.isUndef() && N1.isUndef()) 19301 return DAG.getUNDEF(VT); 19302 19303 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 19304 19305 // Canonicalize shuffle v, v -> v, undef 19306 if (N0 == N1) { 19307 SmallVector<int, 8> NewMask; 19308 for (unsigned i = 0; i != NumElts; ++i) { 19309 int Idx = SVN->getMaskElt(i); 19310 if (Idx >= (int)NumElts) Idx -= NumElts; 19311 NewMask.push_back(Idx); 19312 } 19313 return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask); 19314 } 19315 19316 // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. 
19317 if (N0.isUndef()) 19318 return DAG.getCommutedVectorShuffle(*SVN); 19319 19320 // Remove references to rhs if it is undef 19321 if (N1.isUndef()) { 19322 bool Changed = false; 19323 SmallVector<int, 8> NewMask; 19324 for (unsigned i = 0; i != NumElts; ++i) { 19325 int Idx = SVN->getMaskElt(i); 19326 if (Idx >= (int)NumElts) { 19327 Idx = -1; 19328 Changed = true; 19329 } 19330 NewMask.push_back(Idx); 19331 } 19332 if (Changed) 19333 return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); 19334 } 19335 19336 if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG)) 19337 return InsElt; 19338 19339 // A shuffle of a single vector that is a splatted value can always be folded. 19340 if (SDValue V = combineShuffleOfSplatVal(SVN, DAG)) 19341 return V; 19342 19343 // If it is a splat, check if the argument vector is another splat or a 19344 // build_vector. 19345 if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { 19346 int SplatIndex = SVN->getSplatIndex(); 19347 if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) && 19348 TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) { 19349 // splat (vector_bo L, R), Index --> 19350 // splat (scalar_bo (extelt L, Index), (extelt R, Index)) 19351 SDValue L = N0.getOperand(0), R = N0.getOperand(1); 19352 SDLoc DL(N); 19353 EVT EltVT = VT.getScalarType(); 19354 SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL); 19355 SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index); 19356 SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index); 19357 SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, 19358 N0.getNode()->getFlags()); 19359 SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO); 19360 SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0); 19361 return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask); 19362 } 19363 19364 // If this is a bit convert that changes the element type of the vector but 
19365 // not the number of vector elements, look through it. Be careful not to 19366 // look though conversions that change things like v4f32 to v2f64. 19367 SDNode *V = N0.getNode(); 19368 if (V->getOpcode() == ISD::BITCAST) { 19369 SDValue ConvInput = V->getOperand(0); 19370 if (ConvInput.getValueType().isVector() && 19371 ConvInput.getValueType().getVectorNumElements() == NumElts) 19372 V = ConvInput.getNode(); 19373 } 19374 19375 if (V->getOpcode() == ISD::BUILD_VECTOR) { 19376 assert(V->getNumOperands() == NumElts && 19377 "BUILD_VECTOR has wrong number of operands"); 19378 SDValue Base; 19379 bool AllSame = true; 19380 for (unsigned i = 0; i != NumElts; ++i) { 19381 if (!V->getOperand(i).isUndef()) { 19382 Base = V->getOperand(i); 19383 break; 19384 } 19385 } 19386 // Splat of <u, u, u, u>, return <u, u, u, u> 19387 if (!Base.getNode()) 19388 return N0; 19389 for (unsigned i = 0; i != NumElts; ++i) { 19390 if (V->getOperand(i) != Base) { 19391 AllSame = false; 19392 break; 19393 } 19394 } 19395 // Splat of <x, x, x, x>, return <x, x, x, x> 19396 if (AllSame) 19397 return N0; 19398 19399 // Canonicalize any other splat as a build_vector. 19400 SDValue Splatted = V->getOperand(SplatIndex); 19401 SmallVector<SDValue, 8> Ops(NumElts, Splatted); 19402 SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops); 19403 19404 // We may have jumped through bitcasts, so the type of the 19405 // BUILD_VECTOR may not match the type of the shuffle. 19406 if (V->getValueType(0) != VT) 19407 NewBV = DAG.getBitcast(VT, NewBV); 19408 return NewBV; 19409 } 19410 } 19411 19412 // Simplify source operands based on shuffle mask. 19413 if (SimplifyDemandedVectorElts(SDValue(N, 0))) 19414 return SDValue(N, 0); 19415 19416 // This is intentionally placed after demanded elements simplification because 19417 // it could eliminate knowledge of undef elements created by this shuffle. 
19418 if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN)) 19419 return ShufOp; 19420 19421 // Match shuffles that can be converted to any_vector_extend_in_reg. 19422 if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) 19423 return V; 19424 19425 // Combine "truncate_vector_in_reg" style shuffles. 19426 if (SDValue V = combineTruncationShuffle(SVN, DAG)) 19427 return V; 19428 19429 if (N0.getOpcode() == ISD::CONCAT_VECTORS && 19430 Level < AfterLegalizeVectorOps && 19431 (N1.isUndef() || 19432 (N1.getOpcode() == ISD::CONCAT_VECTORS && 19433 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) { 19434 if (SDValue V = partitionShuffleOfConcats(N, DAG)) 19435 return V; 19436 } 19437 19438 // A shuffle of a concat of the same narrow vector can be reduced to use 19439 // only low-half elements of a concat with undef: 19440 // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask' 19441 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() && 19442 N0.getNumOperands() == 2 && 19443 N0.getOperand(0) == N0.getOperand(1)) { 19444 int HalfNumElts = (int)NumElts / 2; 19445 SmallVector<int, 8> NewMask; 19446 for (unsigned i = 0; i != NumElts; ++i) { 19447 int Idx = SVN->getMaskElt(i); 19448 if (Idx >= HalfNumElts) { 19449 assert(Idx < (int)NumElts && "Shuffle mask chooses undef op"); 19450 Idx -= HalfNumElts; 19451 } 19452 NewMask.push_back(Idx); 19453 } 19454 if (TLI.isShuffleMaskLegal(NewMask, VT)) { 19455 SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType()); 19456 SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 19457 N0.getOperand(0), UndefVec); 19458 return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask); 19459 } 19460 } 19461 19462 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - 19463 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. 
19464 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) 19465 if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI)) 19466 return Res; 19467 19468 // If this shuffle only has a single input that is a bitcasted shuffle, 19469 // attempt to merge the 2 shuffles and suitably bitcast the inputs/output 19470 // back to their original types. 19471 if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && 19472 N1.isUndef() && Level < AfterLegalizeVectorOps && 19473 TLI.isTypeLegal(VT)) { 19474 auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) { 19475 if (Scale == 1) 19476 return SmallVector<int, 8>(Mask.begin(), Mask.end()); 19477 19478 SmallVector<int, 8> NewMask; 19479 for (int M : Mask) 19480 for (int s = 0; s != Scale; ++s) 19481 NewMask.push_back(M < 0 ? -1 : Scale * M + s); 19482 return NewMask; 19483 }; 19484 19485 SDValue BC0 = peekThroughOneUseBitcasts(N0); 19486 if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) { 19487 EVT SVT = VT.getScalarType(); 19488 EVT InnerVT = BC0->getValueType(0); 19489 EVT InnerSVT = InnerVT.getScalarType(); 19490 19491 // Determine which shuffle works with the smaller scalar type. 19492 EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT; 19493 EVT ScaleSVT = ScaleVT.getScalarType(); 19494 19495 if (TLI.isTypeLegal(ScaleVT) && 19496 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) && 19497 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) { 19498 int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits(); 19499 int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits(); 19500 19501 // Scale the shuffle masks to the smaller scalar type. 19502 ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0); 19503 SmallVector<int, 8> InnerMask = 19504 ScaleShuffleMask(InnerSVN->getMask(), InnerScale); 19505 SmallVector<int, 8> OuterMask = 19506 ScaleShuffleMask(SVN->getMask(), OuterScale); 19507 19508 // Merge the shuffle masks. 
19509 SmallVector<int, 8> NewMask; 19510 for (int M : OuterMask) 19511 NewMask.push_back(M < 0 ? -1 : InnerMask[M]); 19512 19513 // Test for shuffle mask legality over both commutations. 19514 SDValue SV0 = BC0->getOperand(0); 19515 SDValue SV1 = BC0->getOperand(1); 19516 bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); 19517 if (!LegalMask) { 19518 std::swap(SV0, SV1); 19519 ShuffleVectorSDNode::commuteMask(NewMask); 19520 LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); 19521 } 19522 19523 if (LegalMask) { 19524 SV0 = DAG.getBitcast(ScaleVT, SV0); 19525 SV1 = DAG.getBitcast(ScaleVT, SV1); 19526 return DAG.getBitcast( 19527 VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask)); 19528 } 19529 } 19530 } 19531 } 19532 19533 // Canonicalize shuffles according to rules: 19534 // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) 19535 // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) 19536 // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 19537 if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && 19538 N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && 19539 TLI.isTypeLegal(VT)) { 19540 // The incoming shuffle must be of the same type as the result of the 19541 // current shuffle. 19542 assert(N1->getOperand(0).getValueType() == VT && 19543 "Shuffle types don't match"); 19544 19545 SDValue SV0 = N1->getOperand(0); 19546 SDValue SV1 = N1->getOperand(1); 19547 bool HasSameOp0 = N0 == SV0; 19548 bool IsSV1Undef = SV1.isUndef(); 19549 if (HasSameOp0 || IsSV1Undef || N0 == SV1) 19550 // Commute the operands of this shuffle so that next rule 19551 // will trigger. 19552 return DAG.getCommutedVectorShuffle(*SVN); 19553 } 19554 19555 // Try to fold according to rules: 19556 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) 19557 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) 19558 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) 19559 // Don't try to fold shuffles with illegal type. 
19560 // Only fold if this shuffle is the only user of the other shuffle. 19561 if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) && 19562 Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { 19563 ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); 19564 19565 // Don't try to fold splats; they're likely to simplify somehow, or they 19566 // might be free. 19567 if (OtherSV->isSplat()) 19568 return SDValue(); 19569 19570 // The incoming shuffle must be of the same type as the result of the 19571 // current shuffle. 19572 assert(OtherSV->getOperand(0).getValueType() == VT && 19573 "Shuffle types don't match"); 19574 19575 SDValue SV0, SV1; 19576 SmallVector<int, 4> Mask; 19577 // Compute the combined shuffle mask for a shuffle with SV0 as the first 19578 // operand, and SV1 as the second operand. 19579 for (unsigned i = 0; i != NumElts; ++i) { 19580 int Idx = SVN->getMaskElt(i); 19581 if (Idx < 0) { 19582 // Propagate Undef. 19583 Mask.push_back(Idx); 19584 continue; 19585 } 19586 19587 SDValue CurrentVec; 19588 if (Idx < (int)NumElts) { 19589 // This shuffle index refers to the inner shuffle N0. Lookup the inner 19590 // shuffle mask to identify which vector is actually referenced. 19591 Idx = OtherSV->getMaskElt(Idx); 19592 if (Idx < 0) { 19593 // Propagate Undef. 19594 Mask.push_back(Idx); 19595 continue; 19596 } 19597 19598 CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0) 19599 : OtherSV->getOperand(1); 19600 } else { 19601 // This shuffle index references an element within N1. 19602 CurrentVec = N1; 19603 } 19604 19605 // Simple case where 'CurrentVec' is UNDEF. 19606 if (CurrentVec.isUndef()) { 19607 Mask.push_back(-1); 19608 continue; 19609 } 19610 19611 // Canonicalize the shuffle index. We don't know yet if CurrentVec 19612 // will be the first or second operand of the combined shuffle. 19613 Idx = Idx % NumElts; 19614 if (!SV0.getNode() || SV0 == CurrentVec) { 19615 // Ok. CurrentVec is the left hand side. 
19616 // Update the mask accordingly. 19617 SV0 = CurrentVec; 19618 Mask.push_back(Idx); 19619 continue; 19620 } 19621 19622 // Bail out if we cannot convert the shuffle pair into a single shuffle. 19623 if (SV1.getNode() && SV1 != CurrentVec) 19624 return SDValue(); 19625 19626 // Ok. CurrentVec is the right hand side. 19627 // Update the mask accordingly. 19628 SV1 = CurrentVec; 19629 Mask.push_back(Idx + NumElts); 19630 } 19631 19632 // Check if all indices in Mask are Undef. In case, propagate Undef. 19633 bool isUndefMask = true; 19634 for (unsigned i = 0; i != NumElts && isUndefMask; ++i) 19635 isUndefMask &= Mask[i] < 0; 19636 19637 if (isUndefMask) 19638 return DAG.getUNDEF(VT); 19639 19640 if (!SV0.getNode()) 19641 SV0 = DAG.getUNDEF(VT); 19642 if (!SV1.getNode()) 19643 SV1 = DAG.getUNDEF(VT); 19644 19645 // Avoid introducing shuffles with illegal mask. 19646 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) 19647 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) 19648 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) 19649 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) 19650 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) 19651 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) 19652 return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG); 19653 } 19654 19655 if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG)) 19656 return V; 19657 19658 return SDValue(); 19659 } 19660 19661 SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) { 19662 SDValue InVal = N->getOperand(0); 19663 EVT VT = N->getValueType(0); 19664 19665 // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern 19666 // with a VECTOR_SHUFFLE and possible truncate. 
19667 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 19668 SDValue InVec = InVal->getOperand(0); 19669 SDValue EltNo = InVal->getOperand(1); 19670 auto InVecT = InVec.getValueType(); 19671 if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) { 19672 SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1); 19673 int Elt = C0->getZExtValue(); 19674 NewMask[0] = Elt; 19675 // If we have an implict truncate do truncate here as long as it's legal. 19676 // if it's not legal, this should 19677 if (VT.getScalarType() != InVal.getValueType() && 19678 InVal.getValueType().isScalarInteger() && 19679 isTypeLegal(VT.getScalarType())) { 19680 SDValue Val = 19681 DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal); 19682 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val); 19683 } 19684 if (VT.getScalarType() == InVecT.getScalarType() && 19685 VT.getVectorNumElements() <= InVecT.getVectorNumElements()) { 19686 SDValue LegalShuffle = 19687 TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec, 19688 DAG.getUNDEF(InVecT), NewMask, DAG); 19689 if (LegalShuffle) { 19690 // If the initial vector is the correct size this shuffle is a 19691 // valid result. 19692 if (VT == InVecT) 19693 return LegalShuffle; 19694 // If not we must truncate the vector. 
19695 if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) { 19696 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N)); 19697 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), 19698 InVecT.getVectorElementType(), 19699 VT.getVectorNumElements()); 19700 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, 19701 LegalShuffle, ZeroIdx); 19702 } 19703 } 19704 } 19705 } 19706 } 19707 19708 return SDValue(); 19709 } 19710 19711 SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { 19712 EVT VT = N->getValueType(0); 19713 SDValue N0 = N->getOperand(0); 19714 SDValue N1 = N->getOperand(1); 19715 SDValue N2 = N->getOperand(2); 19716 19717 // If inserting an UNDEF, just return the original vector. 19718 if (N1.isUndef()) 19719 return N0; 19720 19721 // If this is an insert of an extracted vector into an undef vector, we can 19722 // just use the input to the extract. 19723 if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && 19724 N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) 19725 return N1.getOperand(0); 19726 19727 // If we are inserting a bitcast value into an undef, with the same 19728 // number of elements, just use the bitcast input of the extract. 19729 // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 -> 19730 // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2) 19731 if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST && 19732 N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && 19733 N1.getOperand(0).getOperand(1) == N2 && 19734 N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() == 19735 VT.getVectorNumElements() && 19736 N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() == 19737 VT.getSizeInBits()) { 19738 return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); 19739 } 19740 19741 // If both N1 and N2 are bitcast values on which insert_subvector 19742 // would makes sense, pull the bitcast through. 19743 // i.e. 
INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 -> 19744 // BITCAST (INSERT_SUBVECTOR N0 N1 N2) 19745 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) { 19746 SDValue CN0 = N0.getOperand(0); 19747 SDValue CN1 = N1.getOperand(0); 19748 EVT CN0VT = CN0.getValueType(); 19749 EVT CN1VT = CN1.getValueType(); 19750 if (CN0VT.isVector() && CN1VT.isVector() && 19751 CN0VT.getVectorElementType() == CN1VT.getVectorElementType() && 19752 CN0VT.getVectorNumElements() == VT.getVectorNumElements()) { 19753 SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), 19754 CN0.getValueType(), CN0, CN1, N2); 19755 return DAG.getBitcast(VT, NewINSERT); 19756 } 19757 } 19758 19759 // Combine INSERT_SUBVECTORs where we are inserting to the same index. 19760 // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx ) 19761 // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) 19762 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && 19763 N0.getOperand(1).getValueType() == N1.getValueType() && 19764 N0.getOperand(2) == N2) 19765 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), 19766 N1, N2); 19767 19768 // Eliminate an intermediate insert into an undef vector: 19769 // insert_subvector undef, (insert_subvector undef, X, 0), N2 --> 19770 // insert_subvector undef, X, N2 19771 if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR && 19772 N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2))) 19773 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0, 19774 N1.getOperand(1), N2); 19775 19776 if (!isa<ConstantSDNode>(N2)) 19777 return SDValue(); 19778 19779 uint64_t InsIdx = cast<ConstantSDNode>(N2)->getZExtValue(); 19780 19781 // Push subvector bitcasts to the output, adjusting the index as we go. 
19782 // insert_subvector(bitcast(v), bitcast(s), c1) 19783 // -> bitcast(insert_subvector(v, s, c2)) 19784 if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) && 19785 N1.getOpcode() == ISD::BITCAST) { 19786 SDValue N0Src = peekThroughBitcasts(N0); 19787 SDValue N1Src = peekThroughBitcasts(N1); 19788 EVT N0SrcSVT = N0Src.getValueType().getScalarType(); 19789 EVT N1SrcSVT = N1Src.getValueType().getScalarType(); 19790 if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) && 19791 N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) { 19792 EVT NewVT; 19793 SDLoc DL(N); 19794 SDValue NewIdx; 19795 LLVMContext &Ctx = *DAG.getContext(); 19796 unsigned NumElts = VT.getVectorNumElements(); 19797 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 19798 if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) { 19799 unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits(); 19800 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale); 19801 NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL); 19802 } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) { 19803 unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits; 19804 if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) { 19805 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale); 19806 NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL); 19807 } 19808 } 19809 if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) { 19810 SDValue Res = DAG.getBitcast(NewVT, N0Src); 19811 Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx); 19812 return DAG.getBitcast(VT, Res); 19813 } 19814 } 19815 } 19816 19817 // Canonicalize insert_subvector dag nodes. 
19818 // Example: 19819 // (insert_subvector (insert_subvector A, Idx0), Idx1) 19820 // -> (insert_subvector (insert_subvector A, Idx1), Idx0) 19821 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && 19822 N1.getValueType() == N0.getOperand(1).getValueType() && 19823 isa<ConstantSDNode>(N0.getOperand(2))) { 19824 unsigned OtherIdx = N0.getConstantOperandVal(2); 19825 if (InsIdx < OtherIdx) { 19826 // Swap nodes. 19827 SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, 19828 N0.getOperand(0), N1, N2); 19829 AddToWorklist(NewOp.getNode()); 19830 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), 19831 VT, NewOp, N0.getOperand(1), N0.getOperand(2)); 19832 } 19833 } 19834 19835 // If the input vector is a concatenation, and the insert replaces 19836 // one of the pieces, we can optimize into a single concat_vectors. 19837 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && 19838 N0.getOperand(0).getValueType() == N1.getValueType()) { 19839 unsigned Factor = N1.getValueType().getVectorNumElements(); 19840 19841 SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); 19842 Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1; 19843 19844 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); 19845 } 19846 19847 // Simplify source operands based on insertion. 
  // Simplify the inserted-into and inserted operands via demanded elements.
  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
  SDValue N0 = N->getOperand(0);

  // fold (fp_to_fp16 (fp16_to_fp op)) -> op
  if (N0->getOpcode() == ISD::FP16_TO_FP)
    return N0->getOperand(0);

  return SDValue();
}

SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
  SDValue N0 = N->getOperand(0);

  // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
  // The mask is redundant: fp16_to_fp only reads the low 16 bits anyway.
  if (N0->getOpcode() == ISD::AND) {
    ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
    if (AndConst && AndConst->getAPIntValue() == 0xffff) {
      return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
                         N0.getOperand(0));
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N0.getValueType();
  unsigned Opcode = N->getOpcode();

  // VECREDUCE over 1-element vector is just an extract.
  if (VT.getVectorNumElements() == 1) {
    SDLoc dl(N);
    SDValue Res =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
                    DAG.getVectorIdxConstant(0, dl));
    // The reduction may produce a wider scalar than the vector element type;
    // extend the extracted element to match in that case.
    if (Res.getValueType() != N->getValueType(0))
      Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
    return Res;
  }

  // On a boolean vector an and/or reduction is the same as a umin/umax
  // reduction. Convert them if the latter is legal while the former isn't.
  if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
    unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
        ? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX;
    // ComputeNumSignBits == scalar width proves every lane is either all-zeros
    // or all-ones, i.e. a boolean in every bit position.
    if (!TLI.isOperationLegalOrCustom(Opcode, VT) &&
        TLI.isOperationLegalOrCustom(NewOpcode, VT) &&
        DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits())
      return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
  }

  return SDValue();
}

/// Returns a vector_shuffle if it is able to transform an AND to a
/// vector_shuffle with the destination vector and a zero vector.
/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
///      vector_shuffle V, Zero, <0, 4, 2, 4>
SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = peekThroughBitcasts(N->getOperand(1));
  SDLoc DL(N);

  // Make sure we're not running after operation legalization where it
  // may have custom lowered the vector shuffles.
  if (LegalOperations)
    return SDValue();

  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  EVT RVT = RHS.getValueType();
  unsigned NumElts = RHS.getNumOperands();

  // Attempt to create a valid clear mask, splitting the mask into
  // sub elements and checking to see if each is
  // all zeros or all ones - suitable for shuffle masking.
  auto BuildClearMask = [&](int Split) {
    int NumSubElts = NumElts * Split;
    int NumSubBits = RVT.getScalarSizeInBits() / Split;

    // Shuffle indices: i selects lane i of LHS, i + NumSubElts selects lane i
    // of the zero vector.
    SmallVector<int, 8> Indices;
    for (int i = 0; i != NumSubElts; ++i) {
      int EltIdx = i / Split;
      int SubIdx = i % Split;
      SDValue Elt = RHS.getOperand(EltIdx);
      // X & undef --> 0 (not undef). So this lane must be converted to choose
      // from the zero constant vector (same as if the element had all 0-bits).
      if (Elt.isUndef()) {
        Indices.push_back(i + NumSubElts);
        continue;
      }

      APInt Bits;
      if (isa<ConstantSDNode>(Elt))
        Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
      else if (isa<ConstantFPSDNode>(Elt))
        Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
      else
        return SDValue();

      // Extract the sub element from the constant bit mask.
      if (DAG.getDataLayout().isBigEndian())
        Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits);
      else
        Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);

      if (Bits.isAllOnesValue())
        Indices.push_back(i);
      else if (Bits == 0)
        Indices.push_back(i + NumSubElts);
      else
        return SDValue();
    }

    // Let's see if the target supports this vector_shuffle.
    EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
    EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
    if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
      return SDValue();

    SDValue Zero = DAG.getConstant(0, DL, ClearVT);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL,
                                                   DAG.getBitcast(ClearVT, LHS),
                                                   Zero, Indices));
  };

  // Determine maximum split level (byte level masking).
  int MaxSplit = 1;
  if (RVT.getScalarSizeInBits() % 8 == 0)
    MaxSplit = RVT.getScalarSizeInBits() / 8;

  // Try the coarsest split first; finer splits only if they divide evenly.
  for (int Split = 1; Split <= MaxSplit; ++Split)
    if (RVT.getScalarSizeInBits() % Split == 0)
      if (SDValue S = BuildClearMask(Split))
        return S;

  return SDValue();
}

/// If a vector binop is performed on splat values, it may be profitable to
/// extract, scalarize, and insert/splat.
static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N->getOpcode();
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // TODO: Remove/replace the extract cost check? If the elements are available
  //       as scalars, then there may be no extract cost. Should we ask if
  //       inserting a scalar back into a vector is cheap instead?
  int Index0, Index1;
  SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
  SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
  // Both operands must splat from the same lane index, with matching element
  // types, and the scalar op plus the extract must be worthwhile.
  if (!Src0 || !Src1 || Index0 != Index1 ||
      Src0.getValueType().getVectorElementType() != EltVT ||
      Src1.getValueType().getVectorElementType() != EltVT ||
      !TLI.isExtractVecEltCheap(VT, Index0) ||
      !TLI.isOperationLegalOrCustom(Opcode, EltVT))
    return SDValue();

  SDLoc DL(N);
  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N0, IndexC);
  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N1, IndexC);
  SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());

  // If all lanes but 1 are undefined, no need to splat the scalar result.
  // TODO: Keep track of undefs and use that info in the general case.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
      count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
      count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
    // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
    // build_vec ..undef, (bo X, Y), undef...
    // Index0 == Index1 (checked above), so this is the single defined lane of
    // both operands.
    SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
    Ops[Index0] = ScalarBO;
    return DAG.getBuildVector(VT, DL, Ops);
  }

  // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
  SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
  return DAG.getBuildVector(VT, DL, Ops);
}

/// Visit a binary vector operation, like ADD.
SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
  assert(N->getValueType(0).isVector() &&
         "SimplifyVBinOp only works on vectors!");

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Ops[] = {LHS, RHS};
  EVT VT = N->getValueType(0);
  unsigned Opcode = N->getOpcode();

  // See if we can constant fold the vector operation.
  if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
          Opcode, SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags()))
    return Fold;

  // Move unary shuffles with identical masks after a vector binop:
  // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask))
  //   --> shuffle (VBinOp A, B), Undef, Mask
  // This does not require type legality checks because we are creating the
  // same types of operations that are in the original sequence. We do have to
  // restrict ops like integer div that have immediate UB (eg, div-by-zero)
  // though. This code is adapted from the identical transform in instcombine.
  if (Opcode != ISD::UDIV && Opcode != ISD::SDIV &&
      Opcode != ISD::UREM && Opcode != ISD::SREM &&
      Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) {
    auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
    auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
    if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
        LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
        (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
      SDLoc DL(N);
      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
                                     RHS.getOperand(0), N->getFlags());
      SDValue UndefV = LHS.getOperand(1);
      return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
    }
  }

  // The following pattern is likely to emerge with vector reduction ops. Moving
  // the binary operation ahead of insertion may allow using a narrower vector
  // instruction that has better performance than the wide version of the op:
  // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
  if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
      RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
      LHS.getOperand(2) == RHS.getOperand(2) &&
      (LHS.hasOneUse() || RHS.hasOneUse())) {
    SDValue X = LHS.getOperand(1);
    SDValue Y = RHS.getOperand(1);
    SDValue Z = LHS.getOperand(2);
    EVT NarrowVT = X.getValueType();
    if (NarrowVT == Y.getValueType() &&
        TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
      // (binop undef, undef) may not return undef, so compute that result.
      SDLoc DL(N);
      SDValue VecC =
          DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
      SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
    }
  }

  // Make sure all but the first op are undef or constant.
  auto ConcatWithConstantOrUndef = [](SDValue Concat) {
    return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
           std::all_of(std::next(Concat->op_begin()), Concat->op_end(),
                       [](const SDValue &Op) {
                         return Op.isUndef() ||
                                ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
                       });
  };

  // The following pattern is likely to emerge with vector reduction ops. Moving
  // the binary operation ahead of the concat may allow using a narrower vector
  // instruction that has better performance than the wide version of the op:
  // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
  //   concat (VBinOp X, Y), VecC
  if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
      (LHS.hasOneUse() || RHS.hasOneUse())) {
    EVT NarrowVT = LHS.getOperand(0).getValueType();
    if (NarrowVT == RHS.getOperand(0).getValueType() &&
        TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
      SDLoc DL(N);
      unsigned NumOperands = LHS.getNumOperands();
      SmallVector<SDValue, 4> ConcatOps;
      for (unsigned i = 0; i != NumOperands; ++i) {
        // This will constant fold for operands 1 and up, since those are
        // undef/constant on both sides (checked above).
        ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
                                        RHS.getOperand(i)));
      }

      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
    }
  }

  if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
    return V;

  return SDValue();
}

SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2) {
  assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");

  SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
                                 cast<CondCodeSDNode>(N0.getOperand(2))->get());

  // If we got a simplified select_cc node back from SimplifySelectCC, then
  // break it down into a new SETCC node, and a new SELECT node, and then return
  // the SELECT node, since we were called with a SELECT node.
  if (SCC.getNode()) {
    // Check to see if we got a select_cc back (to turn into setcc/select).
    // Otherwise, just return whatever node we got back, like fabs.
    if (SCC.getOpcode() == ISD::SELECT_CC) {
      const SDNodeFlags Flags = N0.getNode()->getFlags();
      SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
                                  N0.getValueType(),
                                  SCC.getOperand(0), SCC.getOperand(1),
                                  SCC.getOperand(4), Flags);
      AddToWorklist(SETCC.getNode());
      SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
                                         SCC.getOperand(2), SCC.getOperand(3));
      SelectNode->setFlags(Flags);
      return SelectNode;
    }

    return SCC;
  }
  return SDValue();
}

/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
/// being selected between, see if we can simplify the select. Callers of this
/// should assume that TheSelect is deleted if this returns true. As such, they
/// should return the appropriate thing (e.g.
/// the node) back to the top-level of
/// the DAG combiner loop to avoid it being looked at.
bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
                                    SDValue RHS) {
  // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
  // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
  if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
    if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
      // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
      SDValue Sqrt = RHS;
      ISD::CondCode CC;
      SDValue CmpLHS;
      const ConstantFPSDNode *Zero = nullptr;

      // Pull the compare operands out of either a SELECT_CC or a
      // SELECT/VSELECT-of-SETCC.
      if (TheSelect->getOpcode() == ISD::SELECT_CC) {
        CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
        CmpLHS = TheSelect->getOperand(0);
        Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
      } else {
        // SELECT or VSELECT
        SDValue Cmp = TheSelect->getOperand(0);
        if (Cmp.getOpcode() == ISD::SETCC) {
          CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
          CmpLHS = Cmp.getOperand(0);
          Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
        }
      }
      if (Zero && Zero->isZero() &&
          Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
          CC == ISD::SETULT || CC == ISD::SETLT)) {
        // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
        CombineTo(TheSelect, Sqrt);
        return true;
      }
    }
  }
  // Cannot simplify select with vector condition
  if (TheSelect->getOperand(0).getValueType().isVector()) return false;

  // If this is a select from two identical things, try to pull the operation
  // through the select.
  if (LHS.getOpcode() != RHS.getOpcode() ||
      !LHS.hasOneUse() || !RHS.hasOneUse())
    return false;

  // If this is a load and the token chain is identical, replace the select
  // of two loads with a load through a select of the address to load from.
  // This triggers in things like "select bool X, 10.0, 123.0" after the FP
  // constants have been dropped into the constant pool.
  if (LHS.getOpcode() == ISD::LOAD) {
    LoadSDNode *LLD = cast<LoadSDNode>(LHS);
    LoadSDNode *RLD = cast<LoadSDNode>(RHS);

    // Token chains must be identical.
    if (LHS.getOperand(0) != RHS.getOperand(0) ||
        // Do not let this transformation reduce the number of volatile loads.
        // Be conservative for atomics for the moment
        // TODO: This does appear to be legal for unordered atomics (see D66309)
        !LLD->isSimple() || !RLD->isSimple() ||
        // FIXME: If either is a pre/post inc/dec load,
        // we'd need to split out the address adjustment.
        LLD->isIndexed() || RLD->isIndexed() ||
        // If this is an EXTLOAD, the VT's must match.
        LLD->getMemoryVT() != RLD->getMemoryVT() ||
        // If this is an EXTLOAD, the kind of extension must match.
        (LLD->getExtensionType() != RLD->getExtensionType() &&
         // The only exception is if one of the extensions is anyext.
         LLD->getExtensionType() != ISD::EXTLOAD &&
         RLD->getExtensionType() != ISD::EXTLOAD) ||
        // FIXME: this discards src value information. This is
        // over-conservative. It would be beneficial to be able to remember
        // both potential memory locations. Since we are discarding
        // src value info, don't do the transformation if the memory
        // locations are not in the default address space.
        LLD->getPointerInfo().getAddrSpace() != 0 ||
        RLD->getPointerInfo().getAddrSpace() != 0 ||
        // We can't produce a CMOV of a TargetFrameIndex since we won't
        // generate the address generation required.
        LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
        RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
        !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
                                      LLD->getBasePtr().getValueType()))
      return false;

    // The loads must not depend on one another.
    if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD))
      return false;

    // Check that the select condition doesn't reach either load. If so,
    // folding this will induce a cycle into the DAG. If not, this is safe to
    // xform, so create a select of the addresses.

    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;

    // Always fail if LLD and RLD are not independent. TheSelect is a
    // predecessor to all Nodes in question so we need not search past it.

    Visited.insert(TheSelect);
    Worklist.push_back(LLD);
    Worklist.push_back(RLD);

    if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(RLD, Visited, Worklist))
      return false;

    SDValue Addr;
    if (TheSelect->getOpcode() == ISD::SELECT) {
      // We cannot do this optimization if any pair of {RLD, LLD} is a
      // predecessor to {RLD, LLD, CondNode}. As we've already compared the
      // Loads, we only need to check if CondNode is a successor to one of the
      // loads. We can further avoid this if there's no use of their chain
      // value.
      SDNode *CondNode = TheSelect->getOperand(0).getNode();
      Worklist.push_back(CondNode);

      if ((LLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
          (RLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
        return false;

      Addr = DAG.getSelect(SDLoc(TheSelect),
                           LLD->getBasePtr().getValueType(),
                           TheSelect->getOperand(0), LLD->getBasePtr(),
                           RLD->getBasePtr());
    } else {  // Otherwise SELECT_CC
      // We cannot do this optimization if any pair of {RLD, LLD} is a
      // predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared
      // the Loads, we only need to check if CondLHS/CondRHS is a successor to
      // one of the loads. We can further avoid this if there's no use of their
      // chain value.

      SDNode *CondLHS = TheSelect->getOperand(0).getNode();
      SDNode *CondRHS = TheSelect->getOperand(1).getNode();
      Worklist.push_back(CondLHS);
      Worklist.push_back(CondRHS);

      if ((LLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
          (RLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
        return false;

      Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
                         LLD->getBasePtr().getValueType(),
                         TheSelect->getOperand(0),
                         TheSelect->getOperand(1),
                         LLD->getBasePtr(), RLD->getBasePtr(),
                         TheSelect->getOperand(4));
    }

    SDValue Load;
    // It is safe to replace the two loads if they have different alignments,
    // but the new load must be the minimum (most restrictive) alignment of the
    // inputs.
    unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
    MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
    // Flags like invariant/dereferenceable only survive if BOTH inputs had
    // them; start from LLD's flags and strip those RLD lacks.
    if (!RLD->isInvariant())
      MMOFlags &= ~MachineMemOperand::MOInvariant;
    if (!RLD->isDereferenceable())
      MMOFlags &= ~MachineMemOperand::MODereferenceable;
    if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
      // FIXME: Discards pointer and AA info.
      Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
                         LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
                         MMOFlags);
    } else {
      // FIXME: Discards pointer and AA info.
      Load = DAG.getExtLoad(
          LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
                                                  : LLD->getExtensionType(),
          SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
          MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
    }

    // Users of the select now use the result of the load.
    CombineTo(TheSelect, Load);

    // Users of the old loads now use the new load's chain. We know the
    // old-load value is dead now.
    CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
    CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
    return true;
  }

  return false;
}

/// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and
/// bitwise 'and'.
SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
                                            SDValue N1, SDValue N2, SDValue N3,
                                            ISD::CondCode CC) {
  // If this is a select where the false operand is zero and the compare is a
  // check of the sign bit, see if we can perform the "gzip trick":
  // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
  // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
  EVT XType = N0.getValueType();
  EVT AType = N2.getValueType();
  if (!isNullConstant(N3) || !XType.bitsGE(AType))
    return SDValue();

  // If the comparison is testing for a positive value, we have to invert
  // the sign bit mask, so only do that transform if the target has a bitwise
  // 'and not' instruction (the invert is free).
  if (CC == ISD::SETGT && TLI.hasAndNot(N2)) {
    // (X > -1) ? A : 0
    // (X > 0) ? X : 0 <-- This is canonical signed max.
    if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2)))
      return SDValue();
  } else if (CC == ISD::SETLT) {
    // (X < 0) ? A : 0
    // (X < 1) ? X : 0 <-- This is un-canonicalized signed min.
    if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2)))
      return SDValue();
  } else {
    return SDValue();
  }

  // Fast path first: when A is a single-bit (power-of-2) constant, we can use
  // a logical shift right instead of materializing the full sign mask:
  // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
  // constant.
  EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
  auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
  if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
    unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
    if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) {
      SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
      SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt);
      AddToWorklist(Shift.getNode());

      if (XType.bitsGT(AType)) {
        Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
        AddToWorklist(Shift.getNode());
      }

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(DL, Shift, AType);

      return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
    }
  }

  // General case: broadcast the sign bit across the whole value with an
  // arithmetic shift right, then mask with A.
  unsigned ShCt = XType.getSizeInBits() - 1;
  if (TLI.shouldAvoidTransformToShift(XType, ShCt))
    return SDValue();

  SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
  SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt);
  AddToWorklist(Shift.getNode());

  if (XType.bitsGT(AType)) {
    Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
    AddToWorklist(Shift.getNode());
  }

  if (CC == ISD::SETGT)
    Shift = DAG.getNOT(DL, Shift, AType);

  return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
}

/// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
/// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
/// in it. This may be a win when the constant is not otherwise available
/// because it replaces two constant pool loads with one.
SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
    const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
    ISD::CondCode CC) {
  if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
    return SDValue();

  // If we are before legalize types, we want the other legalization to happen
  // first (for example, to avoid messing with soft float).
  auto *TV = dyn_cast<ConstantFPSDNode>(N2);
  auto *FV = dyn_cast<ConstantFPSDNode>(N3);
  EVT VT = N2.getValueType();
  if (!TV || !FV || !TLI.isTypeLegal(VT))
    return SDValue();

  // If a constant can be materialized without loads, this does not make sense.
  if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
      TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) ||
      TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize))
    return SDValue();

  // If both constants have multiple uses, then we won't need to do an extra
  // load. The values are likely around in registers for other users.
  if (!TV->hasOneUse() && !FV->hasOneUse())
    return SDValue();

  // Element 0 holds the false value, element 1 the true value, so the select
  // below can compute the byte offset directly from the condition.
  Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
                       const_cast<ConstantFP*>(TV->getConstantFPValue()) };
  Type *FPTy = Elts[0]->getType();
  const DataLayout &TD = DAG.getDataLayout();

  // Create a ConstantArray of the two constants.
  Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
  SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
                                      TD.getPrefTypeAlignment(FPTy));
  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();

  // Get offsets to the 0 and 1 elements of the array, so we can select between
  // them.
  SDValue Zero = DAG.getIntPtrConstant(0, DL);
  unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
  SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
  SDValue Cond =
      DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
  AddToWorklist(Cond.getNode());
  SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
  AddToWorklist(CstOffset.getNode());
  CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
  AddToWorklist(CPIdx.getNode());
  return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
                     MachinePointerInfo::getConstantPool(
                         DAG.getMachineFunction()), Alignment);
}

/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
/// where 'cond' is the comparison specified by CC.
SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                                      SDValue N2, SDValue N3, ISD::CondCode CC,
                                      bool NotExtCompare) {
  // (x ? y : y) -> y.
  if (N2 == N3) return N2;

  EVT CmpOpVT = N0.getValueType();
  EVT CmpResVT = getSetCCResultType(CmpOpVT);
  EVT VT = N2.getValueType();
  auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
  auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
  auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());

  // Determine if the condition we're dealing with is constant.
  if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
    AddToWorklist(SCC.getNode());
    if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
      // fold select_cc true, x, y -> x
      // fold select_cc false, x, y -> y
      return !(SCCC->isNullValue()) ? N2 : N3;
    }
  }

  if (SDValue V =
          convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
    return V;

  if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
    return V;

  // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
  // where y has a single bit set.
  // A plaintext description would be, we can turn the SELECT_CC into an AND
  // when the condition can be materialized as an all-ones register. Any
  // single bit-test can be materialized as an all-ones register with
  // shift-left and shift-right-arith.
  if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
      N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
    SDValue AndLHS = N0->getOperand(0);
    auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
      // Shift the tested bit over the sign bit.
      const APInt &AndMask = ConstAndRHS->getAPIntValue();
      unsigned ShCt = AndMask.getBitWidth() - 1;
      if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
        SDValue ShlAmt =
            DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
                            getShiftAmountTy(AndLHS.getValueType()));
        SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);

        // Now arithmetic right shift it all the way over, so the result is
        // either all-ones, or zero.
        SDValue ShrAmt =
            DAG.getConstant(ShCt, SDLoc(Shl),
                            getShiftAmountTy(Shl.getValueType()));
        SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);

        return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
      }
    }
  }

  // fold select C, 16, 0 -> shl C, 4
  bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
  bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();

  if ((Fold || Swap) &&
      TLI.getBooleanContents(CmpOpVT) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {

    // Normalize to the 'Fold' form by inverting the condition.
    if (Swap) {
      CC = ISD::getSetCCInverse(CC, CmpOpVT);
      std::swap(N2C, N3C);
    }

    // If the caller doesn't want us to simplify this into a zext of a compare,
    // don't do it.
    if (NotExtCompare && N2C->isOne())
      return SDValue();

    SDValue Temp, SCC;
    // zext (setcc n0, n1)
    if (LegalTypes) {
      SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC);
      if (VT.bitsLT(SCC.getValueType()))
        Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
      else
        Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
    } else {
      SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
      Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
    }

    AddToWorklist(SCC.getNode());
    AddToWorklist(Temp.getNode());

    if (N2C->isOne())
      return Temp;

    unsigned ShCt = N2C->getAPIntValue().logBase2();
    if (TLI.shouldAvoidTransformToShift(VT, ShCt))
      return SDValue();

    // shl setcc result by log2 n2c
    return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
                       DAG.getConstant(ShCt, SDLoc(Temp),
                                       getShiftAmountTy(Temp.getValueType())));
  }

  // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
  // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
  // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
  // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
  // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
  // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
  // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
  // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
  if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue ValueOnZero = N2;
    SDValue Count = N3;
    // If the condition is NE instead of E, swap the operands.
    if (CC == ISD::SETNE)
      std::swap(ValueOnZero, Count);
    // Check if the value on zero is a constant equal to the bits in the type.
    if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
      if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
        // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
        // legal, combine to just cttz.
        if ((Count.getOpcode() == ISD::CTTZ ||
             Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
            N0 == Count.getOperand(0) &&
            (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
          return DAG.getNode(ISD::CTTZ, DL, VT, N0);
        // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
        // legal, combine to just ctlz.
        if ((Count.getOpcode() == ISD::CTLZ ||
             Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
            N0 == Count.getOperand(0) &&
            (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
          return DAG.getNode(ISD::CTLZ, DL, VT, N0);
      }
    }
  }

  return SDValue();
}

/// This is a stub for TargetLowering::SimplifySetCC.
20649 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, 20650 ISD::CondCode Cond, const SDLoc &DL, 20651 bool foldBooleans) { 20652 TargetLowering::DAGCombinerInfo 20653 DagCombineInfo(DAG, Level, false, this); 20654 return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); 20655 } 20656 20657 /// Given an ISD::SDIV node expressing a divide by constant, return 20658 /// a DAG expression to select that will generate the same value by multiplying 20659 /// by a magic number. 20660 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". 20661 SDValue DAGCombiner::BuildSDIV(SDNode *N) { 20662 // when optimising for minimum size, we don't want to expand a div to a mul 20663 // and a shift. 20664 if (DAG.getMachineFunction().getFunction().hasMinSize()) 20665 return SDValue(); 20666 20667 SmallVector<SDNode *, 8> Built; 20668 if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) { 20669 for (SDNode *N : Built) 20670 AddToWorklist(N); 20671 return S; 20672 } 20673 20674 return SDValue(); 20675 } 20676 20677 /// Given an ISD::SDIV node expressing a divide by constant power of 2, return a 20678 /// DAG expression that will generate the same value by right shifting. 20679 SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { 20680 ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); 20681 if (!C) 20682 return SDValue(); 20683 20684 // Avoid division by zero. 20685 if (C->isNullValue()) 20686 return SDValue(); 20687 20688 SmallVector<SDNode *, 8> Built; 20689 if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) { 20690 for (SDNode *N : Built) 20691 AddToWorklist(N); 20692 return S; 20693 } 20694 20695 return SDValue(); 20696 } 20697 20698 /// Given an ISD::UDIV node expressing a divide by constant, return a DAG 20699 /// expression that will generate the same value by multiplying by a magic 20700 /// number. 20701 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". 
SDValue DAGCombiner::BuildUDIV(SDNode *N) {
  // When optimizing for minimum size, we don't want to expand a div to a mul
  // and a shift.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
    // Queue every node the expansion created for another combine pass.
    for (SDNode *N : Built)
      AddToWorklist(N);
    return S;
  }

  return SDValue();
}

/// Determines the LogBase2 value for a non-null input value using the
/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
  EVT VT = V.getValueType();
  unsigned EltBits = VT.getScalarSizeInBits();
  SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V);
  SDValue Base = DAG.getConstant(EltBits - 1, DL, VT);
  SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz);
  return LogBase2;
}

/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal, we need to find the zero of the function:
///   F(X) = A X - 1 [which has a zero at X = 1/A]
///     =>
///   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
///     does not require additional intermediate precision]
/// For the last iteration, put numerator N into it to gain more precision:
///   Result = N X_i + X_i (N - N A X_i)
SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
                                      SDNodeFlags Flags) {
  // This builds new FP nodes, so it only runs before the DAG is fully legal.
  if (LegalDAG)
    return SDValue();

  // TODO: Handle half and/or extended types?
  EVT VT = Op.getValueType();
  if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
    return SDValue();

  // If estimates are explicitly disabled for this function, we're done.
  MachineFunction &MF = DAG.getMachineFunction();
  int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
  if (Enabled == TLI.ReciprocalEstimate::Disabled)
    return SDValue();

  // Estimates may be explicitly enabled for this type with a custom number of
  // refinement steps.
  int Iterations = TLI.getDivRefinementSteps(VT, MF);
  if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
    AddToWorklist(Est.getNode());

    SDLoc DL(Op);
    if (Iterations) {
      SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);

      // Newton iterations: Est = Est + Est (N - Arg * Est)
      // If this is the last iteration, also multiply by the numerator.
      for (int i = 0; i < Iterations; ++i) {
        SDValue MulEst = Est;

        // On the final pass, fold the numerator into the iteration (see the
        // function comment: Result = N X_i + X_i (N - N A X_i)).
        if (i == Iterations - 1) {
          MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
          AddToWorklist(MulEst.getNode());
        }

        SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
        AddToWorklist(NewEst.getNode());

        // Subtract from N on the last iteration, from 1.0 otherwise.
        NewEst = DAG.getNode(ISD::FSUB, DL, VT,
                             (i == Iterations - 1 ? N : FPOne), NewEst, Flags);
        AddToWorklist(NewEst.getNode());

        NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
        AddToWorklist(NewEst.getNode());

        Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
        AddToWorklist(Est.getNode());
      }
    } else {
      // If no iterations are available, multiply with N.
      Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
      AddToWorklist(Est.getNode());
    }

    return Est;
  }

  return SDValue();
}

/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
///     =>
///   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
/// As a result, we precompute A/2 prior to the iteration loop.
SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
                                         unsigned Iterations,
                                         SDNodeFlags Flags, bool Reciprocal) {
  EVT VT = Arg.getValueType();
  SDLoc DL(Arg);
  SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);

  // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
  // this entire sequence requires only one FP constant.
  SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
  HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);

  // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
  for (unsigned i = 0; i < Iterations; ++i) {
    SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
    NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
    NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
  }

  // If non-reciprocal square root is requested, multiply the result by Arg.
  if (!Reciprocal)
    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);

  return Est;
}

/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
///     =>
///   X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
                                         unsigned Iterations,
                                         SDNodeFlags Flags, bool Reciprocal) {
  EVT VT = Arg.getValueType();
  SDLoc DL(Arg);
  SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
  SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);

  // This routine must enter the loop below to work correctly
  // when (Reciprocal == false).
  assert(Iterations > 0);

  // Newton iterations for reciprocal square root:
  // E = (E * -0.5) * ((A * E) * E + -3.0)
  for (unsigned i = 0; i < Iterations; ++i) {
    SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
    SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
    SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);

    // When calculating a square root at the last iteration build:
    // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
    // (notice a common subexpression)
    SDValue LHS;
    if (Reciprocal || (i + 1) < Iterations) {
      // RSQRT: LHS = (E * -0.5)
      LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
    } else {
      // SQRT: LHS = (A * E) * -0.5
      LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
    }

    Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
  }

  return Est;
}

/// Build code to calculate either rsqrt(Op) or sqrt(Op).
In the latter case
/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
/// Op can be zero.
SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
                                           bool Reciprocal) {
  // This builds new FP nodes, so it only runs before the DAG is fully legal.
  if (LegalDAG)
    return SDValue();

  // TODO: Handle half and/or extended types?
  EVT VT = Op.getValueType();
  if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
    return SDValue();

  // If estimates are explicitly disabled for this function, we're done.
  MachineFunction &MF = DAG.getMachineFunction();
  int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
  if (Enabled == TLI.ReciprocalEstimate::Disabled)
    return SDValue();

  // Estimates may be explicitly enabled for this type with a custom number of
  // refinement steps.
  int Iterations = TLI.getSqrtRefinementSteps(VT, MF);

  bool UseOneConstNR = false;
  if (SDValue Est =
      TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
                          Reciprocal)) {
    AddToWorklist(Est.getNode());

    if (Iterations) {
      // The target chooses (via UseOneConstNR) which Newton-Raphson
      // refinement formulation to use.
      Est = UseOneConstNR
            ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
            : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);

      if (!Reciprocal) {
        // The estimate is now completely wrong if the input was exactly 0.0 or
        // possibly a denormal. Force the answer to 0.0 for those cases.
        SDLoc DL(Op);
        EVT CCVT = getSetCCResultType(VT);
        ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
        DenormalMode DenormMode = DAG.getDenormalMode(VT);
        if (DenormMode.Input == DenormalMode::IEEE) {
          // This is specifically a check for the handling of denormal inputs,
          // not the result.

          // fabs(X) < SmallestNormal ? 0.0 : Est
          const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
          APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
          SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
          SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
          SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
          Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
        } else {
          // X == 0.0 ? 0.0 : Est
          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
          SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
          Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
        }
      }
    }
    return Est;
  }

  return SDValue();
}

/// Build an estimate of 1/sqrt(Op) (see buildSqrtEstimateImpl).
SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
  return buildSqrtEstimateImpl(Op, Flags, true);
}

/// Build an estimate of sqrt(Op) (see buildSqrtEstimateImpl).
SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
  return buildSqrtEstimateImpl(Op, Flags, false);
}

/// Return true if there is any possibility that the two addresses overlap.
bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {

  // A conservative summary of one memory operation: just enough information
  // to reason about whether two operations can overlap.
  struct MemUseCharacteristics {
    bool IsVolatile;
    bool IsAtomic;
    SDValue BasePtr;
    int64_t Offset;
    Optional<int64_t> NumBytes;
    MachineMemOperand *MMO;
  };

  auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
    if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
      // Only pre-inc/pre-dec addressing changes the address actually
      // accessed; post-inc/post-dec access the unmodified base.
      int64_t Offset = 0;
      if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
        Offset = (LSN->getAddressingMode() == ISD::PRE_INC)
                     ? C->getSExtValue()
                     : (LSN->getAddressingMode() == ISD::PRE_DEC)
                           ? -1 * C->getSExtValue()
                           : 0;
      uint64_t Size =
          MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
      return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
              Offset /*base offset*/,
              Optional<int64_t>(Size),
              LSN->getMemOperand()};
    }
    if (const auto *LN = cast<LifetimeSDNode>(N))
      return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1),
              (LN->hasOffset()) ? LN->getOffset() : 0,
              (LN->hasOffset()) ? Optional<int64_t>(LN->getSize())
                                : Optional<int64_t>(),
              (MachineMemOperand *)nullptr};
    // Default: nothing useful is known about the operation.
    return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(),
            (int64_t)0 /*offset*/,
            Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr};
  };

  MemUseCharacteristics MUC0 = getCharacteristics(Op0),
                        MUC1 = getCharacteristics(Op1);

  // If they are to the same address, then they must be aliases.
  if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr &&
      MUC0.Offset == MUC1.Offset)
    return true;

  // If they are both volatile then they cannot be reordered.
  if (MUC0.IsVolatile && MUC1.IsVolatile)
    return true;

  // Be conservative about atomics for the moment
  // TODO: This is way overconservative for unordered atomics (see D66309)
  if (MUC0.IsAtomic && MUC1.IsAtomic)
    return true;

  if (MUC0.MMO && MUC1.MMO) {
    if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
        (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
      return false;
  }

  // Try to prove that there is aliasing, or that there is no aliasing. Either
  // way, we can return now. If nothing can be proved, proceed with more tests.
  bool IsAlias;
  if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
                                       DAG, IsAlias))
    return IsAlias;

  // The following all rely on MMO0 and MMO1 being valid. Fail conservatively if
  // either are not known.
  if (!MUC0.MMO || !MUC1.MMO)
    return true;

  // If one operation reads from invariant memory, and the other may store, they
  // cannot alias. These should really be checking the equivalent of mayWrite,
  // but it only matters for memory nodes other than load/store.
  if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
      (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
    return false;

  // If we know required SrcValue1 and SrcValue2 have relatively large
  // alignment compared to the size and offset of the access, we may be able
  // to prove they do not alias. This check is conservative for now to catch
  // cases created by splitting vector types.
  int64_t SrcValOffset0 = MUC0.MMO->getOffset();
  int64_t SrcValOffset1 = MUC1.MMO->getOffset();
  unsigned OrigAlignment0 = MUC0.MMO->getBaseAlignment();
  unsigned OrigAlignment1 = MUC1.MMO->getBaseAlignment();
  if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
      MUC0.NumBytes.hasValue() && MUC1.NumBytes.hasValue() &&
      *MUC0.NumBytes == *MUC1.NumBytes && OrigAlignment0 > *MUC0.NumBytes) {
    int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
    int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;

    // There is no overlap between these relatively aligned accesses of
    // similar size. Return no alias.
    if ((OffAlign0 + *MUC0.NumBytes) <= OffAlign1 ||
        (OffAlign1 + *MUC1.NumBytes) <= OffAlign0)
      return false;
  }

  bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
                   ? CombinerGlobalAA
                   : DAG.getSubtarget().useAA();
#ifndef NDEBUG
  if (CombinerAAOnlyFunc.getNumOccurrences() &&
      CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
    UseAA = false;
#endif

  if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue()) {
    // Use alias analysis information.
    int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
    int64_t Overlap0 = *MUC0.NumBytes + SrcValOffset0 - MinOffset;
    int64_t Overlap1 = *MUC1.NumBytes + SrcValOffset1 - MinOffset;
    AliasResult AAResult = AA->alias(
        MemoryLocation(MUC0.MMO->getValue(), Overlap0,
                       UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
        MemoryLocation(MUC1.MMO->getValue(), Overlap1,
                       UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes()));
    if (AAResult == NoAlias)
      return false;
  }

  // Otherwise we have to assume they alias.
  return true;
}

/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
                                   SmallVectorImpl<SDValue> &Aliases) {
  SmallVector<SDValue, 8> Chains;     // List of chains to visit.
  SmallPtrSet<SDNode *, 16> Visited;  // Visited node set.

  // Get alias information for node.
  // TODO: relax aliasing for unordered atomics (see D66309)
  const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();

  // Starting off.
  Chains.push_back(OriginalChain);
  unsigned Depth = 0;

  // Attempt to improve chain by a single step. Returns true (and updates C)
  // if the chain could be advanced past C; returns false if C must be kept.
  std::function<bool(SDValue &)> ImproveChain = [&](SDValue &C) -> bool {
    switch (C.getOpcode()) {
    case ISD::EntryToken:
      // No need to mark EntryToken.
      C = SDValue();
      return true;
    case ISD::LOAD:
    case ISD::STORE: {
      // Get alias information for C.
      // TODO: Relax aliasing for unordered atomics (see D66309)
      bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
                      cast<LSBaseSDNode>(C.getNode())->isSimple();
      if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
        // Look further up the chain.
        C = C.getOperand(0);
        return true;
      }
      // Alias, so stop here.
      return false;
    }

    case ISD::CopyFromReg:
      // Always forward past CopyFromReg.
      C = C.getOperand(0);
      return true;

    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END: {
      // We can forward past any lifetime start/end that can be proven not to
      // alias the memory access.
      if (!isAlias(N, C.getNode())) {
        // Look further up the chain.
        C = C.getOperand(0);
        return true;
      }
      return false;
    }
    default:
      return false;
    }
  };

  // Look at each chain and determine if it is an alias. If so, add it to the
  // aliases list. If not, then continue up the chain looking for the next
  // candidate.
  while (!Chains.empty()) {
    SDValue Chain = Chains.pop_back_val();

    // Don't bother if we've seen Chain before.
    if (!Visited.insert(Chain.getNode()).second)
      continue;

    // For TokenFactor nodes, look at each operand and only continue up the
    // chain until we reach the depth limit.
    //
    // FIXME: The depth check could be made to return the last non-aliasing
    // chain we found before we hit a tokenfactor rather than the original
    // chain.
    if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
      Aliases.clear();
      Aliases.push_back(OriginalChain);
      return;
    }

    if (Chain.getOpcode() == ISD::TokenFactor) {
      // We have to check each of the operands of the token factor for "small"
      // token factors, so we queue them up.
      if (Chain.getNumOperands() > 16) {
        Aliases.push_back(Chain);
        continue;
      }
      // Adding the operands to the queue
      // (stack) in reverse order maintains the original order and increases the
      // likelihood that getNode will find a matching token factor (CSE.)
      for (unsigned n = Chain.getNumOperands(); n;)
        Chains.push_back(Chain.getOperand(--n));
      ++Depth;
      continue;
    }
    // Everything else
    if (ImproveChain(Chain)) {
      // Updated Chain Found, Consider new chain if one exists.
      if (Chain.getNode())
        Chains.push_back(Chain);
      ++Depth;
      continue;
    }
    // No Improved Chain Possible, treat as Alias.
    Aliases.push_back(Chain);
  }
}

/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
/// (aliasing node.)
SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
  if (OptLevel == CodeGenOpt::None)
    return OldChain;

  // Ops for replacing token factor.
  SmallVector<SDValue, 8> Aliases;

  // Accumulate all the aliases to this node.
  GatherAllAliases(N, OldChain, Aliases);

  // If no operands then chain to entry token.
  if (Aliases.size() == 0)
    return DAG.getEntryNode();

  // If a single operand then chain to it. We don't need to revisit it.
  if (Aliases.size() == 1)
    return Aliases[0];

  // Construct a custom tailored token factor.
  return DAG.getTokenFactor(SDLoc(N), Aliases);
}

namespace {
// TODO: Replace with std::monostate when we move to C++17.
struct UnitT { } Unit;
bool operator==(const UnitT &, const UnitT &) { return true; }
bool operator!=(const UnitT &, const UnitT &) { return false; }
} // namespace

// This function tries to collect a bunch of potentially interesting
// nodes to improve the chains of, all at once.
This might seem
// redundant, as this function gets called when visiting every store
// node, so why not let the work be done on each store as it's visited?
//
// I believe this is mainly important because MergeConsecutiveStores
// is unable to deal with merging stores of different sizes, so unless
// we improve the chains of all the potential candidates up-front
// before running MergeConsecutiveStores, it might only see some of
// the nodes that will eventually be candidates, and then not be able
// to go from a partially-merged state to the desired final
// fully-merged state.

bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
  SmallVector<StoreSDNode *, 8> ChainedStores;
  StoreSDNode *STChain = St;
  // Intervals records which offsets from BaseIndex have been covered. In
  // the common case, every store writes to the immediately previous address
  // space and thus merged with the previous interval at insertion time.

  using IMap =
      llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
  IMap::Allocator A;
  IMap Intervals(A);

  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return false;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return false;

  // BaseIndexOffset assumes that offsets are fixed-size, which
  // is not valid for scalable vectors where the offsets are
  // scaled by `vscale`, so bail out early.
  if (St->getMemoryVT().isScalableVector())
    return false;

  // Add ST's interval. Sizes are rounded up to whole bytes.
  Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);

  // Walk up the chain collecting simple, same-base, non-overlapping stores.
  while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
    // If the chain has more than one use, then we can't reorder the mem ops.
    if (!SDValue(Chain, 0)->hasOneUse())
      break;
    // TODO: Relax for unordered atomics (see D66309)
    if (!Chain->isSimple() || Chain->isIndexed())
      break;

    // Find the base pointer and offset for this memory node.
    const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
    // Check that the base pointer is the same as the original one.
    int64_t Offset;
    if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
      break;
    int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
    // Make sure we don't overlap with other intervals by checking the ones to
    // the left or right before inserting.
    auto I = Intervals.find(Offset);
    // If there's a next interval, we should end before it.
    if (I != Intervals.end() && I.start() < (Offset + Length))
      break;
    // If there's a previous interval, we should start after it.
    if (I != Intervals.begin() && (--I).stop() <= Offset)
      break;
    Intervals.insert(Offset, Offset + Length, Unit);

    ChainedStores.push_back(Chain);
    STChain = Chain;
  }

  // If we didn't find a chained store, exit.
  if (ChainedStores.size() == 0)
    return false;

  // Improve all chained stores (St and ChainedStores members) starting from
  // where the store chain ended and return single TokenFactor.
  SDValue NewChain = STChain->getChain();
  SmallVector<SDValue, 8> TFOps;
  // Process in reverse (oldest store first) so each store's improved chain
  // is found relative to NewChain.
  for (unsigned I = ChainedStores.size(); I;) {
    StoreSDNode *S = ChainedStores[--I];
    SDValue BetterChain = FindBetterChain(S, NewChain);
    S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
        S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
    TFOps.push_back(SDValue(S, 0));
    ChainedStores[I] = S;
  }

  // Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
  SDValue BetterChain = FindBetterChain(St, NewChain);
  SDValue NewST;
  if (St->isTruncatingStore())
    NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
                              St->getBasePtr(), St->getMemoryVT(),
                              St->getMemOperand());
  else
    NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
                         St->getBasePtr(), St->getMemOperand());

  TFOps.push_back(NewST);

  // If we improved every element of TFOps, then we've lost the dependence on
  // NewChain to successors of St and we need to add it back to TFOps. Do so at
  // the beginning to keep relative order consistent with FindBetterChains.
  auto hasImprovedChain = [&](SDValue ST) -> bool {
    return ST->getOperand(0) != NewChain;
  };
  bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
  if (AddNewChain)
    TFOps.insert(TFOps.begin(), NewChain);

  SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
  CombineTo(St, TF);

  // Add TF and its operands to the worklist.
  AddToWorklist(TF.getNode());
  for (const SDValue &Op : TF->ops())
    AddToWorklist(Op.getNode());
  AddToWorklist(STChain);
  return true;
}

/// Improve the chain of St: first try to parallelize a run of disjoint
/// chained stores starting at St; failing that, just look for a better chain
/// for St itself. Returns true if any change was made.
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
  if (OptLevel == CodeGenOpt::None)
    return false;

  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return false;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return false;

  // Directly improve a chain of disjoint stores starting at St.
  if (parallelizeChainedStores(St))
    return true;

  // Improve St's chain.
  SDValue BetterChain = FindBetterChain(St, St->getChain());
  if (St->getChain() != BetterChain) {
    replaceStoreChain(St, BetterChain);
    return true;
  }
  return false;
}

/// This is the entry point for the file.
void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
                           CodeGenOpt::Level OptLevel) {
  // This is the main entry point to this class.
  DAGCombiner(*this, AA, OptLevel).Run(Level);
}